| 1 | #include "simdjson/error.h" |
| 2 | |
| 3 | namespace simdjson { |
| 4 | namespace SIMDJSON_IMPLEMENTATION { |
| 5 | namespace ondemand { |
| 6 | /* Forward declarations of On Demand types. raw_json_string and document_stream are named by the parser declarations below. NOTE(review): document and json_iterator are also used below but are not declared in this view; presumably an earlier header provides them -- confirm. */ |
| 7 | class array; |
| 8 | class object; |
| 9 | class value; |
| 10 | class raw_json_string; |
| 11 | class document_stream; |
| 12 | |
| 13 | /** |
| 14 | * The default batch size, in bytes, for document_stream instances for this On Demand kernel |
| 15 | * (1000000 bytes, i.e. 1MB). Note that different On Demand kernels may use a different |
| 16 | * DEFAULT_BATCH_SIZE value in the future. |
| 17 | */ |
| 18 | static constexpr size_t DEFAULT_BATCH_SIZE = 1000000; |
| 19 | /** |
| 20 | * Some adversary might try to set the batch size to 0 or 1, which might cause problems. |
| 21 | * We set a minimum of 32 bytes since anything smaller is highly likely to be an error. In practice, |
| 22 | * most users will want a much larger batch size. |
| 23 | * |
| 24 | * All non-negative MINIMAL_BATCH_SIZE values should be 'safe', except that, obviously, no JSON |
| 25 | * document can ever span 0 or 1 byte, and very large values would create memory allocation issues. |
| 26 | */ |
| 27 | static constexpr size_t MINIMAL_BATCH_SIZE = 32; |
| 28 | |
| 29 | /** |
| 30 | * A JSON fragment iterator. |
| 31 | * |
| 32 | * This holds the actual iterator as well as the buffer for writing strings. |
| 33 | */ |
| 34 | class parser { |
| 35 | public: |
| 36 | /** |
| 37 | * Create a JSON parser. |
| 38 | * |
| 39 | * The new parser will have zero capacity. |
| 40 | */ |
| 41 | inline explicit parser(size_t max_capacity = SIMDJSON_MAXSIZE_BYTES) noexcept; |
| 42 | /* A parser can be moved but not copied: copy construction and copy assignment are deleted. */ |
| 43 | inline parser(parser &&other) noexcept = default; |
| 44 | simdjson_inline parser(const parser &other) = delete; |
| 45 | simdjson_inline parser &operator=(const parser &other) = delete; |
| 46 | simdjson_inline parser &operator=(parser &&other) noexcept = default; |
| 47 | |
| 48 | /** Deallocate the JSON parser. */ |
| 49 | inline ~parser() noexcept = default; |
| 50 | |
| 51 | /** |
| 52 | * Start iterating an on-demand JSON document. |
| 53 | * |
| 54 | * ondemand::parser parser; |
| 55 | * document doc = parser.iterate(json); |
| 56 | * |
| 57 | * It is expected that the content is a valid UTF-8 file, containing a valid JSON document. |
| 58 | * Otherwise the iterate method may return an error. In particular, the whole input should be |
| 59 | * valid: we do not attempt to tolerate incorrect content either before or after a JSON |
| 60 | * document. |
| 61 | * |
| 62 | * ### IMPORTANT: Validate what you use |
| 63 | * |
| 64 | * Calling iterate on an invalid JSON document may not immediately trigger an error. The call to |
| 65 | * iterate does not parse and validate the whole document. |
| 66 | * |
| 67 | * ### IMPORTANT: Buffer Lifetime |
| 68 | * |
| 69 | * Because parsing is done while you iterate, you *must* keep the JSON buffer around at least as |
| 70 | * long as the document iteration. |
| 71 | * |
| 72 | * ### IMPORTANT: Document Lifetime |
| 73 | * |
| 74 | * Only one iteration at a time can happen per parser, and the parser *must* be kept alive during |
| 75 | * iteration to ensure intermediate buffers can be accessed. Any document must be destroyed before |
| 76 | * you call iterate() again or destroy the parser. |
| 77 | * |
| 78 | * ### REQUIRED: Buffer Padding |
| 79 | * |
| 80 | * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what |
| 81 | * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you |
| 82 | * are using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the |
| 83 | * SIMDJSON_PADDING bytes to avoid runtime warnings. |
| 84 | * |
| 85 | * @param json The JSON to parse. |
| 86 | * @param len The length of the JSON. |
| 87 | * @param capacity The number of bytes allocated in the JSON (must be at least len+SIMDJSON_PADDING). |
| 88 | * |
| 89 | * @return The document, or an error: |
| 90 | * - INSUFFICIENT_PADDING if the input has less than SIMDJSON_PADDING extra bytes. |
| 91 | * - MEMALLOC if the parser does not have enough capacity and memory |
| 92 | * allocation fails. |
| 93 | * - EMPTY if the document is all whitespace. |
| 94 | * - UTF8_ERROR if the document is not valid UTF-8. |
| 95 | * - UNESCAPED_CHARS if a string contains control characters that must be escaped. |
| 96 | * - UNCLOSED_STRING if there is an unclosed string in the document. |
| 97 | */ |
| 98 | simdjson_warn_unused simdjson_result<document> iterate(padded_string_view json) & noexcept; |
| 99 | /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */ |
| 100 | simdjson_warn_unused simdjson_result<document> iterate(const char *json, size_t len, size_t capacity) & noexcept; |
| 101 | /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */ |
| 102 | simdjson_warn_unused simdjson_result<document> iterate(const uint8_t *json, size_t len, size_t capacity) & noexcept; |
| 103 | /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */ |
| 104 | simdjson_warn_unused simdjson_result<document> iterate(std::string_view json, size_t capacity) & noexcept; |
| 105 | /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */ |
| 106 | simdjson_warn_unused simdjson_result<document> iterate(const std::string &json) & noexcept; |
| 107 | /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */ |
| 108 | simdjson_warn_unused simdjson_result<document> iterate(const simdjson_result<padded_string> &json) & noexcept; |
| 109 | /** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */ |
| 110 | simdjson_warn_unused simdjson_result<document> iterate(const simdjson_result<padded_string_view> &json) & noexcept; |
| 111 | /** Deleted overload: passing a temporary padded_string is rejected at compile time, presumably because the buffer would not outlive the document (see "Buffer Lifetime" above). */ |
| 112 | simdjson_warn_unused simdjson_result<document> iterate(padded_string &&json) & noexcept = delete; |
| 113 | |
| 114 | /** |
| 115 | * @private |
| 116 | * |
| 117 | * Start iterating an on-demand JSON document. |
| 118 | * |
| 119 | * ondemand::parser parser; |
| 120 | * json_iterator doc = parser.iterate_raw(json); |
| 121 | * |
| 122 | * ### IMPORTANT: Buffer Lifetime |
| 123 | * |
| 124 | * Because parsing is done while you iterate, you *must* keep the JSON buffer around at least as |
| 125 | * long as the document iteration. |
| 126 | * |
| 127 | * ### IMPORTANT: Document Lifetime |
| 128 | * |
| 129 | * Only one iteration at a time can happen per parser, and the parser *must* be kept alive during |
| 130 | * iteration to ensure intermediate buffers can be accessed. Any document must be destroyed before |
| 131 | * you call iterate() again or destroy the parser. |
| 132 | * |
| 133 | * The ondemand::document instance holds the iterator. The document must remain in scope |
| 134 | * while you are accessing instances of ondemand::value, ondemand::object, ondemand::array. |
| 135 | * |
| 136 | * ### REQUIRED: Buffer Padding |
| 137 | * |
| 138 | * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what |
| 139 | * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you |
| 140 | * are using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the |
| 141 | * SIMDJSON_PADDING bytes to avoid runtime warnings. |
| 142 | * |
| 143 | * @param json The JSON to parse. |
| 144 | * |
| 145 | * @return The iterator, or an error: |
| 146 | * - INSUFFICIENT_PADDING if the input has less than SIMDJSON_PADDING extra bytes. |
| 147 | * - MEMALLOC if the parser does not have enough capacity and memory |
| 148 | * allocation fails. |
| 149 | * - EMPTY if the document is all whitespace. |
| 150 | * - UTF8_ERROR if the document is not valid UTF-8. |
| 151 | * - UNESCAPED_CHARS if a string contains control characters that must be escaped. |
| 152 | * - UNCLOSED_STRING if there is an unclosed string in the document. |
| 153 | */ |
| 154 | simdjson_warn_unused simdjson_result<json_iterator> iterate_raw(padded_string_view json) & noexcept; |
| 155 | |
| 156 | |
| 157 | /** |
| 158 | * Parse a buffer containing many JSON documents. |
| 159 | * |
| 160 | * auto json = R"({ "foo": 1 } { "foo": 2 } { "foo": 3 } )"_padded; |
| 161 | * ondemand::parser parser; |
| 162 | * ondemand::document_stream docs = parser.iterate_many(json); |
| 163 | * for (auto & doc : docs) { |
| 164 | * std::cout << doc["foo"] << std::endl; |
| 165 | * } |
| 166 | * // Prints 1 2 3 |
| 167 | * |
| 168 | * No copy of the input buffer is made. |
| 169 | * |
| 170 | * The function is lazy: it may be that no more than one JSON document at a time is parsed. |
| 171 | * |
| 172 | * The caller is responsible for ensuring that the input string data remains unchanged and is |
| 173 | * not deleted during the loop. |
| 174 | * |
| 175 | * ### Format |
| 176 | * |
| 177 | * The buffer must contain a series of one or more JSON documents, concatenated into a single |
| 178 | * buffer, separated by ASCII whitespace. It effectively parses until it has a fully valid document, |
| 179 | * then starts parsing the next document at that point. (It does this with more parallelism and |
| 180 | * lookahead than you might think, though.) |
| 181 | * |
| 182 | * Documents that consist of an object or array may omit the whitespace between them, concatenating |
| 183 | * with no separator. Documents that consist of a single primitive (i.e. documents that are not |
| 184 | * arrays or objects) MUST be separated with ASCII whitespace. |
| 185 | * |
| 186 | * The characters inside a JSON document, and between JSON documents, must be valid Unicode (UTF-8). |
| 187 | * |
| 188 | * The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse. |
| 189 | * Setting batch_size to excessively large or excessively small values may negatively impact |
| 190 | * performance. |
| 191 | * |
| 192 | * ### REQUIRED: Buffer Padding |
| 193 | * |
| 194 | * The buffer must have at least SIMDJSON_PADDING extra allocated bytes. It does not matter what |
| 195 | * those bytes are initialized to, as long as they are allocated. These bytes will be read: if you |
| 196 | * are using a sanitizer that verifies that no uninitialized byte is read, then you should initialize the |
| 197 | * SIMDJSON_PADDING bytes to avoid runtime warnings. |
| 198 | * |
| 199 | * ### Threads |
| 200 | * |
| 201 | * When compiled with SIMDJSON_THREADS_ENABLED, this method will use a single thread under the |
| 202 | * hood to do some lookahead. |
| 203 | * |
| 204 | * ### Parser Capacity |
| 205 | * |
| 206 | * If the parser's current capacity is less than batch_size, it will allocate enough capacity |
| 207 | * to handle it (up to max_capacity). |
| 208 | * |
| 209 | * @param buf The concatenated JSON to parse. |
| 210 | * @param len The length of the concatenated JSON. |
| 211 | * @param batch_size The batch size to use. MUST be larger than the largest document. The sweet |
| 212 | * spot is cache-related: small enough to fit in cache, yet big enough to |
| 213 | * parse as many documents as possible in one tight loop. |
| 214 | * Defaults to DEFAULT_BATCH_SIZE (1MB), which has been a reasonable sweet spot in our tests. |
| 215 | * @return The stream, or an error. An empty input will yield 0 documents rather than an EMPTY error. Errors: |
| 216 | * - MEMALLOC if the parser does not have enough capacity and memory allocation fails |
| 217 | * - CAPACITY if the parser does not have enough capacity and batch_size > max_capacity. |
| 218 | * - other json errors if parsing fails. You should not rely on these errors to always be the same for the |
| 219 | * same document: they may vary under runtime dispatch (so they may vary depending on your system and hardware). |
| 220 | */ |
| 221 | inline simdjson_result<document_stream> iterate_many(const uint8_t *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; |
| 222 | /** @overload iterate_many(const uint8_t *buf, size_t len, size_t batch_size) */ |
| 223 | inline simdjson_result<document_stream> iterate_many(const char *buf, size_t len, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; |
| 224 | /** @overload iterate_many(const uint8_t *buf, size_t len, size_t batch_size) */ |
| 225 | inline simdjson_result<document_stream> iterate_many(const std::string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; |
| 226 | inline simdjson_result<document_stream> iterate_many(const std::string &&s, size_t batch_size) = delete;// unsafe: a temporary string would not outlive the returned stream. NOTE(review): batch_size has no default here, so a one-argument call with a temporary still selects the const& overload above. |
| 227 | /** @overload iterate_many(const uint8_t *buf, size_t len, size_t batch_size) */ |
| 228 | inline simdjson_result<document_stream> iterate_many(const padded_string &s, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept; |
| 229 | inline simdjson_result<document_stream> iterate_many(const padded_string &&s, size_t batch_size) = delete;// unsafe: a temporary padded_string would not outlive the returned stream. NOTE(review): batch_size has no default here, so a one-argument call with a temporary still selects the const& overload above. |
| 230 | |
| 231 | /** @private We do not want to allow implicit conversion from C string to std::string. */ |
| 232 | simdjson_result<document_stream> iterate_many(const char *buf, size_t batch_size = DEFAULT_BATCH_SIZE) noexcept = delete; |
| 233 | |
| 234 | /** The capacity of this parser (the largest document it can process). */ |
| 235 | simdjson_inline size_t capacity() const noexcept; |
| 236 | /** The maximum capacity of this parser (the largest document it is allowed to process). */ |
| 237 | simdjson_inline size_t max_capacity() const noexcept; |
| 238 | /** Set the maximum capacity of this parser (see max_capacity()). NOTE(review): the definition is not visible here; confirm whether it affects capacity that is already allocated. */ simdjson_inline void set_max_capacity(size_t max_capacity) noexcept; |
| 239 | /** |
| 240 | * The maximum depth of this parser (the most deeply nested objects and arrays it can process). |
| 241 | * This parameter is only relevant when the macro SIMDJSON_DEVELOPMENT_CHECKS is set to true. |
| 242 | * The document's instance current_depth() method should be used to monitor the parsing |
| 243 | * depth and limit it if desired. |
| 244 | */ |
| 245 | simdjson_inline size_t max_depth() const noexcept; |
| 246 | |
| 247 | /** |
| 248 | * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length |
| 249 | * and `max_depth` depth. |
| 250 | * |
| 251 | * The max_depth parameter is only relevant when the macro SIMDJSON_DEVELOPMENT_CHECKS is set to true. |
| 252 | * The document's instance current_depth() method should be used to monitor the parsing |
| 253 | * depth and limit it if desired. |
| 254 | * |
| 255 | * @param capacity The new capacity. |
| 256 | * @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH. |
| 257 | * @return The error, if there is one. |
| 258 | */ |
| 259 | simdjson_warn_unused error_code allocate(size_t capacity, size_t max_depth=DEFAULT_MAX_DEPTH) noexcept; |
| 260 | |
| 261 | #ifdef SIMDJSON_THREADS_ENABLED |
| 262 | /** |
| 263 | * The parser instance can use threads when they are available to speed up some |
| 264 | * operations. It is enabled by default. Changing this attribute will change the |
| 265 | * behavior of the parser for future operations. |
| 266 | */ |
| 267 | bool threaded{true}; |
| 268 | #endif |
| 269 | |
| 270 | /** |
| 271 | * Unescape this JSON string, replacing \\ with \, \n with newline, etc. to a user-provided buffer. |
| 272 | * The result must be valid UTF-8. |
| 273 | * The provided pointer is advanced to the end of the string by reference, and a string_view instance |
| 274 | * is returned. You can ensure that your buffer is large enough by allocating a block of memory at least |
| 275 | * as large as the input JSON plus SIMDJSON_PADDING and then unescape all strings to this one buffer. |
| 276 | * |
| 277 | * This unescape function is a low-level function. If you want a more user-friendly approach, you should |
| 278 | * avoid raw_json_string instances (e.g., by calling unescaped_key() instead of key() or get_string() |
| 279 | * instead of get_raw_json_string()). |
| 280 | * |
| 281 | * ## IMPORTANT: string_view lifetime |
| 282 | * |
| 283 | * The string_view is only valid as long as the bytes in dst. |
| 284 | * |
| 285 | * @param in The raw JSON string to unescape. |
| 286 | * @param dst A pointer to a buffer at least large enough to write this string as well as |
| 287 | * an additional SIMDJSON_PADDING bytes. |
| 288 | * @param allow_replacement Whether we allow a replacement if the input string contains unmatched surrogate pairs. |
| 289 | * @return A string_view pointing at the unescaped string in dst |
| 290 | * @error STRING_ERROR if escapes are incorrect. |
| 291 | */ |
| 292 | simdjson_inline simdjson_result<std::string_view> unescape(raw_json_string in, uint8_t *&dst, bool allow_replacement = false) const noexcept; |
| 293 | |
| 294 | /** |
| 295 | * Unescape this JSON string, replacing \\ with \, \n with newline, etc. to a user-provided buffer. |
| 296 | * The result may not be valid UTF-8. See https://simonsapin.github.io/wtf-8/ |
| 297 | * The provided pointer is advanced to the end of the string by reference, and a string_view instance |
| 298 | * is returned. You can ensure that your buffer is large enough by allocating a block of memory at least |
| 299 | * as large as the input JSON plus SIMDJSON_PADDING and then unescape all strings to this one buffer. |
| 300 | * |
| 301 | * This unescape function is a low-level function. If you want a more user-friendly approach, you should |
| 302 | * avoid raw_json_string instances (e.g., by calling unescaped_key() instead of key() or get_string() |
| 303 | * instead of get_raw_json_string()). |
| 304 | * |
| 305 | * ## IMPORTANT: string_view lifetime |
| 306 | * |
| 307 | * The string_view is only valid as long as the bytes in dst. |
| 308 | * |
| 309 | * @param in The raw JSON string to unescape. |
| 310 | * @param dst A pointer to a buffer at least large enough to write this string as well as |
| 311 | * an additional SIMDJSON_PADDING bytes. |
| 312 | * @return A string_view pointing at the unescaped string in dst |
| 313 | * @error STRING_ERROR if escapes are incorrect. |
| 314 | */ |
| 315 | simdjson_inline simdjson_result<std::string_view> unescape_wobbly(raw_json_string in, uint8_t *&dst) const noexcept; |
| 316 | |
| 317 | private: |
| 318 | /** @private [for benchmarking access] The implementation to use */ |
| 319 | std::unique_ptr<internal::dom_parser_implementation> implementation{}; |
| 320 | size_t _capacity{0}; |
| 321 | size_t _max_capacity; |
| 322 | size_t _max_depth{DEFAULT_MAX_DEPTH}; |
| 323 | std::unique_ptr<uint8_t[]> string_buf{}; |
| 324 | #if SIMDJSON_DEVELOPMENT_CHECKS |
| 325 | std::unique_ptr<token_position[]> start_positions{}; |
| 326 | #endif |
| 327 | /* json_iterator and document_stream are friends, so they can reach the private members above. */ |
| 328 | friend class json_iterator; |
| 329 | friend class document_stream; |
| 330 | }; |
| 331 | |
| 332 | } // namespace ondemand |
| 333 | } // namespace SIMDJSON_IMPLEMENTATION |
| 334 | } // namespace simdjson |
| 335 | |
| 336 | namespace simdjson { |
| 337 | /** simdjson_result specialization for ondemand::parser. It can be built from a parser (moved in), from an error_code, or default-constructed; no other members are declared here, so the rest of its interface is inherited from implementation_simdjson_result_base. */ |
| 338 | template<> |
| 339 | struct simdjson_result<SIMDJSON_IMPLEMENTATION::ondemand::parser> : public SIMDJSON_IMPLEMENTATION::implementation_simdjson_result_base<SIMDJSON_IMPLEMENTATION::ondemand::parser> { |
| 340 | public: |
| 341 | simdjson_inline simdjson_result(SIMDJSON_IMPLEMENTATION::ondemand::parser &&value) noexcept; ///< @private |
| 342 | simdjson_inline simdjson_result(error_code error) noexcept; ///< @private |
| 343 | simdjson_inline simdjson_result() noexcept = default; |
| 344 | }; |
| 345 | |
| 346 | } // namespace simdjson |
| 347 | |