| 1 | #ifndef SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H |
| 2 | #define SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H |
| 3 | |
| 4 | #include "simdjson/common_defs.h" |
| 5 | #include "simdjson/error.h" |
| 6 | #include <memory> |
| 7 | |
| 8 | namespace simdjson { |
| 9 | |
| 10 | namespace dom { |
| 11 | class document; |
| 12 | } // namespace dom |
| 13 | |
/**
 * Selects how dom_parser_implementation::stage1 treats its input.
 */
enum class stage1_mode {
  /** The input is expected to be a fully formed JSON document. */
  regular,
  /** The input is possibly truncated and sits within a stream of JSON documents. */
  streaming_partial,
  /**
   * Final unterminated strings may be truncated. Useful in conjunction with
   * streaming_partial.
   */
  streaming_final
};
| 23 | |
| 24 | /** |
| 25 | * Returns true if mode == streaming_partial or mode == streaming_final |
| 26 | */ |
| 27 | inline bool is_streaming(stage1_mode mode) { |
| 28 | // performance note: it is probably faster to check that mode is different |
| 29 | // from regular than checking that it is either streaming_partial or streaming_final. |
| 30 | return (mode != stage1_mode::regular); |
| 31 | // return (mode == stage1_mode::streaming_partial || mode == stage1_mode::streaming_final); |
| 32 | } |
| 33 | |
| 34 | |
| 35 | namespace internal { |
| 36 | |
| 37 | |
/**
 * An implementation of simdjson's DOM parser for a particular CPU architecture.
 *
 * This class is expected to be accessed only by pointer, and never move in memory (though the
 * pointer can move).
 *
 * It is an abstract base: the parsing stages, the string unescaping routines and the
 * capacity/max-depth setters are pure virtual and are overridden by each implementation.
 */
class dom_parser_implementation {
public:

  /**
   * @private For internal implementation use
   *
   * Run a full JSON parse on a single document (stage1 + stage2).
   *
   * Guaranteed only to be called when capacity > document length.
   *
   * Overridden by each implementation.
   *
   * @param buf The json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
   * @param len The length of the json document.
   * @param doc The document to output to.
   * @return The error code, or SUCCESS if there was no error.
   */
  simdjson_warn_unused virtual error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept = 0;

  /**
   * @private For internal implementation use
   *
   * Stage 1 of the document parser.
   *
   * Guaranteed only to be called when capacity > document length.
   *
   * Overridden by each implementation.
   *
   * @param buf The json document to parse.
   * @param len The length of the json document.
   * @param streaming The stage1_mode to run in. A non-regular mode indicates streaming use
   *                  (presumably a call coming from parser::parse_many).
   * @return The error code, or SUCCESS if there was no error.
   */
  simdjson_warn_unused virtual error_code stage1(const uint8_t *buf, size_t len, stage1_mode streaming) noexcept = 0;

  /**
   * @private For internal implementation use
   *
   * Stage 2 of the document parser.
   *
   * Called after stage1().
   *
   * Overridden by each implementation.
   *
   * @param doc The document to output to.
   * @return The error code, or SUCCESS if there was no error.
   */
  simdjson_warn_unused virtual error_code stage2(dom::document &doc) noexcept = 0;

  /**
   * @private For internal implementation use
   *
   * Stage 2 of the document parser for parser::parse_many.
   *
   * Guaranteed only to be called after stage1().
   * Overridden by each implementation.
   *
   * @param doc The document to output to.
   * @return The error code, SUCCESS if there was no error, or EMPTY if all documents have been parsed.
   */
  simdjson_warn_unused virtual error_code stage2_next(dom::document &doc) noexcept = 0;

  /**
   * Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There
   * must be an unescaped quote terminating the string. It returns the final output
   * position as pointer. In case of error (e.g., the string has bad escaped codes),
   * then nullptr is returned. It is assumed that the output buffer is large
   * enough. E.g., if src points at 'joe"', then dst needs to have four free bytes +
   * SIMDJSON_PADDING bytes.
   *
   * Overridden by each implementation.
   *
   * @param src pointer to the beginning of a valid UTF-8 JSON string, must end with an unescaped quote.
   * @param dst pointer to a destination buffer, it must point to a region in memory of sufficient size.
   * @param allow_replacement whether we allow a replacement character when the UTF-8 contains unmatched surrogate pairs.
   * @return end of the written region (exclusive) or nullptr in case of error.
   */
  simdjson_warn_unused virtual uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept = 0;

  /**
   * Unescape a possibly invalid (NON-valid) UTF-8 string from src to dst, stopping at a final
   * unescaped quote. There must be an unescaped quote terminating the string. It returns the
   * final output position as pointer. In case of error (e.g., the string has bad escaped codes),
   * then nullptr is returned. It is assumed that the output buffer is large
   * enough. E.g., if src points at 'joe"', then dst needs to have four free bytes +
   * SIMDJSON_PADDING bytes.
   *
   * Overridden by each implementation.
   *
   * @param src pointer to the beginning of a possibly invalid UTF-8 JSON string, must end with an unescaped quote.
   * @param dst pointer to a destination buffer, it must point to a region in memory of sufficient size.
   * @return end of the written region (exclusive) or nullptr in case of error.
   */
  simdjson_warn_unused virtual uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept = 0;

  /**
   * Change the capacity of this parser.
   *
   * The capacity can never exceed SIMDJSON_MAXSIZE_BYTES (e.g., 4 GB)
   * and a CAPACITY error is returned if it is attempted.
   *
   * Generally used for reallocation.
   *
   * @param capacity The new capacity.
   * @return The error code, or SUCCESS if there was no error.
   */
  virtual error_code set_capacity(size_t capacity) noexcept = 0;

  /**
   * Change the max depth of this parser.
   *
   * Generally used for reallocation.
   *
   * @param max_depth The new max_depth.
   * @return The error code, or SUCCESS if there was no error.
   */
  virtual error_code set_max_depth(size_t max_depth) noexcept = 0;

  /**
   * Deallocate this parser.
   *
   * Virtual so that an implementation can be destroyed through a pointer to this base class.
   */
  virtual ~dom_parser_implementation() = default;

  /** Number of structural indices passed from stage 1 to stage 2 */
  uint32_t n_structural_indexes{0};
  /** Structural indices passed from stage 1 to stage 2 */
  std::unique_ptr<uint32_t[]> structural_indexes{};
  /** Next structural index to parse */
  uint32_t next_structural_index{0};

  /**
   * The largest document this parser can support without reallocating.
   *
   * @return Current capacity, in bytes.
   */
  simdjson_inline size_t capacity() const noexcept;

  /**
   * The maximum level of nested object and arrays supported by this parser.
   *
   * @return Maximum depth, as a number of nesting levels.
   */
  simdjson_inline size_t max_depth() const noexcept;

  /**
   * Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length
   * and `max_depth` depth.
   *
   * Both arguments are required; this declaration provides no default values.
   *
   * @param capacity The new capacity.
   * @param max_depth The new max_depth.
   * @return The error, if there is one.
   */
  simdjson_warn_unused inline error_code allocate(size_t capacity, size_t max_depth) noexcept;


protected:
  /**
   * The maximum document length this parser supports.
   *
   * Buffers are large enough to handle any document up to this length.
   */
  size_t _capacity{0};

  /**
   * The maximum depth (number of nested objects and arrays) supported by this parser.
   *
   * Initialized to 0 here.
   * NOTE(review): the previous comment said "Defaults to DEFAULT_MAX_DEPTH"; presumably that
   * default is applied by the code that allocates the parser -- confirm.
   */
  size_t _max_depth{0};

  // Declaring these so that subclasses can use them to implement their constructors.
  simdjson_inline dom_parser_implementation() noexcept;
  simdjson_inline dom_parser_implementation(dom_parser_implementation &&other) noexcept;
  simdjson_inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept;

  // Copying is disabled: both the copy constructor and copy assignment are deleted.
  simdjson_inline dom_parser_implementation(const dom_parser_implementation &) noexcept = delete;
  simdjson_inline dom_parser_implementation &operator=(const dom_parser_implementation &other) noexcept = delete;
}; // class dom_parser_implementation
| 223 | |
| 224 | simdjson_inline dom_parser_implementation::dom_parser_implementation() noexcept = default; |
| 225 | simdjson_inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&other) noexcept = default; |
| 226 | simdjson_inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&other) noexcept = default; |
| 227 | |
| 228 | simdjson_inline size_t dom_parser_implementation::capacity() const noexcept { |
| 229 | return _capacity; |
| 230 | } |
| 231 | |
| 232 | simdjson_inline size_t dom_parser_implementation::max_depth() const noexcept { |
| 233 | return _max_depth; |
| 234 | } |
| 235 | |
| 236 | simdjson_warn_unused |
| 237 | inline error_code dom_parser_implementation::allocate(size_t capacity, size_t max_depth) noexcept { |
| 238 | if (this->max_depth() != max_depth) { |
| 239 | error_code err = set_max_depth(max_depth); |
| 240 | if (err) { return err; } |
| 241 | } |
| 242 | if (_capacity != capacity) { |
| 243 | error_code err = set_capacity(capacity); |
| 244 | if (err) { return err; } |
| 245 | } |
| 246 | return SUCCESS; |
| 247 | } |
| 248 | |
| 249 | } // namespace internal |
| 250 | } // namespace simdjson |
| 251 | |
| 252 | #endif // SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H |
| 253 | |