1 | // Protocol Buffers - Google's data interchange format |
2 | // Copyright 2008 Google Inc. All rights reserved. |
3 | // https://developers.google.com/protocol-buffers/ |
4 | // |
5 | // Redistribution and use in source and binary forms, with or without |
6 | // modification, are permitted provided that the following conditions are |
7 | // met: |
8 | // |
9 | // * Redistributions of source code must retain the above copyright |
10 | // notice, this list of conditions and the following disclaimer. |
11 | // * Redistributions in binary form must reproduce the above |
12 | // copyright notice, this list of conditions and the following disclaimer |
13 | // in the documentation and/or other materials provided with the |
14 | // distribution. |
15 | // * Neither the name of Google Inc. nor the names of its |
16 | // contributors may be used to endorse or promote products derived from |
17 | // this software without specific prior written permission. |
18 | // |
19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
30 | |
31 | #ifndef GOOGLE_PROTOBUF_UTIL_INTERNAL_JSON_STREAM_PARSER_H__ |
32 | #define GOOGLE_PROTOBUF_UTIL_INTERNAL_JSON_STREAM_PARSER_H__ |
33 | |
34 | #include <cstdint> |
35 | #include <stack> |
36 | #include <string> |
37 | |
38 | #include <google/protobuf/stubs/common.h> |
39 | #include <google/protobuf/stubs/status.h> |
40 | #include <google/protobuf/stubs/strutil.h> |
41 | #include <google/protobuf/stubs/status.h> |
42 | |
43 | // Must be included last. |
44 | #include <google/protobuf/port_def.inc> |
45 | |
46 | namespace google { |
47 | namespace protobuf { |
48 | namespace util { |
49 | namespace converter { |
50 | |
51 | |
// Forward declaration: this header only refers to ObjectWriter through a
// pointer (the JsonStreamParser constructor argument and its ow_ member), so
// the full class definition is not required here.
class ObjectWriter;
53 | |
54 | // A JSON parser that can parse a stream of JSON chunks rather than needing the |
55 | // entire JSON string up front. It is a modified version of the parser in |
56 | // //net/proto/json/json-parser.h that has been changed in the following ways: |
57 | // - Changed from recursion to an explicit stack to allow resumption |
58 | // - Added support for int64 and uint64 numbers |
59 | // - Removed support for octal and decimal escapes |
60 | // - Removed support for numeric keys |
61 | // - Removed support for functions (javascript) |
62 | // - Removed some lax-comma support (but kept trailing comma support) |
63 | // - Writes directly to an ObjectWriter rather than using subclassing |
64 | // |
65 | // Here is an example usage: |
66 | // JsonStreamParser parser(ow_.get()); |
67 | // util::Status result = parser.Parse(chunk1); |
68 | // result.Update(parser.Parse(chunk2)); |
69 | // result.Update(parser.FinishParse()); |
70 | // GOOGLE_DCHECK(result.ok()) << "Failed to parse JSON"; |
71 | // |
72 | // This parser is thread-compatible as long as only one thread is calling a |
73 | // Parse() method at a time. |
74 | class PROTOBUF_EXPORT JsonStreamParser { |
75 | public: |
76 | // Creates a JsonStreamParser that will write to the given ObjectWriter. |
77 | explicit JsonStreamParser(ObjectWriter* ow); |
78 | virtual ~JsonStreamParser(); |
79 | |
80 | // Parses a UTF-8 encoded JSON string from a StringPiece. If the returned |
81 | // status is non-ok, the status might contain a payload ParseErrorType with |
82 | // type_url kParseErrorTypeUrl and a payload containing string snippet of the |
83 | // error with type_url kParseErrorSnippetUrl. |
84 | util::Status Parse(StringPiece json); |
85 | |
86 | |
87 | // Finish parsing the JSON string. If the returned status is non-ok, the |
88 | // status might contain a payload ParseErrorType with type_url |
89 | // kParseErrorTypeUrl and a payload containing string snippet of the error |
90 | // with type_url kParseErrorSnippetUrl. |
91 | util::Status FinishParse(); |
92 | |
93 | |
94 | // Sets the max recursion depth of JSON message to be deserialized. JSON |
95 | // messages over this depth will fail to be deserialized. |
96 | // Default value is 100. |
97 | void set_max_recursion_depth(int max_depth) { |
98 | max_recursion_depth_ = max_depth; |
99 | } |
100 | |
101 | // Denotes the cause of error. |
102 | enum ParseErrorType { |
103 | UNKNOWN_PARSE_ERROR, |
104 | OCTAL_OR_HEX_ARE_NOT_VALID_JSON_VALUES, |
105 | EXPECTED_COLON, |
106 | EXPECTED_COMMA_OR_BRACKET, |
107 | EXPECTED_VALUE, |
108 | EXPECTED_COMMA_OR_BRACES, |
109 | EXPECTED_OBJECT_KEY_OR_BRACES, |
110 | EXPECTED_VALUE_OR_BRACKET, |
111 | INVALID_KEY_OR_VARIABLE_NAME, |
112 | NON_UTF_8, |
113 | PARSING_TERMINATED_BEFORE_END_OF_INPUT, |
114 | UNEXPECTED_TOKEN, |
115 | EXPECTED_CLOSING_QUOTE, |
116 | ILLEGAL_HEX_STRING, |
117 | INVALID_ESCAPE_SEQUENCE, |
118 | MISSING_LOW_SURROGATE, |
119 | INVALID_LOW_SURROGATE, |
120 | INVALID_UNICODE, |
121 | UNABLE_TO_PARSE_NUMBER, |
122 | NUMBER_EXCEEDS_RANGE_DOUBLE |
123 | }; |
124 | |
125 | private: |
126 | friend class JsonStreamParserTest; |
127 | // Return the current recursion depth. |
128 | int recursion_depth() { return recursion_depth_; } |
129 | |
130 | enum TokenType { |
131 | BEGIN_STRING, // " or ' |
132 | BEGIN_NUMBER, // - or digit |
133 | BEGIN_TRUE, // true |
134 | BEGIN_FALSE, // false |
135 | BEGIN_NULL, // null |
136 | BEGIN_OBJECT, // { |
137 | END_OBJECT, // } |
138 | BEGIN_ARRAY, // [ |
139 | END_ARRAY, // ] |
140 | ENTRY_SEPARATOR, // : |
141 | VALUE_SEPARATOR, // , |
142 | BEGIN_KEY, // letter, _, $ or digit. Must begin with non-digit |
143 | UNKNOWN // Unknown token or we ran out of the stream. |
144 | }; |
145 | |
146 | enum ParseType { |
147 | VALUE, // Expects a {, [, true, false, null, string or number |
148 | OBJ_MID, // Expects a ',' or } |
149 | ENTRY, // Expects a key or } |
150 | ENTRY_MID, // Expects a : |
151 | ARRAY_VALUE, // Expects a value or ] |
152 | ARRAY_MID // Expects a ',' or ] |
153 | }; |
154 | |
155 | // Holds the result of parsing a number |
156 | struct NumberResult { |
157 | enum Type { DOUBLE, INT, UINT }; |
158 | Type type; |
159 | union { |
160 | double double_val; |
161 | int64_t int_val; |
162 | uint64_t uint_val; |
163 | }; |
164 | }; |
165 | |
166 | // Parses a single chunk of JSON, returning an error if the JSON was invalid. |
167 | util::Status ParseChunk(StringPiece chunk); |
168 | |
169 | // Runs the parser based on stack_ and p_, until the stack is empty or p_ runs |
170 | // out of data. If we unexpectedly run out of p_ we push the latest back onto |
171 | // the stack and return. |
172 | util::Status RunParser(); |
173 | |
174 | // Parses a value from p_ and writes it to ow_. |
175 | // A value may be an object, array, true, false, null, string or number. |
176 | util::Status ParseValue(TokenType type); |
177 | |
178 | // Parses a string and writes it out to the ow_. |
179 | util::Status ParseString(); |
180 | |
181 | // Parses a string, storing the result in parsed_. |
182 | util::Status ParseStringHelper(); |
183 | |
184 | // This function parses unicode escape sequences in strings. It returns an |
185 | // error when there's a parsing error, either the size is not the expected |
186 | // size or a character is not a hex digit. When it returns str will contain |
187 | // what has been successfully parsed so far. |
188 | util::Status ParseUnicodeEscape(); |
189 | |
190 | // Expects p_ to point to a JSON number, writes the number to the writer using |
191 | // the appropriate Render method based on the type of number. |
192 | util::Status ParseNumber(); |
193 | |
194 | // Parse a number into a NumberResult, reporting an error if no number could |
195 | // be parsed. This method will try to parse into a uint64, int64, or double |
196 | // based on whether the number was positive or negative or had a decimal |
197 | // component. |
198 | util::Status ParseNumberHelper(NumberResult* result); |
199 | |
200 | // Parse a number as double into a NumberResult. |
201 | util::Status ParseDoubleHelper(const std::string& number, |
202 | NumberResult* result); |
203 | |
204 | // Handles a { during parsing of a value. |
205 | util::Status HandleBeginObject(); |
206 | |
207 | // Parses from the ENTRY state. |
208 | util::Status ParseEntry(TokenType type); |
209 | |
210 | // Parses from the ENTRY_MID state. |
211 | util::Status ParseEntryMid(TokenType type); |
212 | |
213 | // Parses from the OBJ_MID state. |
214 | util::Status ParseObjectMid(TokenType type); |
215 | |
216 | // Handles a [ during parsing of a value. |
217 | util::Status HandleBeginArray(); |
218 | |
219 | // Parses from the ARRAY_VALUE state. |
220 | util::Status ParseArrayValue(TokenType type); |
221 | |
222 | // Parses from the ARRAY_MID state. |
223 | util::Status ParseArrayMid(TokenType type); |
224 | |
225 | // Expects p_ to point to an unquoted literal |
226 | util::Status ParseTrue(); |
227 | util::Status ParseFalse(); |
228 | util::Status ParseNull(); |
229 | util::Status ParseEmptyNull(); |
230 | |
231 | // Whether an empty-null is allowed in the current state. |
232 | bool IsEmptyNullAllowed(TokenType type); |
233 | |
234 | // Whether the whole input is all whitespaces. |
235 | bool IsInputAllWhiteSpaces(TokenType type); |
236 | |
237 | // Report a failure as a util::Status. |
238 | util::Status ReportFailure(StringPiece message, |
239 | ParseErrorType parse_code); |
240 | |
241 | // Report a failure due to an UNKNOWN token type. We check if we hit the |
242 | // end of the stream and if we're finishing or not to detect what type of |
243 | // status to return in this case. |
244 | util::Status ReportUnknown(StringPiece message, |
245 | ParseErrorType parse_code); |
246 | |
247 | // Helper function to check recursion depth and increment it. It will return |
248 | // OkStatus() if the current depth is allowed. Otherwise an error is returned. |
249 | // key is used for error reporting. |
250 | util::Status IncrementRecursionDepth(StringPiece key) const; |
251 | |
252 | // Advance p_ past all whitespace or until the end of the string. |
253 | void SkipWhitespace(); |
254 | |
255 | // Advance p_ one UTF-8 character |
256 | void Advance(); |
257 | |
258 | // Expects p_ to point to the beginning of a key. |
259 | util::Status ParseKey(); |
260 | |
261 | // Return the type of the next token at p_. |
262 | TokenType GetNextTokenType(); |
263 | |
264 | // The object writer to write parse events to. |
265 | ObjectWriter* ow_; |
266 | |
267 | // The stack of parsing we still need to do. When the stack runs empty we will |
268 | // have parsed a single value from the root (e.g. an object or list). |
269 | std::stack<ParseType> stack_; |
270 | |
271 | // Contains any leftover text from a previous chunk that we weren't able to |
272 | // fully parse, for example the start of a key or number. |
273 | std::string leftover_; |
274 | |
275 | // The current chunk of JSON being parsed. Primarily used for providing |
276 | // context during error reporting. |
277 | StringPiece json_; |
278 | |
279 | // A pointer within the current JSON being parsed, used to track location. |
280 | StringPiece p_; |
281 | |
282 | // Stores the last key read, as we separate parsing of keys and values. |
283 | StringPiece key_; |
284 | |
285 | // Storage for key_ if we need to keep ownership, for example between chunks |
286 | // or if the key was unescaped from a JSON string. |
287 | std::string key_storage_; |
288 | |
289 | // True during the FinishParse() call, so we know that any errors are fatal. |
290 | // For example an unterminated string will normally result in cancelling and |
291 | // trying during the next chunk, but during FinishParse() it is an error. |
292 | bool finishing_; |
293 | |
294 | // Whether non whitespace tokens have been seen during parsing. |
295 | // It is used to handle the case of a pure whitespace stream input. |
296 | bool seen_non_whitespace_; |
297 | |
298 | // The JsonStreamParser requires a root element by default and it will raise |
299 | // error if the root element is missing. If `allow_no_root_element_` is true, |
300 | // the JsonStreamParser can also handle this case. |
301 | bool allow_no_root_element_; |
302 | |
303 | // String we parsed during a call to ParseStringHelper(). |
304 | StringPiece parsed_; |
305 | |
306 | // Storage for the string we parsed. This may be empty if the string was able |
307 | // to be parsed directly from the input. |
308 | std::string parsed_storage_; |
309 | |
310 | // The character that opened the string, either ' or ". |
311 | // A value of 0 indicates that string parsing is not in process. |
312 | char string_open_; |
313 | |
314 | // Storage for the chunk that are being parsed in ParseChunk(). |
315 | std::string chunk_storage_; |
316 | |
317 | // Whether to allow non UTF-8 encoded input and replace invalid code points. |
318 | bool coerce_to_utf8_; |
319 | |
320 | // Replacement character for invalid UTF-8 code points. |
321 | std::string utf8_replacement_character_; |
322 | |
323 | // Whether allows empty string represented null array value or object entry |
324 | // value. |
325 | bool allow_empty_null_; |
326 | |
327 | // Whether unquoted object keys can contain embedded non-alphanumeric |
328 | // characters when this is unambiguous for parsing. |
329 | bool allow_permissive_key_naming_; |
330 | |
331 | // Whether allows out-of-range floating point numbers or reject them. |
332 | bool loose_float_number_conversion_; |
333 | |
334 | // Tracks current recursion depth. |
335 | mutable int recursion_depth_; |
336 | |
337 | // Maximum allowed recursion depth. |
338 | int max_recursion_depth_; |
339 | |
340 | GOOGLE_DISALLOW_IMPLICIT_CONSTRUCTORS(JsonStreamParser); |
341 | }; |
342 | |
343 | } // namespace converter |
344 | } // namespace util |
345 | } // namespace protobuf |
346 | } // namespace google |
347 | |
348 | #include <google/protobuf/port_undef.inc> |
349 | |
350 | #endif // GOOGLE_PROTOBUF_UTIL_INTERNAL_JSON_STREAM_PARSER_H__ |
351 | |