1// Protocol Buffers - Google's data interchange format
2// Copyright 2008 Google Inc. All rights reserved.
3// https://developers.google.com/protocol-buffers/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are
7// met:
8//
9// * Redistributions of source code must retain the above copyright
10// notice, this list of conditions and the following disclaimer.
11// * Redistributions in binary form must reproduce the above
12// copyright notice, this list of conditions and the following disclaimer
13// in the documentation and/or other materials provided with the
14// distribution.
15// * Neither the name of Google Inc. nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31#ifndef GOOGLE_PROTOBUF_UTIL_INTERNAL_JSON_STREAM_PARSER_H__
32#define GOOGLE_PROTOBUF_UTIL_INTERNAL_JSON_STREAM_PARSER_H__
33
34#include <cstdint>
35#include <stack>
36#include <string>
37
38#include <google/protobuf/stubs/common.h>
39#include <google/protobuf/stubs/status.h>
40#include <google/protobuf/stubs/strutil.h>
41#include <google/protobuf/stubs/status.h>
42
43// Must be included last.
44#include <google/protobuf/port_def.inc>
45
46namespace google {
47namespace protobuf {
48namespace util {
49namespace converter {
50
51
52class ObjectWriter;
53
54// A JSON parser that can parse a stream of JSON chunks rather than needing the
55// entire JSON string up front. It is a modified version of the parser in
56// //net/proto/json/json-parser.h that has been changed in the following ways:
57// - Changed from recursion to an explicit stack to allow resumption
58// - Added support for int64 and uint64 numbers
59// - Removed support for octal and decimal escapes
60// - Removed support for numeric keys
61// - Removed support for functions (javascript)
62// - Removed some lax-comma support (but kept trailing comma support)
63// - Writes directly to an ObjectWriter rather than using subclassing
64//
65// Here is an example usage:
66// JsonStreamParser parser(ow_.get());
67// util::Status result = parser.Parse(chunk1);
68// result.Update(parser.Parse(chunk2));
69// result.Update(parser.FinishParse());
70// GOOGLE_DCHECK(result.ok()) << "Failed to parse JSON";
71//
72// This parser is thread-compatible as long as only one thread is calling a
73// Parse() method at a time.
74class PROTOBUF_EXPORT JsonStreamParser {
75 public:
76 // Creates a JsonStreamParser that will write to the given ObjectWriter.
77 explicit JsonStreamParser(ObjectWriter* ow);
78 virtual ~JsonStreamParser();
79
80 // Parses a UTF-8 encoded JSON string from a StringPiece. If the returned
81 // status is non-ok, the status might contain a payload ParseErrorType with
82 // type_url kParseErrorTypeUrl and a payload containing string snippet of the
83 // error with type_url kParseErrorSnippetUrl.
84 util::Status Parse(StringPiece json);
85
86
87 // Finish parsing the JSON string. If the returned status is non-ok, the
88 // status might contain a payload ParseErrorType with type_url
89 // kParseErrorTypeUrl and a payload containing string snippet of the error
90 // with type_url kParseErrorSnippetUrl.
91 util::Status FinishParse();
92
93
94 // Sets the max recursion depth of JSON message to be deserialized. JSON
95 // messages over this depth will fail to be deserialized.
96 // Default value is 100.
97 void set_max_recursion_depth(int max_depth) {
98 max_recursion_depth_ = max_depth;
99 }
100
101 // Denotes the cause of error.
102 enum ParseErrorType {
103 UNKNOWN_PARSE_ERROR,
104 OCTAL_OR_HEX_ARE_NOT_VALID_JSON_VALUES,
105 EXPECTED_COLON,
106 EXPECTED_COMMA_OR_BRACKET,
107 EXPECTED_VALUE,
108 EXPECTED_COMMA_OR_BRACES,
109 EXPECTED_OBJECT_KEY_OR_BRACES,
110 EXPECTED_VALUE_OR_BRACKET,
111 INVALID_KEY_OR_VARIABLE_NAME,
112 NON_UTF_8,
113 PARSING_TERMINATED_BEFORE_END_OF_INPUT,
114 UNEXPECTED_TOKEN,
115 EXPECTED_CLOSING_QUOTE,
116 ILLEGAL_HEX_STRING,
117 INVALID_ESCAPE_SEQUENCE,
118 MISSING_LOW_SURROGATE,
119 INVALID_LOW_SURROGATE,
120 INVALID_UNICODE,
121 UNABLE_TO_PARSE_NUMBER,
122 NUMBER_EXCEEDS_RANGE_DOUBLE
123 };
124
125 private:
126 friend class JsonStreamParserTest;
127 // Return the current recursion depth.
128 int recursion_depth() { return recursion_depth_; }
129
130 enum TokenType {
131 BEGIN_STRING, // " or '
132 BEGIN_NUMBER, // - or digit
133 BEGIN_TRUE, // true
134 BEGIN_FALSE, // false
135 BEGIN_NULL, // null
136 BEGIN_OBJECT, // {
137 END_OBJECT, // }
138 BEGIN_ARRAY, // [
139 END_ARRAY, // ]
140 ENTRY_SEPARATOR, // :
141 VALUE_SEPARATOR, // ,
142 BEGIN_KEY, // letter, _, $ or digit. Must begin with non-digit
143 UNKNOWN // Unknown token or we ran out of the stream.
144 };
145
146 enum ParseType {
147 VALUE, // Expects a {, [, true, false, null, string or number
148 OBJ_MID, // Expects a ',' or }
149 ENTRY, // Expects a key or }
150 ENTRY_MID, // Expects a :
151 ARRAY_VALUE, // Expects a value or ]
152 ARRAY_MID // Expects a ',' or ]
153 };
154
155 // Holds the result of parsing a number
156 struct NumberResult {
157 enum Type { DOUBLE, INT, UINT };
158 Type type;
159 union {
160 double double_val;
161 int64_t int_val;
162 uint64_t uint_val;
163 };
164 };
165
166 // Parses a single chunk of JSON, returning an error if the JSON was invalid.
167 util::Status ParseChunk(StringPiece chunk);
168
169 // Runs the parser based on stack_ and p_, until the stack is empty or p_ runs
170 // out of data. If we unexpectedly run out of p_ we push the latest back onto
171 // the stack and return.
172 util::Status RunParser();
173
174 // Parses a value from p_ and writes it to ow_.
175 // A value may be an object, array, true, false, null, string or number.
176 util::Status ParseValue(TokenType type);
177
178 // Parses a string and writes it out to the ow_.
179 util::Status ParseString();
180
181 // Parses a string, storing the result in parsed_.
182 util::Status ParseStringHelper();
183
184 // This function parses unicode escape sequences in strings. It returns an
185 // error when there's a parsing error, either the size is not the expected
186 // size or a character is not a hex digit. When it returns str will contain
187 // what has been successfully parsed so far.
188 util::Status ParseUnicodeEscape();
189
190 // Expects p_ to point to a JSON number, writes the number to the writer using
191 // the appropriate Render method based on the type of number.
192 util::Status ParseNumber();
193
194 // Parse a number into a NumberResult, reporting an error if no number could
195 // be parsed. This method will try to parse into a uint64, int64, or double
196 // based on whether the number was positive or negative or had a decimal
197 // component.
198 util::Status ParseNumberHelper(NumberResult* result);
199
200 // Parse a number as double into a NumberResult.
201 util::Status ParseDoubleHelper(const std::string& number,
202 NumberResult* result);
203
204 // Handles a { during parsing of a value.
205 util::Status HandleBeginObject();
206
207 // Parses from the ENTRY state.
208 util::Status ParseEntry(TokenType type);
209
210 // Parses from the ENTRY_MID state.
211 util::Status ParseEntryMid(TokenType type);
212
213 // Parses from the OBJ_MID state.
214 util::Status ParseObjectMid(TokenType type);
215
216 // Handles a [ during parsing of a value.
217 util::Status HandleBeginArray();
218
219 // Parses from the ARRAY_VALUE state.
220 util::Status ParseArrayValue(TokenType type);
221
222 // Parses from the ARRAY_MID state.
223 util::Status ParseArrayMid(TokenType type);
224
225 // Expects p_ to point to an unquoted literal
226 util::Status ParseTrue();
227 util::Status ParseFalse();
228 util::Status ParseNull();
229 util::Status ParseEmptyNull();
230
231 // Whether an empty-null is allowed in the current state.
232 bool IsEmptyNullAllowed(TokenType type);
233
234 // Whether the whole input is all whitespaces.
235 bool IsInputAllWhiteSpaces(TokenType type);
236
237 // Report a failure as a util::Status.
238 util::Status ReportFailure(StringPiece message,
239 ParseErrorType parse_code);
240
241 // Report a failure due to an UNKNOWN token type. We check if we hit the
242 // end of the stream and if we're finishing or not to detect what type of
243 // status to return in this case.
244 util::Status ReportUnknown(StringPiece message,
245 ParseErrorType parse_code);
246
247 // Helper function to check recursion depth and increment it. It will return
248 // OkStatus() if the current depth is allowed. Otherwise an error is returned.
249 // key is used for error reporting.
250 util::Status IncrementRecursionDepth(StringPiece key) const;
251
252 // Advance p_ past all whitespace or until the end of the string.
253 void SkipWhitespace();
254
255 // Advance p_ one UTF-8 character
256 void Advance();
257
258 // Expects p_ to point to the beginning of a key.
259 util::Status ParseKey();
260
261 // Return the type of the next token at p_.
262 TokenType GetNextTokenType();
263
264 // The object writer to write parse events to.
265 ObjectWriter* ow_;
266
267 // The stack of parsing we still need to do. When the stack runs empty we will
268 // have parsed a single value from the root (e.g. an object or list).
269 std::stack<ParseType> stack_;
270
271 // Contains any leftover text from a previous chunk that we weren't able to
272 // fully parse, for example the start of a key or number.
273 std::string leftover_;
274
275 // The current chunk of JSON being parsed. Primarily used for providing
276 // context during error reporting.
277 StringPiece json_;
278
279 // A pointer within the current JSON being parsed, used to track location.
280 StringPiece p_;
281
282 // Stores the last key read, as we separate parsing of keys and values.
283 StringPiece key_;
284
285 // Storage for key_ if we need to keep ownership, for example between chunks
286 // or if the key was unescaped from a JSON string.
287 std::string key_storage_;
288
289 // True during the FinishParse() call, so we know that any errors are fatal.
290 // For example an unterminated string will normally result in cancelling and
291 // trying during the next chunk, but during FinishParse() it is an error.
292 bool finishing_;
293
294 // Whether non whitespace tokens have been seen during parsing.
295 // It is used to handle the case of a pure whitespace stream input.
296 bool seen_non_whitespace_;
297
298 // The JsonStreamParser requires a root element by default and it will raise
299 // error if the root element is missing. If `allow_no_root_element_` is true,
300 // the JsonStreamParser can also handle this case.
301 bool allow_no_root_element_;
302
303 // String we parsed during a call to ParseStringHelper().
304 StringPiece parsed_;
305
306 // Storage for the string we parsed. This may be empty if the string was able
307 // to be parsed directly from the input.
308 std::string parsed_storage_;
309
310 // The character that opened the string, either ' or ".
311 // A value of 0 indicates that string parsing is not in process.
312 char string_open_;
313
314 // Storage for the chunk that are being parsed in ParseChunk().
315 std::string chunk_storage_;
316
317 // Whether to allow non UTF-8 encoded input and replace invalid code points.
318 bool coerce_to_utf8_;
319
320 // Replacement character for invalid UTF-8 code points.
321 std::string utf8_replacement_character_;
322
323 // Whether allows empty string represented null array value or object entry
324 // value.
325 bool allow_empty_null_;
326
327 // Whether unquoted object keys can contain embedded non-alphanumeric
328 // characters when this is unambiguous for parsing.
329 bool allow_permissive_key_naming_;
330
331 // Whether allows out-of-range floating point numbers or reject them.
332 bool loose_float_number_conversion_;
333
334 // Tracks current recursion depth.
335 mutable int recursion_depth_;
336
337 // Maximum allowed recursion depth.
338 int max_recursion_depth_;
339
340 GOOGLE_DISALLOW_IMPLICIT_CONSTRUCTORS(JsonStreamParser);
341};
342
343} // namespace converter
344} // namespace util
345} // namespace protobuf
346} // namespace google
347
348#include <google/protobuf/port_undef.inc>
349
350#endif // GOOGLE_PROTOBUF_UTIL_INTERNAL_JSON_STREAM_PARSER_H__
351