1// Protocol Buffers - Google's data interchange format
2// Copyright 2008 Google Inc. All rights reserved.
3// https://developers.google.com/protocol-buffers/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are
7// met:
8//
9// * Redistributions of source code must retain the above copyright
10// notice, this list of conditions and the following disclaimer.
11// * Redistributions in binary form must reproduce the above
12// copyright notice, this list of conditions and the following disclaimer
13// in the documentation and/or other materials provided with the
14// distribution.
15// * Neither the name of Google Inc. nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31#include <google/protobuf/util/internal/json_stream_parser.h>
32
33#include <algorithm>
34#include <cctype>
35#include <cmath>
36#include <memory>
37#include <stack>
38#include <string>
39
40#include <google/protobuf/stubs/common.h>
41#include <google/protobuf/stubs/logging.h>
42#include <google/protobuf/stubs/strutil.h>
43#include <google/protobuf/stubs/status.h>
44#include <google/protobuf/util/internal/object_writer.h>
45#include <google/protobuf/util/internal/json_escaping.h>
46
47
48namespace google {
49namespace protobuf {
50namespace util {
51
52namespace converter {
53
54// Number of digits in an escaped UTF-16 code unit ('\\' 'u' X X X X)
55static const int kUnicodeEscapedLength = 6;
56
57static const int kDefaultMaxRecursionDepth = 100;
58
59// These cannot be constexpr for portability with VS2015.
60static const StringPiece kKeywordTrue = "true";
61static const StringPiece kKeywordFalse = "false";
62static const StringPiece kKeywordNull = "null";
63
// Returns true if `c` may start a bare (unquoted) key: an ASCII letter, an
// underscore or a dollar sign.
inline bool IsLetter(char c) {
  if (c == '_' || c == '$') return true;
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
68
69inline bool IsAlphanumeric(char c) {
70 return IsLetter(c) || ('0' <= c && c <= '9');
71}
72
73// Indicates a character may not be part of an unquoted key.
74inline bool IsKeySeparator(char c) {
75 return (ascii_isspace(c) || c == '"' || c == '\'' || c == '{' ||
76 c == '}' || c == '[' || c == ']' || c == ':' || c == ',');
77}
78
79inline void ReplaceInvalidCodePoints(StringPiece str,
80 const std::string& replacement,
81 std::string* dst) {
82 while (!str.empty()) {
83 int n_valid_bytes = internal::UTF8SpnStructurallyValid(str);
84 StringPiece valid_part = str.substr(pos: 0, n: n_valid_bytes);
85 StrAppend(dest: dst, a: valid_part);
86
87 if (n_valid_bytes == str.size()) {
88 break;
89 }
90
91 // Append replacement value.
92 StrAppend(dest: dst, a: replacement);
93
94 // Move past valid bytes + one invalid byte.
95 str.remove_prefix(n: n_valid_bytes + 1);
96 }
97}
98
99static bool ConsumeKey(StringPiece* input, StringPiece* key) {
100 if (input->empty() || !IsLetter(c: (*input)[0])) return false;
101 int len = 1;
102 for (; len < input->size(); ++len) {
103 if (!IsAlphanumeric(c: (*input)[len])) {
104 break;
105 }
106 }
107 *key = StringPiece(input->data(), len);
108 *input = StringPiece(input->data() + len, input->size() - len);
109 return true;
110}
111
112// Same as 'ConsumeKey', but allows a widened set of key characters.
113static bool ConsumeKeyPermissive(StringPiece* input,
114 StringPiece* key) {
115 if (input->empty() || !IsLetter(c: (*input)[0])) return false;
116 int len = 1;
117 for (; len < input->size(); ++len) {
118 if (IsKeySeparator(c: (*input)[len])) {
119 break;
120 }
121 }
122 *key = StringPiece(input->data(), len);
123 *input = StringPiece(input->data() + len, input->size() - len);
124 return true;
125}
126
127static bool MatchKey(StringPiece input) {
128 return !input.empty() && IsLetter(c: input[0]);
129}
130
131JsonStreamParser::JsonStreamParser(ObjectWriter* ow)
132 : ow_(ow),
133 stack_(),
134 leftover_(),
135 json_(),
136 p_(),
137 key_(),
138 key_storage_(),
139 finishing_(false),
140 seen_non_whitespace_(false),
141 allow_no_root_element_(false),
142 parsed_(),
143 parsed_storage_(),
144 string_open_(0),
145 chunk_storage_(),
146 coerce_to_utf8_(false),
147 utf8_replacement_character_(" "),
148 allow_empty_null_(false),
149 allow_permissive_key_naming_(false),
150 loose_float_number_conversion_(false),
151 recursion_depth_(0),
152 max_recursion_depth_(kDefaultMaxRecursionDepth) {
153 // Initialize the stack with a single value to be parsed.
154 stack_.push(x: VALUE);
155}
156
157JsonStreamParser::~JsonStreamParser() {}
158
159
160util::Status JsonStreamParser::Parse(StringPiece json) {
161 StringPiece chunk = json;
162 // If we have leftovers from a previous chunk, append the new chunk to it
163 // and create a new StringPiece pointing at the string's data. This could
164 // be large but we rely on the chunks to be small, assuming they are
165 // fragments of a Cord.
166 if (!leftover_.empty()) {
167 // Don't point chunk to leftover_ because leftover_ will be updated in
168 // ParseChunk(chunk).
169 chunk_storage_.swap(s&: leftover_);
170 StrAppend(dest: &chunk_storage_, a: json);
171 chunk = StringPiece(chunk_storage_);
172 }
173
174 // Find the structurally valid UTF8 prefix and parse only that.
175 int n = internal::UTF8SpnStructurallyValid(str: chunk);
176 if (n > 0) {
177 util::Status status = ParseChunk(chunk: chunk.substr(pos: 0, n));
178
179 // Any leftover characters are stashed in leftover_ for later parsing when
180 // there is more data available.
181 StrAppend(dest: &leftover_, a: chunk.substr(pos: n));
182 return status;
183 } else {
184 leftover_.assign(s: chunk.data(), n: chunk.size());
185 return util::Status();
186 }
187}
188
189util::Status JsonStreamParser::FinishParse() {
190 // If we do not expect anything and there is nothing left to parse we're all
191 // done.
192 if (stack_.empty() && leftover_.empty()) {
193 return util::Status();
194 }
195
196 // Lifetime needs to last until RunParser returns, so keep this variable
197 // outside of the coerce_to_utf8 block.
198 std::unique_ptr<std::string> scratch;
199
200 bool is_valid_utf8 = internal::IsStructurallyValidUTF8(str: leftover_);
201 if (coerce_to_utf8_ && !is_valid_utf8) {
202 scratch.reset(p: new std::string);
203 scratch->reserve(res_arg: leftover_.size() * utf8_replacement_character_.size());
204 ReplaceInvalidCodePoints(str: leftover_, replacement: utf8_replacement_character_,
205 dst: scratch.get());
206 p_ = json_ = *scratch;
207 } else {
208 p_ = json_ = leftover_;
209 if (!is_valid_utf8) {
210 return ReportFailure(message: "Encountered non UTF-8 code points.",
211 parse_code: ParseErrorType::NON_UTF_8);
212 }
213 }
214
215 // Parse the remainder in finishing mode, which reports errors for things like
216 // unterminated strings or unknown tokens that would normally be retried.
217 finishing_ = true;
218 util::Status result = RunParser();
219 if (result.ok()) {
220 SkipWhitespace();
221 if (!p_.empty()) {
222 result =
223 ReportFailure(message: "Parsing terminated before end of input.",
224 parse_code: ParseErrorType::PARSING_TERMINATED_BEFORE_END_OF_INPUT);
225 }
226 }
227 return result;
228}
229
230util::Status JsonStreamParser::ParseChunk(StringPiece chunk) {
231 // Do not do any work if the chunk is empty.
232 if (chunk.empty()) return util::Status();
233
234 p_ = json_ = chunk;
235
236 finishing_ = false;
237 util::Status result = RunParser();
238 if (!result.ok()) return result;
239
240 SkipWhitespace();
241 if (p_.empty()) {
242 // If we parsed everything we had, clear the leftover.
243 leftover_.clear();
244 } else {
245 // If we do not expect anything i.e. stack is empty, and we have non-empty
246 // string left to parse, we report an error.
247 if (stack_.empty()) {
248 return ReportFailure(
249 message: "Parsing terminated before end of input.",
250 parse_code: ParseErrorType::PARSING_TERMINATED_BEFORE_END_OF_INPUT);
251 }
252 // If we expect future data i.e. stack is non-empty, and we have some
253 // unparsed data left, we save it for later parse.
254 leftover_ = std::string(p_);
255 }
256 return util::Status();
257}
258
259bool JsonStreamParser::IsInputAllWhiteSpaces(TokenType type) {
260 // Conclude the whole input is full of white spaces by:
261 // - it is at the finishing stage
262 // - we have run out of the input data
263 // - haven't seen non-whitespace char so far
264 if (finishing_ && p_.empty() && type == UNKNOWN && !seen_non_whitespace_) {
265 return true;
266 }
267 return false;
268}
269
270util::Status JsonStreamParser::RunParser() {
271 while (!stack_.empty()) {
272 ParseType type = stack_.top();
273 TokenType t = (string_open_ == 0) ? GetNextTokenType() : BEGIN_STRING;
274 stack_.pop();
275 util::Status result;
276 switch (type) {
277 case VALUE:
278 if (allow_no_root_element_ && IsInputAllWhiteSpaces(type: t)) {
279 return util::Status();
280 }
281 result = ParseValue(type: t);
282 break;
283
284 case OBJ_MID:
285 result = ParseObjectMid(type: t);
286 break;
287
288 case ENTRY:
289 result = ParseEntry(type: t);
290 break;
291
292 case ENTRY_MID:
293 result = ParseEntryMid(type: t);
294 break;
295
296 case ARRAY_VALUE:
297 result = ParseArrayValue(type: t);
298 break;
299
300 case ARRAY_MID:
301 result = ParseArrayMid(type: t);
302 break;
303
304 default:
305 result =
306 util::InternalError(message: StrCat(a: "Unknown parse type: ", b: type));
307 break;
308 }
309 if (!result.ok()) {
310 // If we were cancelled, save our state and try again later.
311 if (!finishing_ && util::IsCancelled(status: result)) {
312 stack_.push(x: type);
313 // If we have a key we still need to render, make sure to save off the
314 // contents in our own storage.
315 if (!key_.empty() && key_storage_.empty()) {
316 StrAppend(dest: &key_storage_, a: key_);
317 key_ = StringPiece(key_storage_);
318 }
319 result = util::Status();
320 }
321 return result;
322 }
323 }
324 return util::Status();
325}
326
327util::Status JsonStreamParser::ParseValue(TokenType type) {
328 switch (type) {
329 case BEGIN_OBJECT:
330 return HandleBeginObject();
331 case BEGIN_ARRAY:
332 return HandleBeginArray();
333 case BEGIN_STRING:
334 return ParseString();
335 case BEGIN_NUMBER:
336 return ParseNumber();
337 case BEGIN_TRUE:
338 return ParseTrue();
339 case BEGIN_FALSE:
340 return ParseFalse();
341 case BEGIN_NULL:
342 return ParseNull();
343 case UNKNOWN:
344 return ReportUnknown(message: "Expected a value.", parse_code: ParseErrorType::EXPECTED_VALUE);
345 default: {
346 // Special case for having been cut off while parsing, wait for more data.
347 // This handles things like 'fals' being at the end of the string, we
348 // don't know if the next char would be e, completing it, or something
349 // else, making it invalid.
350 if (!finishing_ && p_.length() < kKeywordFalse.length()) {
351 return util::CancelledError(message: "");
352 }
353
354 if (allow_empty_null_ && IsEmptyNullAllowed(type)) {
355 return ParseEmptyNull();
356 }
357 return ReportFailure(message: "Unexpected token.",
358 parse_code: ParseErrorType::UNEXPECTED_TOKEN);
359 }
360 }
361}
362
363util::Status JsonStreamParser::ParseString() {
364 util::Status result = ParseStringHelper();
365 if (result.ok()) {
366 ow_->RenderString(name: key_, value: parsed_);
367 key_ = StringPiece();
368 parsed_ = StringPiece();
369 parsed_storage_.clear();
370 }
371 return result;
372}
373
374util::Status JsonStreamParser::ParseStringHelper() {
375 // If we haven't seen the start quote, grab it and remember it for later.
376 if (string_open_ == 0) {
377 string_open_ = *p_.data();
378 GOOGLE_DCHECK(string_open_ == '\"' || string_open_ == '\'');
379 Advance();
380 }
381 // Track where we last copied data from so we can minimize copying.
382 const char* last = p_.data();
383 while (!p_.empty()) {
384 const char* data = p_.data();
385 if (*data == '\\') {
386 // We're about to handle an escape, copy all bytes from last to data.
387 if (last < data) {
388 parsed_storage_.append(s: last, n: data - last);
389 }
390 // If we ran out of string after the \, cancel or report an error
391 // depending on if we expect more data later.
392 if (p_.length() == 1) {
393 if (!finishing_) {
394 return util::CancelledError(message: "");
395 }
396 return ReportFailure(message: "Closing quote expected in string.",
397 parse_code: ParseErrorType::EXPECTED_CLOSING_QUOTE);
398 }
399 // Parse a unicode escape if we found \u in the string.
400 if (data[1] == 'u') {
401 util::Status result = ParseUnicodeEscape();
402 if (!result.ok()) {
403 return result;
404 }
405 // Move last pointer past the unicode escape and continue.
406 last = p_.data();
407 continue;
408 }
409 // Handle the standard set of backslash-escaped characters.
410 switch (data[1]) {
411 case 'b':
412 parsed_storage_.push_back(c: '\b');
413 break;
414 case 'f':
415 parsed_storage_.push_back(c: '\f');
416 break;
417 case 'n':
418 parsed_storage_.push_back(c: '\n');
419 break;
420 case 'r':
421 parsed_storage_.push_back(c: '\r');
422 break;
423 case 't':
424 parsed_storage_.push_back(c: '\t');
425 break;
426 case 'v':
427 parsed_storage_.push_back(c: '\v');
428 break;
429 default:
430 parsed_storage_.push_back(c: data[1]);
431 }
432 // We handled two characters, so advance past them and continue.
433 p_.remove_prefix(n: 2);
434 last = p_.data();
435 continue;
436 }
437 // If we found the closing quote note it, advance past it, and return.
438 if (*data == string_open_) {
439 // If we didn't copy anything, reuse the input buffer.
440 if (parsed_storage_.empty()) {
441 parsed_ = StringPiece(last, data - last);
442 } else {
443 if (last < data) {
444 parsed_storage_.append(s: last, n: data - last);
445 }
446 parsed_ = StringPiece(parsed_storage_);
447 }
448 // Clear the quote char so next time we try to parse a string we'll
449 // start fresh.
450 string_open_ = 0;
451 Advance();
452 return util::Status();
453 }
454 // Normal character, just advance past it.
455 Advance();
456 }
457 // If we ran out of characters, copy over what we have so far.
458 if (last < p_.data()) {
459 parsed_storage_.append(s: last, n: p_.data() - last);
460 }
461 // If we didn't find the closing quote but we expect more data, cancel for now
462 if (!finishing_) {
463 return util::CancelledError(message: "");
464 }
465 // End of string reached without a closing quote, report an error.
466 string_open_ = 0;
467 return ReportFailure(message: "Closing quote expected in string.",
468 parse_code: ParseErrorType::EXPECTED_CLOSING_QUOTE);
469}
470
471// Converts a unicode escaped character to a decimal value stored in a char32
472// for use in UTF8 encoding utility. We assume that str begins with \uhhhh and
473// convert that from the hex number to a decimal value.
474//
475// There are some security exploits with UTF-8 that we should be careful of:
476// - http://www.unicode.org/reports/tr36/#UTF-8_Exploit
477// - http://sites/intl-eng/design-guide/core-application
478util::Status JsonStreamParser::ParseUnicodeEscape() {
479 if (p_.length() < kUnicodeEscapedLength) {
480 if (!finishing_) {
481 return util::CancelledError(message: "");
482 }
483 return ReportFailure(message: "Illegal hex string.",
484 parse_code: ParseErrorType::ILLEGAL_HEX_STRING);
485 }
486 GOOGLE_DCHECK_EQ('\\', p_.data()[0]);
487 GOOGLE_DCHECK_EQ('u', p_.data()[1]);
488 uint32_t code = 0;
489 for (int i = 2; i < kUnicodeEscapedLength; ++i) {
490 if (!isxdigit(p_.data()[i])) {
491 return ReportFailure(message: "Invalid escape sequence.",
492 parse_code: ParseErrorType::INVALID_ESCAPE_SEQUENCE);
493 }
494 code = (code << 4) + hex_digit_to_int(c: p_.data()[i]);
495 }
496 if (code >= JsonEscaping::kMinHighSurrogate &&
497 code <= JsonEscaping::kMaxHighSurrogate) {
498 if (p_.length() < 2 * kUnicodeEscapedLength) {
499 if (!finishing_) {
500 return util::CancelledError(message: "");
501 }
502 if (!coerce_to_utf8_) {
503 return ReportFailure(message: "Missing low surrogate.",
504 parse_code: ParseErrorType::MISSING_LOW_SURROGATE);
505 }
506 } else if (p_.data()[kUnicodeEscapedLength] == '\\' &&
507 p_.data()[kUnicodeEscapedLength + 1] == 'u') {
508 uint32_t low_code = 0;
509 for (int i = kUnicodeEscapedLength + 2; i < 2 * kUnicodeEscapedLength;
510 ++i) {
511 if (!isxdigit(p_.data()[i])) {
512 return ReportFailure(message: "Invalid escape sequence.",
513 parse_code: ParseErrorType::INVALID_ESCAPE_SEQUENCE);
514 }
515 low_code = (low_code << 4) + hex_digit_to_int(c: p_.data()[i]);
516 }
517 if (low_code >= JsonEscaping::kMinLowSurrogate &&
518 low_code <= JsonEscaping::kMaxLowSurrogate) {
519 // Convert UTF-16 surrogate pair to 21-bit Unicode codepoint.
520 code = (((code & 0x3FF) << 10) | (low_code & 0x3FF)) +
521 JsonEscaping::kMinSupplementaryCodePoint;
522 // Advance past the first code unit escape.
523 p_.remove_prefix(n: kUnicodeEscapedLength);
524 } else if (!coerce_to_utf8_) {
525 return ReportFailure(message: "Invalid low surrogate.",
526 parse_code: ParseErrorType::INVALID_LOW_SURROGATE);
527 }
528 } else if (!coerce_to_utf8_) {
529 return ReportFailure(message: "Missing low surrogate.",
530 parse_code: ParseErrorType::MISSING_LOW_SURROGATE);
531 }
532 }
533 if (!coerce_to_utf8_ && !IsValidCodePoint(code_point: code)) {
534 return ReportFailure(message: "Invalid unicode code point.",
535 parse_code: ParseErrorType::INVALID_UNICODE);
536 }
537 char buf[UTFmax];
538 int len = EncodeAsUTF8Char(code_point: code, output: buf);
539 // Advance past the [final] code unit escape.
540 p_.remove_prefix(n: kUnicodeEscapedLength);
541 parsed_storage_.append(s: buf, n: len);
542 return util::Status();
543}
544
545util::Status JsonStreamParser::ParseNumber() {
546 NumberResult number;
547 util::Status result = ParseNumberHelper(result: &number);
548 if (result.ok()) {
549 switch (number.type) {
550 case NumberResult::DOUBLE:
551 ow_->RenderDouble(name: key_, value: number.double_val);
552 key_ = StringPiece();
553 break;
554
555 case NumberResult::INT:
556 ow_->RenderInt64(name: key_, value: number.int_val);
557 key_ = StringPiece();
558 break;
559
560 case NumberResult::UINT:
561 ow_->RenderUint64(name: key_, value: number.uint_val);
562 key_ = StringPiece();
563 break;
564
565 default:
566 return ReportFailure(message: "Unable to parse number.",
567 parse_code: ParseErrorType::UNABLE_TO_PARSE_NUMBER);
568 }
569 }
570 return result;
571}
572
573util::Status JsonStreamParser::ParseDoubleHelper(const std::string& number,
574 NumberResult* result) {
575 if (!safe_strtod(str: number, value: &result->double_val)) {
576 return ReportFailure(message: "Unable to parse number.",
577 parse_code: ParseErrorType::UNABLE_TO_PARSE_NUMBER);
578 }
579 if (!loose_float_number_conversion_ && !std::isfinite(x: result->double_val)) {
580 return ReportFailure(message: "Number exceeds the range of double.",
581 parse_code: ParseErrorType::NUMBER_EXCEEDS_RANGE_DOUBLE);
582 }
583 result->type = NumberResult::DOUBLE;
584 return util::Status();
585}
586
587util::Status JsonStreamParser::ParseNumberHelper(NumberResult* result) {
588 const char* data = p_.data();
589 int length = p_.length();
590
591 // Look for the first non-numeric character, or the end of the string.
592 int index = 0;
593 bool floating = false;
594 bool negative = data[index] == '-';
595 // Find the first character that cannot be part of the number. Along the way
596 // detect if the number needs to be parsed as a double.
597 // Note that this restricts numbers to the JSON specification, so for example
598 // we do not support hex or octal notations.
599 for (; index < length; ++index) {
600 char c = data[index];
601 if (isdigit(c)) continue;
602 if (c == '.' || c == 'e' || c == 'E') {
603 floating = true;
604 continue;
605 }
606 if (c == '+' || c == '-' || c == 'x') continue;
607 // Not a valid number character, break out.
608 break;
609 }
610
611 // If the entire input is a valid number, and we may have more content in the
612 // future, we abort for now and resume when we know more.
613 if (index == length && !finishing_) {
614 return util::CancelledError(message: "");
615 }
616
617 // Create a string containing just the number, so we can use safe_strtoX
618 std::string number = std::string(p_.substr(pos: 0, n: index));
619
620 // Floating point number, parse as a double.
621 if (floating) {
622 util::Status status = ParseDoubleHelper(number, result);
623 if (status.ok()) {
624 p_.remove_prefix(n: index);
625 }
626 return status;
627 }
628
629 // Positive non-floating point number, parse as a uint64_t.
630 if (!negative) {
631 // Octal/Hex numbers are not valid JSON values.
632 if (number.length() >= 2 && number[0] == '0') {
633 return ReportFailure(
634 message: "Octal/hex numbers are not valid JSON values.",
635 parse_code: ParseErrorType::OCTAL_OR_HEX_ARE_NOT_VALID_JSON_VALUES);
636 }
637 if (safe_strtou64(str: number, value: &result->uint_val)) {
638 result->type = NumberResult::UINT;
639 p_.remove_prefix(n: index);
640 return util::Status();
641 } else {
642 // If the value is too large, parse it as double.
643 util::Status status = ParseDoubleHelper(number, result);
644 if (status.ok()) {
645 p_.remove_prefix(n: index);
646 }
647 return status;
648 }
649 }
650
651 // Octal/Hex numbers are not valid JSON values.
652 if (number.length() >= 3 && number[1] == '0') {
653 return ReportFailure(
654 message: "Octal/hex numbers are not valid JSON values.",
655 parse_code: ParseErrorType::OCTAL_OR_HEX_ARE_NOT_VALID_JSON_VALUES);
656 }
657 // Negative non-floating point number, parse as an int64_t.
658 if (safe_strto64(str: number, value: &result->int_val)) {
659 result->type = NumberResult::INT;
660 p_.remove_prefix(n: index);
661 return util::Status();
662 } else {
663 // If the value is too large, parse it as double.
664 util::Status status = ParseDoubleHelper(number, result);
665 if (status.ok()) {
666 p_.remove_prefix(n: index);
667 }
668 return status;
669 }
670}
671
672util::Status JsonStreamParser::HandleBeginObject() {
673 GOOGLE_DCHECK_EQ('{', *p_.data());
674 Advance();
675 ow_->StartObject(name: key_);
676 auto status = IncrementRecursionDepth(key: key_);
677 if (!status.ok()) {
678 return status;
679 }
680 key_ = StringPiece();
681 stack_.push(x: ENTRY);
682 return util::Status();
683}
684
685util::Status JsonStreamParser::ParseObjectMid(TokenType type) {
686 if (type == UNKNOWN) {
687 return ReportUnknown(message: "Expected , or } after key:value pair.",
688 parse_code: ParseErrorType::EXPECTED_COMMA_OR_BRACES);
689 }
690
691 // Object is complete, advance past the comma and render the EndObject.
692 if (type == END_OBJECT) {
693 Advance();
694 ow_->EndObject();
695 --recursion_depth_;
696 return util::Status();
697 }
698 // Found a comma, advance past it and get ready for an entry.
699 if (type == VALUE_SEPARATOR) {
700 Advance();
701 stack_.push(x: ENTRY);
702 return util::Status();
703 }
704 // Illegal token after key:value pair.
705 return ReportFailure(message: "Expected , or } after key:value pair.",
706 parse_code: ParseErrorType::EXPECTED_COMMA_OR_BRACES);
707}
708
709util::Status JsonStreamParser::ParseEntry(TokenType type) {
710 if (type == UNKNOWN) {
711 return ReportUnknown(message: "Expected an object key or }.",
712 parse_code: ParseErrorType::EXPECTED_OBJECT_KEY_OR_BRACES);
713 }
714
715 // Close the object and return. This allows for trailing commas.
716 if (type == END_OBJECT) {
717 ow_->EndObject();
718 Advance();
719 --recursion_depth_;
720 return util::Status();
721 }
722
723 util::Status result;
724 if (type == BEGIN_STRING) {
725 // Key is a string (standard JSON), parse it and store the string.
726 result = ParseStringHelper();
727 if (result.ok()) {
728 key_storage_.clear();
729 if (!parsed_storage_.empty()) {
730 parsed_storage_.swap(s&: key_storage_);
731 key_ = StringPiece(key_storage_);
732 } else {
733 key_ = parsed_;
734 }
735 parsed_ = StringPiece();
736 }
737 } else if (type == BEGIN_KEY) {
738 // Key is a bare key (back compat), create a StringPiece pointing to it.
739 result = ParseKey();
740 } else if (type == BEGIN_NULL || type == BEGIN_TRUE || type == BEGIN_FALSE) {
741 // Key may be a bare key that begins with a reserved word.
742 result = ParseKey();
743 if (result.ok() && (key_ == kKeywordNull || key_ == kKeywordTrue ||
744 key_ == kKeywordFalse)) {
745 result = ReportFailure(message: "Expected an object key or }.",
746 parse_code: ParseErrorType::EXPECTED_OBJECT_KEY_OR_BRACES);
747 }
748 } else {
749 // Unknown key type, report an error.
750 result = ReportFailure(message: "Expected an object key or }.",
751 parse_code: ParseErrorType::EXPECTED_OBJECT_KEY_OR_BRACES);
752 }
753 // On success we next expect an entry mid ':' then an object mid ',' or '}'
754 if (result.ok()) {
755 stack_.push(x: OBJ_MID);
756 stack_.push(x: ENTRY_MID);
757 }
758 return result;
759}
760
761util::Status JsonStreamParser::ParseEntryMid(TokenType type) {
762 if (type == UNKNOWN) {
763 return ReportUnknown(message: "Expected : between key:value pair.",
764 parse_code: ParseErrorType::EXPECTED_COLON);
765 }
766 if (type == ENTRY_SEPARATOR) {
767 Advance();
768 stack_.push(x: VALUE);
769 return util::Status();
770 }
771 return ReportFailure(message: "Expected : between key:value pair.",
772 parse_code: ParseErrorType::EXPECTED_COLON);
773}
774
775util::Status JsonStreamParser::HandleBeginArray() {
776 GOOGLE_DCHECK_EQ('[', *p_.data());
777 Advance();
778 ow_->StartList(name: key_);
779 key_ = StringPiece();
780 stack_.push(x: ARRAY_VALUE);
781 return util::Status();
782}
783
784util::Status JsonStreamParser::ParseArrayValue(TokenType type) {
785 if (type == UNKNOWN) {
786 return ReportUnknown(message: "Expected a value or ] within an array.",
787 parse_code: ParseErrorType::EXPECTED_VALUE_OR_BRACKET);
788 }
789
790 if (type == END_ARRAY) {
791 ow_->EndList();
792 Advance();
793 return util::Status();
794 }
795
796 // The ParseValue call may push something onto the stack so we need to make
797 // sure an ARRAY_MID is after it, so we push it on now. Also, the parsing of
798 // empty-null array value is relying on this ARRAY_MID token.
799 stack_.push(x: ARRAY_MID);
800 util::Status result = ParseValue(type);
801 if (util::IsCancelled(status: result)) {
802 // If we were cancelled, pop back off the ARRAY_MID so we don't try to
803 // push it on again when we try over.
804 stack_.pop();
805 }
806 return result;
807}
808
809util::Status JsonStreamParser::ParseArrayMid(TokenType type) {
810 if (type == UNKNOWN) {
811 return ReportUnknown(message: "Expected , or ] after array value.",
812 parse_code: ParseErrorType::EXPECTED_COMMA_OR_BRACKET);
813 }
814
815 if (type == END_ARRAY) {
816 ow_->EndList();
817 Advance();
818 return util::Status();
819 }
820
821 // Found a comma, advance past it and expect an array value next.
822 if (type == VALUE_SEPARATOR) {
823 Advance();
824 stack_.push(x: ARRAY_VALUE);
825 return util::Status();
826 }
827 // Illegal token after array value.
828 return ReportFailure(message: "Expected , or ] after array value.",
829 parse_code: ParseErrorType::EXPECTED_COMMA_OR_BRACKET);
830}
831
832util::Status JsonStreamParser::ParseTrue() {
833 ow_->RenderBool(name: key_, value: true);
834 key_ = StringPiece();
835 p_.remove_prefix(n: kKeywordTrue.length());
836 return util::Status();
837}
838
839util::Status JsonStreamParser::ParseFalse() {
840 ow_->RenderBool(name: key_, value: false);
841 key_ = StringPiece();
842 p_.remove_prefix(n: kKeywordFalse.length());
843 return util::Status();
844}
845
846util::Status JsonStreamParser::ParseNull() {
847 ow_->RenderNull(name: key_);
848 key_ = StringPiece();
849 p_.remove_prefix(n: kKeywordNull.length());
850 return util::Status();
851}
852
853util::Status JsonStreamParser::ParseEmptyNull() {
854 ow_->RenderNull(name: key_);
855 key_ = StringPiece();
856 return util::Status();
857}
858
859bool JsonStreamParser::IsEmptyNullAllowed(TokenType type) {
860 if (stack_.empty()) return false;
861 return (stack_.top() == ARRAY_MID && type == VALUE_SEPARATOR) ||
862 stack_.top() == OBJ_MID;
863}
864
865util::Status JsonStreamParser::ReportFailure(StringPiece message,
866 ParseErrorType parse_code) {
867 (void)parse_code; // Parameter is used in Google-internal code.
868 static const int kContextLength = 20;
869 const char* p_start = p_.data();
870 const char* json_start = json_.data();
871 const char* begin = std::max(p_start - kContextLength, json_start);
872 const char* end =
873 std::min(p_start + kContextLength, json_start + json_.size());
874 StringPiece segment(begin, end - begin);
875 std::string location(p_start - begin, ' ');
876 location.push_back(c: '^');
877 auto status = util::InvalidArgumentError(
878 message: StrCat(a: message, b: "\n", c: segment, d: "\n", e: location));
879 return status;
880}
881
882util::Status JsonStreamParser::ReportUnknown(StringPiece message,
883 ParseErrorType parse_code) {
884 // If we aren't finishing the parse, cancel parsing and try later.
885 if (!finishing_) {
886 return util::CancelledError(message: "");
887 }
888 if (p_.empty()) {
889 return ReportFailure(message: StrCat(a: "Unexpected end of string. ", b: message),
890 parse_code);
891 }
892 return ReportFailure(message, parse_code);
893}
894
895util::Status JsonStreamParser::IncrementRecursionDepth(
896 StringPiece key) const {
897 if (++recursion_depth_ > max_recursion_depth_) {
898 return util::InvalidArgumentError(message: StrCat(
899 a: "Message too deep. Max recursion depth reached for key '", b: key, c: "'"));
900 }
901 return util::Status();
902}
903
904void JsonStreamParser::SkipWhitespace() {
905 while (!p_.empty() && ascii_isspace(c: *p_.data())) {
906 Advance();
907 }
908 if (!p_.empty() && !ascii_isspace(c: *p_.data())) {
909 seen_non_whitespace_ = true;
910 }
911}
912
913void JsonStreamParser::Advance() {
914 // Advance by moving one UTF8 character while making sure we don't go beyond
915 // the length of StringPiece.
916 p_.remove_prefix(n: std::min<int>(
917 p_.length(), UTF8FirstLetterNumBytes(src: p_.data(), len: p_.length())));
918}
919
920util::Status JsonStreamParser::ParseKey() {
921 StringPiece original = p_;
922
923 if (allow_permissive_key_naming_) {
924 if (!ConsumeKeyPermissive(input: &p_, key: &key_)) {
925 return ReportFailure(message: "Invalid key or variable name.",
926 parse_code: ParseErrorType::INVALID_KEY_OR_VARIABLE_NAME);
927 }
928 } else {
929 if (!ConsumeKey(input: &p_, key: &key_)) {
930 return ReportFailure(message: "Invalid key or variable name.",
931 parse_code: ParseErrorType::INVALID_KEY_OR_VARIABLE_NAME);
932 }
933 }
934
935 // If we consumed everything but expect more data, reset p_ and cancel since
936 // we can't know if the key was complete or not.
937 if (!finishing_ && p_.empty()) {
938 p_ = original;
939 return util::CancelledError(message: "");
940 }
941 // Since we aren't using the key storage, clear it out.
942 key_storage_.clear();
943 return util::Status();
944}
945
946JsonStreamParser::TokenType JsonStreamParser::GetNextTokenType() {
947 SkipWhitespace();
948
949 int size = p_.size();
950 if (size == 0) {
951 // If we ran out of data, report unknown and we'll place the previous parse
952 // type onto the stack and try again when we have more data.
953 return UNKNOWN;
954 }
955 // TODO(sven): Split this method based on context since different contexts
956 // support different tokens. Would slightly speed up processing?
957 const char* data = p_.data();
958 StringPiece data_view = StringPiece(data, size);
959 if (*data == '\"' || *data == '\'') return BEGIN_STRING;
960 if (*data == '-' || ('0' <= *data && *data <= '9')) {
961 return BEGIN_NUMBER;
962 }
963 if (size >= kKeywordTrue.length() &&
964 HasPrefixString(str: data_view, prefix: kKeywordTrue)) {
965 return BEGIN_TRUE;
966 }
967 if (size >= kKeywordFalse.length() &&
968 HasPrefixString(str: data_view, prefix: kKeywordFalse)) {
969 return BEGIN_FALSE;
970 }
971 if (size >= kKeywordNull.length() &&
972 HasPrefixString(str: data_view, prefix: kKeywordNull)) {
973 return BEGIN_NULL;
974 }
975 if (*data == '{') return BEGIN_OBJECT;
976 if (*data == '}') return END_OBJECT;
977 if (*data == '[') return BEGIN_ARRAY;
978 if (*data == ']') return END_ARRAY;
979 if (*data == ':') return ENTRY_SEPARATOR;
980 if (*data == ',') return VALUE_SEPARATOR;
981 if (MatchKey(input: p_)) {
982 return BEGIN_KEY;
983 }
984
985 // We don't know that we necessarily have an invalid token here, just that we
986 // can't parse what we have so far. So we don't report an error and just
987 // return UNKNOWN so we can try again later when we have more data, or if we
988 // finish and we have leftovers.
989 return UNKNOWN;
990}
991
992} // namespace converter
993} // namespace util
994} // namespace protobuf
995} // namespace google
996