1// Protocol Buffers - Google's data interchange format
2// Copyright 2008 Google Inc. All rights reserved.
3// https://developers.google.com/protocol-buffers/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are
7// met:
8//
9// * Redistributions of source code must retain the above copyright
10// notice, this list of conditions and the following disclaimer.
11// * Redistributions in binary form must reproduce the above
12// copyright notice, this list of conditions and the following disclaimer
13// in the documentation and/or other materials provided with the
14// distribution.
15// * Neither the name of Google Inc. nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31// Author: kenton@google.com (Kenton Varda)
32// Based on original Protocol Buffers design by
33// Sanjay Ghemawat, Jeff Dean, and others.
34//
35// Here we have a hand-written lexer. At first you might ask yourself,
36// "Hand-written text processing? Is Kenton crazy?!" Well, first of all,
37// yes I am crazy, but that's beside the point. There are actually reasons
38// why I ended up writing this this way.
39//
40// The traditional approach to lexing is to use lex to generate a lexer for
41// you. Unfortunately, lex's output is ridiculously ugly and difficult to
42// integrate cleanly with C++ code, especially abstract code or code meant
43// as a library. Better parser-generators exist but would add dependencies
44// which most users won't already have, which we'd like to avoid. (GNU flex
45// has a C++ output option, but it's still ridiculously ugly, non-abstract,
46// and not library-friendly.)
47//
48// The next approach that any good software engineer should look at is to
49// use regular expressions. And, indeed, I did. I have code which
50// implements this same class using regular expressions. It's about 200
51// lines shorter. However:
52// - Rather than error messages telling you "This string has an invalid
53// escape sequence at line 5, column 45", you get error messages like
54// "Parse error on line 5". Giving more precise errors requires adding
55// a lot of code that ends up basically as complex as the hand-coded
56// version anyway.
57// - The regular expression to match a string literal looks like this:
58// kString = new RE("(\"([^\"\\\\]|" // non-escaped
59// "\\\\[abfnrtv?\"'\\\\0-7]|" // normal escape
60// "\\\\x[0-9a-fA-F])*\"|" // hex escape
61// "\'([^\'\\\\]|" // Also support single-quotes.
62// "\\\\[abfnrtv?\"'\\\\0-7]|"
63// "\\\\x[0-9a-fA-F])*\')");
64// Verifying the correctness of this line noise is actually harder than
65// verifying the correctness of ConsumeString(), defined below. I'm not
66// even confident that the above is correct, after staring at it for some
67// time.
68// - PCRE is fast, but there's still more overhead involved than the code
69// below.
70// - Sadly, regular expressions are not part of the C standard library, so
71// using them would require depending on some other library. For the
72// open source release, this could be really annoying. Nobody likes
73// downloading one piece of software just to find that they need to
74// download something else to make it work, and in all likelihood
75// people downloading Protocol Buffers will already be doing so just
76// to make something else work. We could include a copy of PCRE with
77// our code, but that obligates us to keep it up-to-date and just seems
78// like a big waste just to save 200 lines of code.
79//
80// On a similar but unrelated note, I'm even scared to use ctype.h.
81// Apparently functions like isalpha() are locale-dependent. So, if we used
82// that, then if this code is being called from some program that doesn't
83// have its locale set to "C", it would behave strangely. We can't just set
84// the locale to "C" ourselves since we might break the calling program that
85// way, particularly if it is multi-threaded. WTF? Someone please let me
86// (Kenton) know if I'm missing something here...
87//
88// I'd love to hear about other alternatives, though, as this code isn't
89// exactly pretty.
90
91#include <google/protobuf/io/tokenizer.h>
92
93#include <google/protobuf/stubs/common.h>
94#include <google/protobuf/stubs/logging.h>
95#include <google/protobuf/stubs/strutil.h>
96#include <google/protobuf/stubs/stringprintf.h>
97#include <google/protobuf/io/strtod.h>
98#include <google/protobuf/io/zero_copy_stream.h>
99#include <google/protobuf/stubs/stl_util.h>
100
101// Must be included last.
102#include <google/protobuf/port_def.inc>
103
104namespace google {
105namespace protobuf {
106namespace io {
107namespace {
108
109// As mentioned above, I don't trust ctype.h due to the presence of "locales".
110// So, I have written replacement functions here. Someone please smack me if
111// this is a bad idea or if there is some way around this.
112//
113// These "character classes" are designed to be used in template methods.
114// For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
115// whitespace.
116
117// Note: No class is allowed to contain '\0', since this is used to mark end-
118// of-input and is handled specially.
119
// Declares a stateless predicate type NAME. NAME::InClass(c) evaluates
// EXPRESSION with the candidate character bound to `c`. The resulting types
// are meant to be passed as template arguments to the Tokenizer helpers.
#define CHARACTER_CLASS(NAME, EXPRESSION)                     \
  struct NAME {                                               \
    static inline bool InClass(char c) { return EXPRESSION; } \
  }

// Horizontal blanks plus carriage return, vertical tab and form feed --
// everything considered whitespace except the line feed.
CHARACTER_CLASS(WhitespaceNoNewline,
                c == ' ' || c == '\t' || c == '\r' || c == '\v' || c == '\f');

// All whitespace, line feed included.
CHARACTER_CLASS(Whitespace, c == '\n' || WhitespaceNoNewline::InClass(c));

// Control characters strictly between NUL and space. NUL itself is excluded
// because it doubles as the end-of-input marker.
CHARACTER_CLASS(Unprintable, '\0' < c && c < ' ');

CHARACTER_CLASS(Digit, c >= '0' && c <= '9');
CHARACTER_CLASS(OctalDigit, c >= '0' && c <= '7');
CHARACTER_CLASS(HexDigit, Digit::InClass(c) || (c >= 'a' && c <= 'f') ||
                              (c >= 'A' && c <= 'F'));

// Characters that may start an identifier: ASCII letters and underscore.
CHARACTER_CLASS(Letter,
                (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_');

// Characters that may continue an identifier.
CHARACTER_CLASS(Alphanumeric, Letter::InClass(c) || Digit::InClass(c));

// Characters allowed directly after a backslash in a simple (single
// character) escape sequence.
CHARACTER_CLASS(Escape, c == 'a' || c == 'b' || c == 'f' || c == 'n' ||
                            c == 'r' || c == 't' || c == 'v' || c == '\\' ||
                            c == '?' || c == '\'' || c == '\"');

#undef CHARACTER_CLASS
150
// Lookup table giving, for every possible byte value, the number that byte
// stands for when read as a digit: '0'-'9' map to 0-9 and ASCII letters of
// either case map to 10-35. Every other byte maps to 36; since bases above
// 36 are not supported, 36 can never be a valid digit and so marks "not a
// digit". Rows are laid out 16 bytes at a time.
static const int8_t kAsciiToInt[256] = {
    // 0x00-0x0F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
    // 0x10-0x1F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
    // 0x20-0x2F: ' ' through '/'
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
    // 0x30-0x3F: '0'-'9', then ':' through '?'
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 36, 36, 36, 36, 36, 36,
    // 0x40-0x4F: '@', then 'A'-'O'
    36, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
    // 0x50-0x5F: 'P'-'Z', then '[' through '_'
    25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 36, 36, 36, 36,
    // 0x60-0x6F: '`', then 'a'-'o'
    36, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
    // 0x70-0x7F: 'p'-'z', then '{' through DEL
    25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 36, 36, 36, 36,
    // 0x80-0x8F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
    // 0x90-0x9F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
    // 0xA0-0xAF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
    // 0xB0-0xBF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
    // 0xC0-0xCF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
    // 0xD0-0xDF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
    // 0xE0-0xEF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
    // 0xF0-0xFF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
};

// Interprets `digit` as a numeric digit in any base up to 36 and returns its
// value, or 36 when the character is not a digit in any supported base.
inline int DigitValue(char digit) {
  // Index by the unsigned byte value so that chars with the high bit set
  // cannot produce a negative index.
  return kAsciiToInt[static_cast<unsigned char>(digit)];
}
179
// Maps the character that follows a backslash in a simple escape sequence to
// the character the sequence denotes. Callers are expected to have validated
// the escape already; an unrecognized character yields '?'.
// Inline because it's only used in one place.
inline char TranslateEscape(char c) {
  char translated = '?';  // Result for anything that is not a known escape.
  switch (c) {
    case 'a':  translated = '\a'; break;
    case 'b':  translated = '\b'; break;
    case 'f':  translated = '\f'; break;
    case 'n':  translated = '\n'; break;
    case 'r':  translated = '\r'; break;
    case 't':  translated = '\t'; break;
    case 'v':  translated = '\v'; break;
    case '\\': translated = '\\'; break;
    case '?':  translated = '\?'; break;  // Exists because of trigraphs.
    case '\'': translated = '\''; break;
    case '"':  translated = '\"'; break;
    default:   break;
  }
  return translated;
}
211
212} // anonymous namespace
213
214ErrorCollector::~ErrorCollector() {}
215
216// ===================================================================
217
218Tokenizer::Tokenizer(ZeroCopyInputStream* input,
219 ErrorCollector* error_collector)
220 : input_(input),
221 error_collector_(error_collector),
222 buffer_(NULL),
223 buffer_size_(0),
224 buffer_pos_(0),
225 read_error_(false),
226 line_(0),
227 column_(0),
228 record_target_(NULL),
229 record_start_(-1),
230 allow_f_after_float_(false),
231 comment_style_(CPP_COMMENT_STYLE),
232 require_space_after_number_(true),
233 allow_multiline_strings_(false) {
234 current_.line = 0;
235 current_.column = 0;
236 current_.end_column = 0;
237 current_.type = TYPE_START;
238
239 Refresh();
240}
241
242Tokenizer::~Tokenizer() {
243 // If we had any buffer left unread, return it to the underlying stream
244 // so that someone else can read it.
245 if (buffer_size_ > buffer_pos_) {
246 input_->BackUp(count: buffer_size_ - buffer_pos_);
247 }
248}
249
250bool Tokenizer::report_whitespace() const { return report_whitespace_; }
251// Note: `set_report_whitespace(false)` implies `set_report_newlines(false)`.
252void Tokenizer::set_report_whitespace(bool report) {
253 report_whitespace_ = report;
254 report_newlines_ &= report;
255}
256
257// If true, newline tokens are reported by Next().
258bool Tokenizer::report_newlines() const { return report_newlines_; }
259// Note: `set_report_newlines(true)` implies `set_report_whitespace(true)`.
260void Tokenizer::set_report_newlines(bool report) {
261 report_newlines_ = report;
262 report_whitespace_ |= report; // enable report_whitespace if necessary
263}
264
265// -------------------------------------------------------------------
266// Internal helpers.
267
268void Tokenizer::NextChar() {
269 // Update our line and column counters based on the character being
270 // consumed.
271 if (current_char_ == '\n') {
272 ++line_;
273 column_ = 0;
274 } else if (current_char_ == '\t') {
275 column_ += kTabWidth - column_ % kTabWidth;
276 } else {
277 ++column_;
278 }
279
280 // Advance to the next character.
281 ++buffer_pos_;
282 if (buffer_pos_ < buffer_size_) {
283 current_char_ = buffer_[buffer_pos_];
284 } else {
285 Refresh();
286 }
287}
288
289void Tokenizer::Refresh() {
290 if (read_error_) {
291 current_char_ = '\0';
292 return;
293 }
294
295 // If we're in a token, append the rest of the buffer to it.
296 if (record_target_ != NULL && record_start_ < buffer_size_) {
297 record_target_->append(s: buffer_ + record_start_,
298 n: buffer_size_ - record_start_);
299 record_start_ = 0;
300 }
301
302 const void* data = NULL;
303 buffer_ = NULL;
304 buffer_pos_ = 0;
305 do {
306 if (!input_->Next(data: &data, size: &buffer_size_)) {
307 // end of stream (or read error)
308 buffer_size_ = 0;
309 read_error_ = true;
310 current_char_ = '\0';
311 return;
312 }
313 } while (buffer_size_ == 0);
314
315 buffer_ = static_cast<const char*>(data);
316
317 current_char_ = buffer_[0];
318}
319
320inline void Tokenizer::RecordTo(std::string* target) {
321 record_target_ = target;
322 record_start_ = buffer_pos_;
323}
324
325inline void Tokenizer::StopRecording() {
326 // Note: The if() is necessary because some STL implementations crash when
327 // you call string::append(NULL, 0), presumably because they are trying to
328 // be helpful by detecting the NULL pointer, even though there's nothing
329 // wrong with reading zero bytes from NULL.
330 if (buffer_pos_ != record_start_) {
331 record_target_->append(s: buffer_ + record_start_,
332 n: buffer_pos_ - record_start_);
333 }
334 record_target_ = NULL;
335 record_start_ = -1;
336}
337
338inline void Tokenizer::StartToken() {
339 current_.type = TYPE_START; // Just for the sake of initializing it.
340 current_.text.clear();
341 current_.line = line_;
342 current_.column = column_;
343 RecordTo(target: &current_.text);
344}
345
346inline void Tokenizer::EndToken() {
347 StopRecording();
348 current_.end_column = column_;
349}
350
351// -------------------------------------------------------------------
352// Helper methods that consume characters.
353
354template <typename CharacterClass>
355inline bool Tokenizer::LookingAt() {
356 return CharacterClass::InClass(current_char_);
357}
358
359template <typename CharacterClass>
360inline bool Tokenizer::TryConsumeOne() {
361 if (CharacterClass::InClass(current_char_)) {
362 NextChar();
363 return true;
364 } else {
365 return false;
366 }
367}
368
369inline bool Tokenizer::TryConsume(char c) {
370 if (current_char_ == c) {
371 NextChar();
372 return true;
373 } else {
374 return false;
375 }
376}
377
378template <typename CharacterClass>
379inline void Tokenizer::ConsumeZeroOrMore() {
380 while (CharacterClass::InClass(current_char_)) {
381 NextChar();
382 }
383}
384
385template <typename CharacterClass>
386inline void Tokenizer::ConsumeOneOrMore(const char* error) {
387 if (!CharacterClass::InClass(current_char_)) {
388 AddError(message: error);
389 } else {
390 do {
391 NextChar();
392 } while (CharacterClass::InClass(current_char_));
393 }
394}
395
396// -------------------------------------------------------------------
397// Methods that read whole patterns matching certain kinds of tokens
398// or comments.
399
400void Tokenizer::ConsumeString(char delimiter) {
401 while (true) {
402 switch (current_char_) {
403 case '\0':
404 AddError(message: "Unexpected end of string.");
405 return;
406
407 case '\n': {
408 if (!allow_multiline_strings_) {
409 AddError(message: "String literals cannot cross line boundaries.");
410 return;
411 }
412 NextChar();
413 break;
414 }
415
416 case '\\': {
417 // An escape sequence.
418 NextChar();
419 if (TryConsumeOne<Escape>()) {
420 // Valid escape sequence.
421 } else if (TryConsumeOne<OctalDigit>()) {
422 // Possibly followed by two more octal digits, but these will
423 // just be consumed by the main loop anyway so we don't need
424 // to do so explicitly here.
425 } else if (TryConsume(c: 'x')) {
426 if (!TryConsumeOne<HexDigit>()) {
427 AddError(message: "Expected hex digits for escape sequence.");
428 }
429 // Possibly followed by another hex digit, but again we don't care.
430 } else if (TryConsume(c: 'u')) {
431 if (!TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
432 !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>()) {
433 AddError(message: "Expected four hex digits for \\u escape sequence.");
434 }
435 } else if (TryConsume(c: 'U')) {
436 // We expect 8 hex digits; but only the range up to 0x10ffff is
437 // legal.
438 if (!TryConsume(c: '0') || !TryConsume(c: '0') ||
439 !(TryConsume(c: '0') || TryConsume(c: '1')) ||
440 !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
441 !TryConsumeOne<HexDigit>() || !TryConsumeOne<HexDigit>() ||
442 !TryConsumeOne<HexDigit>()) {
443 AddError(
444 message: "Expected eight hex digits up to 10ffff for \\U escape "
445 "sequence");
446 }
447 } else {
448 AddError(message: "Invalid escape sequence in string literal.");
449 }
450 break;
451 }
452
453 default: {
454 if (current_char_ == delimiter) {
455 NextChar();
456 return;
457 }
458 NextChar();
459 break;
460 }
461 }
462 }
463}
464
465Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
466 bool started_with_dot) {
467 bool is_float = false;
468
469 if (started_with_zero && (TryConsume(c: 'x') || TryConsume(c: 'X'))) {
470 // A hex number (started with "0x").
471 ConsumeOneOrMore<HexDigit>(error: "\"0x\" must be followed by hex digits.");
472
473 } else if (started_with_zero && LookingAt<Digit>()) {
474 // An octal number (had a leading zero).
475 ConsumeZeroOrMore<OctalDigit>();
476 if (LookingAt<Digit>()) {
477 AddError(message: "Numbers starting with leading zero must be in octal.");
478 ConsumeZeroOrMore<Digit>();
479 }
480
481 } else {
482 // A decimal number.
483 if (started_with_dot) {
484 is_float = true;
485 ConsumeZeroOrMore<Digit>();
486 } else {
487 ConsumeZeroOrMore<Digit>();
488
489 if (TryConsume(c: '.')) {
490 is_float = true;
491 ConsumeZeroOrMore<Digit>();
492 }
493 }
494
495 if (TryConsume(c: 'e') || TryConsume(c: 'E')) {
496 is_float = true;
497 TryConsume(c: '-') || TryConsume(c: '+');
498 ConsumeOneOrMore<Digit>(error: "\"e\" must be followed by exponent.");
499 }
500
501 if (allow_f_after_float_ && (TryConsume(c: 'f') || TryConsume(c: 'F'))) {
502 is_float = true;
503 }
504 }
505
506 if (LookingAt<Letter>() && require_space_after_number_) {
507 AddError(message: "Need space between number and identifier.");
508 } else if (current_char_ == '.') {
509 if (is_float) {
510 AddError(
511 message: "Already saw decimal point or exponent; can't have another one.");
512 } else {
513 AddError(message: "Hex and octal numbers must be integers.");
514 }
515 }
516
517 return is_float ? TYPE_FLOAT : TYPE_INTEGER;
518}
519
520void Tokenizer::ConsumeLineComment(std::string* content) {
521 if (content != NULL) RecordTo(target: content);
522
523 while (current_char_ != '\0' && current_char_ != '\n') {
524 NextChar();
525 }
526 TryConsume(c: '\n');
527
528 if (content != NULL) StopRecording();
529}
530
531void Tokenizer::ConsumeBlockComment(std::string* content) {
532 int start_line = line_;
533 int start_column = column_ - 2;
534
535 if (content != NULL) RecordTo(target: content);
536
537 while (true) {
538 while (current_char_ != '\0' && current_char_ != '*' &&
539 current_char_ != '/' && current_char_ != '\n') {
540 NextChar();
541 }
542
543 if (TryConsume(c: '\n')) {
544 if (content != NULL) StopRecording();
545
546 // Consume leading whitespace and asterisk;
547 ConsumeZeroOrMore<WhitespaceNoNewline>();
548 if (TryConsume(c: '*')) {
549 if (TryConsume(c: '/')) {
550 // End of comment.
551 break;
552 }
553 }
554
555 if (content != NULL) RecordTo(target: content);
556 } else if (TryConsume(c: '*') && TryConsume(c: '/')) {
557 // End of comment.
558 if (content != NULL) {
559 StopRecording();
560 // Strip trailing "*/".
561 content->erase(pos: content->size() - 2);
562 }
563 break;
564 } else if (TryConsume(c: '/') && current_char_ == '*') {
565 // Note: We didn't consume the '*' because if there is a '/' after it
566 // we want to interpret that as the end of the comment.
567 AddError(
568 message: "\"/*\" inside block comment. Block comments cannot be nested.");
569 } else if (current_char_ == '\0') {
570 AddError(message: "End-of-file inside block comment.");
571 error_collector_->AddError(line: start_line, column: start_column,
572 message: " Comment started here.");
573 if (content != NULL) StopRecording();
574 break;
575 }
576 }
577}
578
579Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
580 if (comment_style_ == CPP_COMMENT_STYLE && TryConsume(c: '/')) {
581 if (TryConsume(c: '/')) {
582 return LINE_COMMENT;
583 } else if (TryConsume(c: '*')) {
584 return BLOCK_COMMENT;
585 } else {
586 // Oops, it was just a slash. Return it.
587 current_.type = TYPE_SYMBOL;
588 current_.text = "/";
589 current_.line = line_;
590 current_.column = column_ - 1;
591 current_.end_column = column_;
592 return SLASH_NOT_COMMENT;
593 }
594 } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume(c: '#')) {
595 return LINE_COMMENT;
596 } else {
597 return NO_COMMENT;
598 }
599}
600
601bool Tokenizer::TryConsumeWhitespace() {
602 if (report_newlines_) {
603 if (TryConsumeOne<WhitespaceNoNewline>()) {
604 ConsumeZeroOrMore<WhitespaceNoNewline>();
605 current_.type = TYPE_WHITESPACE;
606 return true;
607 }
608 return false;
609 }
610 if (TryConsumeOne<Whitespace>()) {
611 ConsumeZeroOrMore<Whitespace>();
612 current_.type = TYPE_WHITESPACE;
613 return report_whitespace_;
614 }
615 return false;
616}
617
618bool Tokenizer::TryConsumeNewline() {
619 if (!report_whitespace_ || !report_newlines_) {
620 return false;
621 }
622 if (TryConsume(c: '\n')) {
623 current_.type = TYPE_NEWLINE;
624 return true;
625 }
626 return false;
627}
628
629// -------------------------------------------------------------------
630
631bool Tokenizer::Next() {
632 previous_ = current_;
633
634 while (!read_error_) {
635 StartToken();
636 bool report_token = TryConsumeWhitespace() || TryConsumeNewline();
637 EndToken();
638 if (report_token) {
639 return true;
640 }
641
642 switch (TryConsumeCommentStart()) {
643 case LINE_COMMENT:
644 ConsumeLineComment(NULL);
645 continue;
646 case BLOCK_COMMENT:
647 ConsumeBlockComment(NULL);
648 continue;
649 case SLASH_NOT_COMMENT:
650 return true;
651 case NO_COMMENT:
652 break;
653 }
654
655 // Check for EOF before continuing.
656 if (read_error_) break;
657
658 if (LookingAt<Unprintable>() || current_char_ == '\0') {
659 AddError(message: "Invalid control characters encountered in text.");
660 NextChar();
661 // Skip more unprintable characters, too. But, remember that '\0' is
662 // also what current_char_ is set to after EOF / read error. We have
663 // to be careful not to go into an infinite loop of trying to consume
664 // it, so make sure to check read_error_ explicitly before consuming
665 // '\0'.
666 while (TryConsumeOne<Unprintable>() ||
667 (!read_error_ && TryConsume(c: '\0'))) {
668 // Ignore.
669 }
670
671 } else {
672 // Reading some sort of token.
673 StartToken();
674
675 if (TryConsumeOne<Letter>()) {
676 ConsumeZeroOrMore<Alphanumeric>();
677 current_.type = TYPE_IDENTIFIER;
678 } else if (TryConsume(c: '0')) {
679 current_.type = ConsumeNumber(started_with_zero: true, started_with_dot: false);
680 } else if (TryConsume(c: '.')) {
681 // This could be the beginning of a floating-point number, or it could
682 // just be a '.' symbol.
683
684 if (TryConsumeOne<Digit>()) {
685 // It's a floating-point number.
686 if (previous_.type == TYPE_IDENTIFIER &&
687 current_.line == previous_.line &&
688 current_.column == previous_.end_column) {
689 // We don't accept syntax like "blah.123".
690 error_collector_->AddError(
691 line: line_, column: column_ - 2,
692 message: "Need space between identifier and decimal point.");
693 }
694 current_.type = ConsumeNumber(started_with_zero: false, started_with_dot: true);
695 } else {
696 current_.type = TYPE_SYMBOL;
697 }
698 } else if (TryConsumeOne<Digit>()) {
699 current_.type = ConsumeNumber(started_with_zero: false, started_with_dot: false);
700 } else if (TryConsume(c: '\"')) {
701 ConsumeString(delimiter: '\"');
702 current_.type = TYPE_STRING;
703 } else if (TryConsume(c: '\'')) {
704 ConsumeString(delimiter: '\'');
705 current_.type = TYPE_STRING;
706 } else {
707 // Check if the high order bit is set.
708 if (current_char_ & 0x80) {
709 error_collector_->AddError(
710 line: line_, column: column_,
711 message: StringPrintf(format: "Interpreting non ascii codepoint %d.",
712 static_cast<unsigned char>(current_char_)));
713 }
714 NextChar();
715 current_.type = TYPE_SYMBOL;
716 }
717
718 EndToken();
719 return true;
720 }
721 }
722
723 // EOF
724 current_.type = TYPE_END;
725 current_.text.clear();
726 current_.line = line_;
727 current_.column = column_;
728 current_.end_column = column_;
729 return false;
730}
731
732namespace {
733
734// Helper class for collecting comments and putting them in the right places.
735//
736// This basically just buffers the most recent comment until it can be decided
737// exactly where that comment should be placed. When Flush() is called, the
738// current comment goes into either prev_trailing_comments or detached_comments.
739// When the CommentCollector is destroyed, the last buffered comment goes into
740// next_leading_comments.
// Buffers the most recently read comment until its destination is known.
//
// Flush() moves the buffered comment either to the previous token's trailing
// comments (while attaching to the previous token is still possible) or to
// the list of detached comments. Whatever remains buffered when the collector
// is destroyed becomes the next token's leading comment. Any of the three
// output pointers may be null, in which case that output is discarded.
class CommentCollector {
 public:
  // Clears every non-null output and starts with an empty buffer that may
  // still attach to the previous token.
  CommentCollector(std::string* prev_trailing_comments,
                   std::vector<std::string>* detached_comments,
                   std::string* next_leading_comments)
      : prev_trailing_comments_(prev_trailing_comments),
        detached_comments_(detached_comments),
        next_leading_comments_(next_leading_comments),
        has_comment_(false),
        is_line_comment_(false),
        can_attach_to_prev_(true) {
    if (prev_trailing_comments_ != nullptr) prev_trailing_comments_->clear();
    if (detached_comments_ != nullptr) detached_comments_->clear();
    if (next_leading_comments_ != nullptr) next_leading_comments_->clear();
  }

  // A comment still buffered at this point leads the next token.
  ~CommentCollector() {
    if (has_comment_ && next_leading_comments_ != nullptr) {
      comment_buffer_.swap(*next_leading_comments_);
    }
  }

  // Call right before reading a line comment; returns the buffer to read it
  // into. Consecutive line comments accumulate in the same buffer, whereas a
  // buffered block comment is flushed first.
  std::string* GetBufferForLineComment() {
    const bool buffered_block_comment = has_comment_ && !is_line_comment_;
    if (buffered_block_comment) Flush();
    is_line_comment_ = true;
    has_comment_ = true;
    return &comment_buffer_;
  }

  // Call right before reading a block comment; returns the buffer to read it
  // into. A block comment never merges with whatever was buffered before.
  std::string* GetBufferForBlockComment() {
    if (has_comment_) Flush();
    is_line_comment_ = false;
    has_comment_ = true;
    return &comment_buffer_;
  }

  // Discards the buffered comment.
  void ClearBuffer() {
    has_comment_ = false;
    comment_buffer_.clear();
  }

  // Call once the buffered comment is known to be complete and *not*
  // connected to the next token. Does nothing when no comment is buffered.
  void Flush() {
    if (!has_comment_) return;

    if (can_attach_to_prev_) {
      if (prev_trailing_comments_ != nullptr) {
        prev_trailing_comments_->append(comment_buffer_);
      }
      // Only the first flushed comment can trail the previous token.
      can_attach_to_prev_ = false;
    } else if (detached_comments_ != nullptr) {
      detached_comments_->push_back(comment_buffer_);
    }
    ClearBuffer();
  }

  // From now on, flushed comments are detached rather than trailing.
  void DetachFromPrev() { can_attach_to_prev_ = false; }

 private:
  // Output destinations; each may be null.
  std::string* prev_trailing_comments_;
  std::vector<std::string>* detached_comments_;
  std::string* next_leading_comments_;

  // Text of the comment(s) currently being held.
  std::string comment_buffer_;

  // True if any comment was read into comment_buffer_. This can be true even
  // when comment_buffer_ is empty, namely for the comment "/**/".
  bool has_comment_;

  // Whether the buffered comment is a line comment.
  bool is_line_comment_;

  // Whether the buffered comment could still belong to the previous token.
  bool can_attach_to_prev_;
};
830
831} // namespace
832
833bool Tokenizer::NextWithComments(std::string* prev_trailing_comments,
834 std::vector<std::string>* detached_comments,
835 std::string* next_leading_comments) {
836 CommentCollector collector(prev_trailing_comments, detached_comments,
837 next_leading_comments);
838
839 if (current_.type == TYPE_START) {
840 // Ignore unicode byte order mark(BOM) if it appears at the file
841 // beginning. Only UTF-8 BOM (0xEF 0xBB 0xBF) is accepted.
842 if (TryConsume(c: static_cast<char>(0xEF))) {
843 if (!TryConsume(c: static_cast<char>(0xBB)) ||
844 !TryConsume(c: static_cast<char>(0xBF))) {
845 AddError(
846 message: "Proto file starts with 0xEF but not UTF-8 BOM. "
847 "Only UTF-8 is accepted for proto file.");
848 return false;
849 }
850 }
851 collector.DetachFromPrev();
852 } else {
853 // A comment appearing on the same line must be attached to the previous
854 // declaration.
855 ConsumeZeroOrMore<WhitespaceNoNewline>();
856 switch (TryConsumeCommentStart()) {
857 case LINE_COMMENT:
858 ConsumeLineComment(content: collector.GetBufferForLineComment());
859
860 // Don't allow comments on subsequent lines to be attached to a trailing
861 // comment.
862 collector.Flush();
863 break;
864 case BLOCK_COMMENT:
865 ConsumeBlockComment(content: collector.GetBufferForBlockComment());
866
867 ConsumeZeroOrMore<WhitespaceNoNewline>();
868 if (!TryConsume(c: '\n')) {
869 // Oops, the next token is on the same line. If we recorded a comment
870 // we really have no idea which token it should be attached to.
871 collector.ClearBuffer();
872 return Next();
873 }
874
875 // Don't allow comments on subsequent lines to be attached to a trailing
876 // comment.
877 collector.Flush();
878 break;
879 case SLASH_NOT_COMMENT:
880 return true;
881 case NO_COMMENT:
882 if (!TryConsume(c: '\n')) {
883 // The next token is on the same line. There are no comments.
884 return Next();
885 }
886 break;
887 }
888 }
889
890 // OK, we are now on the line *after* the previous token.
891 while (true) {
892 ConsumeZeroOrMore<WhitespaceNoNewline>();
893
894 switch (TryConsumeCommentStart()) {
895 case LINE_COMMENT:
896 ConsumeLineComment(content: collector.GetBufferForLineComment());
897 break;
898 case BLOCK_COMMENT:
899 ConsumeBlockComment(content: collector.GetBufferForBlockComment());
900
901 // Consume the rest of the line so that we don't interpret it as a
902 // blank line the next time around the loop.
903 ConsumeZeroOrMore<WhitespaceNoNewline>();
904 TryConsume(c: '\n');
905 break;
906 case SLASH_NOT_COMMENT:
907 return true;
908 case NO_COMMENT:
909 if (TryConsume(c: '\n')) {
910 // Completely blank line.
911 collector.Flush();
912 collector.DetachFromPrev();
913 } else {
914 bool result = Next();
915 if (!result || current_.text == "}" || current_.text == "]" ||
916 current_.text == ")") {
917 // It looks like we're at the end of a scope. In this case it
918 // makes no sense to attach a comment to the following token.
919 collector.Flush();
920 }
921 return result;
922 }
923 break;
924 }
925 }
926}
927
928// -------------------------------------------------------------------
929// Token-parsing helpers. Remember that these don't need to report
930// errors since any errors should already have been reported while
931// tokenizing. Also, these can assume that whatever text they
932// are given is text that the tokenizer actually parsed as a token
933// of the given type.
934
935bool Tokenizer::ParseInteger(const std::string& text, uint64_t max_value,
936 uint64_t* output) {
937 // We can't just use strtoull() because (a) it accepts negative numbers,
938 // (b) We want additional range checks, (c) it reports overflows via errno.
939
940#if 0
941 const char *str_begin = text.c_str();
942 if (*str_begin == '-') return false;
943 char *str_end = nullptr;
944 errno = 0;
945 *output = std::strtoull(str_begin, &str_end, 0);
946 return (errno == 0 && str_end && *str_end == '\0' && *output <= max_value);
947#endif
948
949 const char* ptr = text.c_str();
950 int base = 10;
951 uint64_t overflow_if_mul_base = (kuint64max / 10) + 1;
952 if (ptr[0] == '0') {
953 if (ptr[1] == 'x' || ptr[1] == 'X') {
954 // This is hex.
955 base = 16;
956 overflow_if_mul_base = (kuint64max / 16) + 1;
957 ptr += 2;
958 } else {
959 // This is octal.
960 base = 8;
961 overflow_if_mul_base = (kuint64max / 8) + 1;
962 }
963 }
964
965 uint64_t result = 0;
966 // For all the leading '0's, and also the first non-zero character, we
967 // don't need to multiply.
968 while (*ptr != '\0') {
969 int digit = DigitValue(digit: *ptr++);
970 if (digit >= base) {
971 // The token provided by Tokenizer is invalid. i.e., 099 is an invalid
972 // token, but Tokenizer still think it's integer.
973 return false;
974 }
975 if (digit != 0) {
976 result = digit;
977 break;
978 }
979 }
980 for (; *ptr != '\0'; ptr++) {
981 int digit = DigitValue(digit: *ptr);
982 if (digit < 0 || digit >= base) {
983 // The token provided by Tokenizer is invalid. i.e., 099 is an invalid
984 // token, but Tokenizer still think it's integer.
985 return false;
986 }
987 if (result >= overflow_if_mul_base) {
988 // We know the multiply we're about to do will overflow, so exit now.
989 return false;
990 }
991 // We know that result * base won't overflow, but adding digit might...
992 result = result * base + digit;
993 // C++ guarantees defined "wrap" semantics when unsigned integer
994 // operations overflow, making this a fast way to check if adding
995 // digit made result overflow, and thus, wrap around.
996 if (result < static_cast<uint64_t>(base)) return false;
997 }
998 if (result > max_value) return false;
999
1000 *output = result;
1001 return true;
1002}
1003
1004double Tokenizer::ParseFloat(const std::string& text) {
1005 const char* start = text.c_str();
1006 char* end;
1007 double result = NoLocaleStrtod(str: start, endptr: &end);
1008
1009 // "1e" is not a valid float, but if the tokenizer reads it, it will
1010 // report an error but still return it as a valid token. We need to
1011 // accept anything the tokenizer could possibly return, error or not.
1012 if (*end == 'e' || *end == 'E') {
1013 ++end;
1014 if (*end == '-' || *end == '+') ++end;
1015 }
1016
1017 // If the Tokenizer had allow_f_after_float_ enabled, the float may be
1018 // suffixed with the letter 'f'.
1019 if (*end == 'f' || *end == 'F') {
1020 ++end;
1021 }
1022
1023 GOOGLE_LOG_IF(DFATAL,
1024 static_cast<size_t>(end - start) != text.size() || *start == '-')
1025 << " Tokenizer::ParseFloat() passed text that could not have been"
1026 " tokenized as a float: "
1027 << CEscape(src: text);
1028 return result;
1029}
1030
1031// Helper to append a Unicode code point to a string as UTF8, without bringing
1032// in any external dependencies.
1033static void AppendUTF8(uint32_t code_point, std::string* output) {
1034 uint32_t tmp = 0;
1035 int len = 0;
1036 if (code_point <= 0x7f) {
1037 tmp = code_point;
1038 len = 1;
1039 } else if (code_point <= 0x07ff) {
1040 tmp = 0x0000c080 | ((code_point & 0x07c0) << 2) | (code_point & 0x003f);
1041 len = 2;
1042 } else if (code_point <= 0xffff) {
1043 tmp = 0x00e08080 | ((code_point & 0xf000) << 4) |
1044 ((code_point & 0x0fc0) << 2) | (code_point & 0x003f);
1045 len = 3;
1046 } else if (code_point <= 0x10ffff) {
1047 tmp = 0xf0808080 | ((code_point & 0x1c0000) << 6) |
1048 ((code_point & 0x03f000) << 4) | ((code_point & 0x000fc0) << 2) |
1049 (code_point & 0x003f);
1050 len = 4;
1051 } else {
1052 // Unicode code points end at 0x10FFFF, so this is out-of-range.
1053 // ConsumeString permits hex values up to 0x1FFFFF, and FetchUnicodePoint
1054 // doesn't perform a range check.
1055 StringAppendF(dst: output, format: "\\U%08x", code_point);
1056 return;
1057 }
1058 tmp = ghtonl(x: tmp);
1059 output->append(s: reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, n: len);
1060}
1061
1062// Try to read <len> hex digits from ptr, and stuff the numeric result into
1063// *result. Returns true if that many digits were successfully consumed.
1064static bool ReadHexDigits(const char* ptr, int len, uint32_t* result) {
1065 *result = 0;
1066 if (len == 0) return false;
1067 for (const char* end = ptr + len; ptr < end; ++ptr) {
1068 if (*ptr == '\0') return false;
1069 *result = (*result << 4) + DigitValue(digit: *ptr);
1070 }
1071 return true;
1072}
1073
// UTF-16 surrogate pair handling.  UTF-16 represents code points in the range
// 0x10000...0x10ffff as two 16-bit units: a head surrogate followed by a trail
// surrogate.  Both come from a reserved range of Unicode code points, so when
// such a pair shows up in an escape sequence it can be recognised and folded
// back into the single code point it stands for.
//
// Each range below is half-open: the "Min" value belongs to the range, the
// "Max" value is the first value past it.
static constexpr uint32_t kMinHeadSurrogate = 0xd800;
static constexpr uint32_t kMaxHeadSurrogate = 0xdc00;
static constexpr uint32_t kMinTrailSurrogate = 0xdc00;
static constexpr uint32_t kMaxTrailSurrogate = 0xe000;
1083
1084static inline bool IsHeadSurrogate(uint32_t code_point) {
1085 return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
1086}
1087
1088static inline bool IsTrailSurrogate(uint32_t code_point) {
1089 return (code_point >= kMinTrailSurrogate) &&
1090 (code_point < kMaxTrailSurrogate);
1091}
1092
1093// Combine a head and trail surrogate into a single Unicode code point.
1094static uint32_t AssembleUTF16(uint32_t head_surrogate,
1095 uint32_t trail_surrogate) {
1096 GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
1097 GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
1098 return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
1099 (trail_surrogate - kMinTrailSurrogate));
1100}
1101
// Maps the letter that introduces a Unicode escape to the number of hex
// digits expected after it: 4 for 'u', 8 for 'U', and 0 for any other
// character.
static inline int UnicodeLength(char key) {
  switch (key) {
    case 'u':
      return 4;
    case 'U':
      return 8;
    default:
      return 0;
  }
}
1108
1109// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
1110// to parse that sequence. On success, returns a pointer to the first char
1111// beyond that sequence, and fills in *code_point. On failure, returns ptr
1112// itself.
1113static const char* FetchUnicodePoint(const char* ptr, uint32_t* code_point) {
1114 const char* p = ptr;
1115 // Fetch the code point.
1116 const int len = UnicodeLength(key: *p++);
1117 if (!ReadHexDigits(ptr: p, len, result: code_point)) return ptr;
1118 p += len;
1119
1120 // Check if the code point we read is a "head surrogate." If so, then we
1121 // expect it to be immediately followed by another code point which is a valid
1122 // "trail surrogate," and together they form a UTF-16 pair which decodes into
1123 // a single Unicode point. Trail surrogates may only use \u, not \U.
1124 if (IsHeadSurrogate(code_point: *code_point) && *p == '\\' && *(p + 1) == 'u') {
1125 uint32_t trail_surrogate;
1126 if (ReadHexDigits(ptr: p + 2, len: 4, result: &trail_surrogate) &&
1127 IsTrailSurrogate(code_point: trail_surrogate)) {
1128 *code_point = AssembleUTF16(head_surrogate: *code_point, trail_surrogate);
1129 p += 6;
1130 }
1131 // If this failed, then we just emit the head surrogate as a code point.
1132 // It's bogus, but so is the string.
1133 }
1134
1135 return p;
1136}
1137
1138// The text string must begin and end with single or double quote
1139// characters.
1140void Tokenizer::ParseStringAppend(const std::string& text,
1141 std::string* output) {
1142 // Reminder: text[0] is always a quote character. (If text is
1143 // empty, it's invalid, so we'll just return).
1144 const size_t text_size = text.size();
1145 if (text_size == 0) {
1146 GOOGLE_LOG(DFATAL) << " Tokenizer::ParseStringAppend() passed text that could not"
1147 " have been tokenized as a string: "
1148 << CEscape(src: text);
1149 return;
1150 }
1151
1152 // Reserve room for new string. The branch is necessary because if
1153 // there is already space available the reserve() call might
1154 // downsize the output.
1155 const size_t new_len = text_size + output->size();
1156 if (new_len > output->capacity()) {
1157 output->reserve(res_arg: new_len);
1158 }
1159
1160 // Loop through the string copying characters to "output" and
1161 // interpreting escape sequences. Note that any invalid escape
1162 // sequences or other errors were already reported while tokenizing.
1163 // In this case we do not need to produce valid results.
1164 for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) {
1165 if (*ptr == '\\' && ptr[1] != '\0') {
1166 // An escape sequence.
1167 ++ptr;
1168
1169 if (OctalDigit::InClass(c: *ptr)) {
1170 // An octal escape. May one, two, or three digits.
1171 int code = DigitValue(digit: *ptr);
1172 if (OctalDigit::InClass(c: ptr[1])) {
1173 ++ptr;
1174 code = code * 8 + DigitValue(digit: *ptr);
1175 }
1176 if (OctalDigit::InClass(c: ptr[1])) {
1177 ++ptr;
1178 code = code * 8 + DigitValue(digit: *ptr);
1179 }
1180 output->push_back(c: static_cast<char>(code));
1181
1182 } else if (*ptr == 'x') {
1183 // A hex escape. May zero, one, or two digits. (The zero case
1184 // will have been caught as an error earlier.)
1185 int code = 0;
1186 if (HexDigit::InClass(c: ptr[1])) {
1187 ++ptr;
1188 code = DigitValue(digit: *ptr);
1189 }
1190 if (HexDigit::InClass(c: ptr[1])) {
1191 ++ptr;
1192 code = code * 16 + DigitValue(digit: *ptr);
1193 }
1194 output->push_back(c: static_cast<char>(code));
1195
1196 } else if (*ptr == 'u' || *ptr == 'U') {
1197 uint32_t unicode;
1198 const char* end = FetchUnicodePoint(ptr, code_point: &unicode);
1199 if (end == ptr) {
1200 // Failure: Just dump out what we saw, don't try to parse it.
1201 output->push_back(c: *ptr);
1202 } else {
1203 AppendUTF8(code_point: unicode, output);
1204 ptr = end - 1; // Because we're about to ++ptr.
1205 }
1206 } else {
1207 // Some other escape code.
1208 output->push_back(c: TranslateEscape(c: *ptr));
1209 }
1210
1211 } else if (*ptr == text[0] && ptr[1] == '\0') {
1212 // Ignore final quote matching the starting quote.
1213 } else {
1214 output->push_back(c: *ptr);
1215 }
1216 }
1217}
1218
// Returns true when CharacterClass::InClass() accepts every character of |s|.
// An empty string trivially satisfies this.
template <typename CharacterClass>
static bool AllInClass(const std::string& s) {
  for (std::string::const_iterator it = s.begin(); it != s.end(); ++it) {
    if (!CharacterClass::InClass(*it)) return false;
  }
  return true;
}
1226
1227bool Tokenizer::IsIdentifier(const std::string& text) {
1228 // Mirrors IDENTIFIER definition in Tokenizer::Next() above.
1229 if (text.size() == 0) return false;
1230 if (!Letter::InClass(c: text.at(n: 0))) return false;
1231 if (!AllInClass<Alphanumeric>(s: text.substr(pos: 1))) return false;
1232 return true;
1233}
1234
1235} // namespace io
1236} // namespace protobuf
1237} // namespace google
1238
1239#include <google/protobuf/port_undef.inc>
1240