tokenizer.cc source code [Velox/build/_deps/protobuf-src/src/google/protobuf/io/tokenizer.cc]

1	// Protocol Buffers - Google's data interchange format
2	// Copyright 2008 Google Inc. All rights reserved.
3	// https://developers.google.com/protocol-buffers/
4	//
5	// Redistribution and use in source and binary forms, with or without
6	// modification, are permitted provided that the following conditions are
7	// met:
8	//
9	//     * Redistributions of source code must retain the above copyright
10	// notice, this list of conditions and the following disclaimer.
11	//     * Redistributions in binary form must reproduce the above
12	// copyright notice, this list of conditions and the following disclaimer
13	// in the documentation and/or other materials provided with the
14	// distribution.
15	//     * Neither the name of Google Inc. nor the names of its
16	// contributors may be used to endorse or promote products derived from
17	// this software without specific prior written permission.
18	//
19	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31	// Author: kenton@google.com (Kenton Varda)
32	// Based on original Protocol Buffers design by
33	// Sanjay Ghemawat, Jeff Dean, and others.
34	//
35	// Here we have a hand-written lexer. At first you might ask yourself,
36	// "Hand-written text processing? Is Kenton crazy?!" Well, first of all,
37	// yes I am crazy, but that's beside the point. There are actually reasons
38	// why I ended up writing this this way.
39	//
40	// The traditional approach to lexing is to use lex to generate a lexer for
41	// you. Unfortunately, lex's output is ridiculously ugly and difficult to
42	// integrate cleanly with C++ code, especially abstract code or code meant
43	// as a library. Better parser-generators exist but would add dependencies
44	// which most users won't already have, which we'd like to avoid. (GNU flex
45	// has a C++ output option, but it's still ridiculously ugly, non-abstract,
46	// and not library-friendly.)
47	//
48	// The next approach that any good software engineer should look at is to
49	// use regular expressions. And, indeed, I did. I have code which
50	// implements this same class using regular expressions. It's about 200
51	// lines shorter. However:
52	// - Rather than error messages telling you "This string has an invalid
53	// escape sequence at line 5, column 45", you get error messages like
54	// "Parse error on line 5". Giving more precise errors requires adding
55	// a lot of code that ends up basically as complex as the hand-coded
56	// version anyway.
57	// - The regular expression to match a string literal looks like this:
58	//       kString = new RE("(\"([^\"\\\\]|"              // non-escaped
59	//                        "\\\\[abfnrtv?\"'\\\\0-7]|"   // normal escape
60	//                        "\\\\x[0-9a-fA-F])*\"|"       // hex escape
61	//                        "\'([^\'\\\\]|"        // Also support single-quotes.
62	//                        "\\\\[abfnrtv?\"'\\\\0-7]|"
63	//                        "\\\\x[0-9a-fA-F])*\')");
64	// Verifying the correctness of this line noise is actually harder than
65	// verifying the correctness of ConsumeString(), defined below. I'm not
66	// even confident that the above is correct, after staring at it for some
67	// time.
68	// - PCRE is fast, but there's still more overhead involved than the code
69	// below.
70	// - Sadly, regular expressions are not part of the C standard library, so
71	// using them would require depending on some other library. For the
72	// open source release, this could be really annoying. Nobody likes
73	// downloading one piece of software just to find that they need to
74	// download something else to make it work, and in all likelihood
75	// people downloading Protocol Buffers will already be doing so just
76	// to make something else work. We could include a copy of PCRE with
77	// our code, but that obligates us to keep it up-to-date and just seems
78	// like a big waste just to save 200 lines of code.
79	//
80	// On a similar but unrelated note, I'm even scared to use ctype.h.
81	// Apparently functions like isalpha() are locale-dependent. So, if we used
82	// that, then if this code is being called from some program that doesn't
83	// have its locale set to "C", it would behave strangely. We can't just set
84	// the locale to "C" ourselves since we might break the calling program that
85	// way, particularly if it is multi-threaded. WTF? Someone please let me
86	// (Kenton) know if I'm missing something here...
87	//
88	// I'd love to hear about other alternatives, though, as this code isn't
89	// exactly pretty.
90
91	#include <google/protobuf/io/tokenizer.h>
92
93	#include <google/protobuf/stubs/common.h>
94	#include <google/protobuf/stubs/logging.h>
95	#include <google/protobuf/stubs/strutil.h>
96	#include <google/protobuf/stubs/stringprintf.h>
97	#include <google/protobuf/io/strtod.h>
98	#include <google/protobuf/io/zero_copy_stream.h>
99	#include <google/protobuf/stubs/stl_util.h>
100
101	// Must be included last.
102	#include <google/protobuf/port_def.inc>
103
104	namespace google {
105	namespace protobuf {
106	namespace io {
107	namespace {
108
109	// As mentioned above, I don't trust ctype.h due to the presence of "locales".
110	// So, I have written replacement functions here. Someone please smack me if
111	// this is a bad idea or if there is some way around this.
112	//
113	// These "character classes" are designed to be used in template methods.
114	// For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
115	// whitespace.
116
117	// Note: No class is allowed to contain '\0', since this is used to mark end-
118	// of-input and is handled specially.
119
// Declares a stateless class NAME with one static predicate, InClass(c),
// that evaluates EXPRESSION against the character `c`.
#define CHARACTER_CLASS(NAME, EXPRESSION)                     \
  class NAME {                                                \
   public:                                                    \
    static inline bool InClass(char c) { return EXPRESSION; } \
  }

// Space, newline, tab, carriage return, vertical tab and form feed.
CHARACTER_CLASS(Whitespace, c == ' ' || c == '\n' || c == '\t' || c == '\r' ||
                                c == '\v' || c == '\f');
// Same set as Whitespace except that '\n' is not a member.
CHARACTER_CLASS(WhitespaceNoNewline,
                c == ' ' || c == '\t' || c == '\r' || c == '\v' || c == '\f');

// Characters strictly between '\0' and ' '.
CHARACTER_CLASS(Unprintable, c < ' ' && c > '\0');

CHARACTER_CLASS(Digit, '0' <= c && c <= '9');
CHARACTER_CLASS(OctalDigit, '0' <= c && c <= '7');
CHARACTER_CLASS(HexDigit, ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
                              ('A' <= c && c <= 'F'));

// ASCII letters plus underscore.
CHARACTER_CLASS(Letter,
                ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || (c == '_'));

// ASCII letters, decimal digits and underscore.
CHARACTER_CLASS(Alphanumeric, ('a' <= c && c <= 'z') ||
                                  ('A' <= c && c <= 'Z') ||
                                  ('0' <= c && c <= '9') || (c == '_'));

// Characters accepted directly after a backslash as a one-character escape.
CHARACTER_CLASS(Escape, c == 'a' || c == 'b' || c == 'f' || c == 'n' ||
                            c == 'r' || c == 't' || c == 'v' || c == '\\' ||
                            c == '?' || c == '\'' || c == '\"');

#undef CHARACTER_CLASS
150
// Lookup table mapping every byte value to the integer value of the digit it
// represents: '0'-'9' map to 0-9, and 'A'-'Z' / 'a'-'z' both map to 10-35.
// Every other byte maps to 36, which marks an invalid character since bases
// up to 36 are supported.
static const int8_t kAsciiToInt[256] = {
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 00-0F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 10-1F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // ' '-'/'
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,                           // '0'-'9'
    36, 36, 36, 36, 36, 36, 36,                                      // ':'-'@'
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  // 'A'-'P'
    26, 27, 28, 29, 30, 31, 32, 33, 34, 35,                          // 'Q'-'Z'
    36, 36, 36, 36, 36, 36,                                          // '['-'`'
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  // 'a'-'p'
    26, 27, 28, 29, 30, 31, 32, 33, 34, 35,                          // 'q'-'z'
    36, 36, 36, 36, 36,                                              // '{'-DEL
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 80-8F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // 90-9F
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // A0-AF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // B0-BF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // C0-CF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // D0-DF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // E0-EF
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,  // F0-FF
};

// Given a char, interprets it as a numeric digit and returns its value.
// This supports any number base up to 36; characters that are not digits in
// any such base yield 36.  The `& 0xFF` keeps the table index in [0, 255]
// even when `char` is signed and `digit` is negative.
inline int DigitValue(char digit) { return kAsciiToInt[digit & 0xFF]; }
179
// Maps the character that follows a backslash in a string literal to the
// character the escape sequence stands for (e.g. 'n' -> '\n').
// Inline because it's only used in one place.
inline char TranslateEscape(char c) {
  switch (c) {
    case 'a':
      return '\a';
    case 'b':
      return '\b';
    case 'f':
      return '\f';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    case 't':
      return '\t';
    case 'v':
      return '\v';
    case '\\':
      return '\\';
    case '?':
      return '\?';  // Trigraphs = :(
    case '\'':
      return '\'';
    case '"':
      return '\"';

    // We expect escape sequences to have been validated separately, so any
    // other character simply yields '?'.
    default:
      return '?';
  }
}
211
212	} // anonymous namespace
213
214	ErrorCollector::~ErrorCollector() {}
215
216	// ===================================================================
217
218	Tokenizer::Tokenizer(ZeroCopyInputStream* input,
219	ErrorCollector* error_collector)
220	: input_(input),
221	error_collector_(error_collector),
222	buffer_(NULL),
223	buffer_size_(`0`),
224	buffer_pos_(`0`),
225	read_error_(false),
226	line_(`0`),
227	column_(`0`),
228	record_target_(NULL),
229	record_start_(-`1`),
230	allow_f_after_float_(false),
231	comment_style_(CPP_COMMENT_STYLE),
232	require_space_after_number_(true),
233	allow_multiline_strings_(false) {
234	current_.line = `0`;
235	current_.column = `0`;
236	current_.end_column = `0`;
237	current_.type = TYPE_START;
238
239	Refresh();
240	}
241
242	Tokenizer::~Tokenizer() {
243	// If we had any buffer left unread, return it to the underlying stream
244	// so that someone else can read it.
245	if (buffer_size_ > buffer_pos_) {
246	input_->BackUp(count: buffer_size_ - buffer_pos_);
247	}
248	}
249
250	bool Tokenizer::report_whitespace() const { return report_whitespace_; }
251	// Note: `set_report_whitespace(false)` implies `set_report_newlines(false)`.
252	void Tokenizer::set_report_whitespace(bool report) {
253	report_whitespace_ = report;
254	report_newlines_ &= report;
255	}
256
257	// If true, newline tokens are reported by Next().
258	bool Tokenizer::report_newlines() const { return report_newlines_; }
259	// Note: `set_report_newlines(true)` implies `set_report_whitespace(true)`.
260	void Tokenizer::set_report_newlines(bool report) {
261	report_newlines_ = report;
262	report_whitespace_ \|= report; // enable report_whitespace if necessary
263	}
264
265	// -------------------------------------------------------------------
266	// Internal helpers.
267
268	void Tokenizer::NextChar() {
269	// Update our line and column counters based on the character being
270	// consumed.
271	if (current_char_ == `'\n'`) {
272	++line_;
273	column_ = `0`;
274	} else if (current_char_ == `'\t'`) {
275	column_ += kTabWidth - column_ % kTabWidth;
276	} else {
277	++column_;
278	}
279
280	// Advance to the next character.
281	++buffer_pos_;
282	if (buffer_pos_ < buffer_size_) {
283	current_char_ = buffer_[buffer_pos_];
284	} else {
285	Refresh();
286	}
287	}
288
289	void Tokenizer::Refresh() {
290	if (read_error_) {
291	current_char_ = `'\0'`;
292	return;
293	}
294
295	// If we're in a token, append the rest of the buffer to it.
296	if (record_target_ != NULL && record_start_ < buffer_size_) {
297	record_target_->append(s: buffer_ + record_start_,
298	n: buffer_size_ - record_start_);
299	record_start_ = `0`;
300	}
301
302	const void* data = NULL;
303	buffer_ = NULL;
304	buffer_pos_ = `0`;
305	do {
306	if (!input_->Next(data: &data, size: &buffer_size_)) {
307	// end of stream (or read error)
308	buffer_size_ = `0`;
309	read_error_ = true;
310	current_char_ = `'\0'`;
311	return;
312	}
313	} while (buffer_size_ == `0`);
314
315	buffer_ = static_cast<const char*>(data);
316
317	current_char_ = buffer_[`0`];
318	}
319
320	inline void Tokenizer::RecordTo(std::string* target) {
321	record_target_ = target;
322	record_start_ = buffer_pos_;
323	}
324
325	inline void Tokenizer::StopRecording() {
326	// Note: The if() is necessary because some STL implementations crash when
327	// you call string::append(NULL, 0), presumably because they are trying to
328	// be helpful by detecting the NULL pointer, even though there's nothing
329	// wrong with reading zero bytes from NULL.
330	if (buffer_pos_ != record_start_) {
331	record_target_->append(s: buffer_ + record_start_,
332	n: buffer_pos_ - record_start_);
333	}
334	record_target_ = NULL;
335	record_start_ = -`1`;
336	}
337
338	inline void Tokenizer::StartToken() {
339	current_.type = TYPE_START; // Just for the sake of initializing it.
340	current_.text.clear();
341	current_.line = line_;
342	current_.column = column_;
343	RecordTo(target: &current_.text);
344	}
345
346	inline void Tokenizer::EndToken() {
347	StopRecording();
348	current_.end_column = column_;
349	}
350
351	// -------------------------------------------------------------------
352	// Helper methods that consume characters.
353
354	template <typename CharacterClass>
355	inline bool Tokenizer::LookingAt() {
356	return CharacterClass::InClass(current_char_);
357	}
358
359	template <typename CharacterClass>
360	inline bool Tokenizer::TryConsumeOne() {
361	if (CharacterClass::InClass(current_char_)) {
362	NextChar();
363	return true;
364	} else {
365	return false;
366	}
367	}
368
369	inline bool Tokenizer::TryConsume(char c) {
370	if (current_char_ == c) {
371	NextChar();
372	return true;
373	} else {
374	return false;
375	}
376	}
377
378	template <typename CharacterClass>
379	inline void Tokenizer::ConsumeZeroOrMore() {
380	while (CharacterClass::InClass(current_char_)) {
381	NextChar();
382	}
383	}
384
385	template <typename CharacterClass>
386	inline void Tokenizer::ConsumeOneOrMore(const char* error) {
387	if (!CharacterClass::InClass(current_char_)) {
388	AddError(message: error);
389	} else {
390	do {
391	NextChar();
392	} while (CharacterClass::InClass(current_char_));
393	}
394	}
395
396	// -------------------------------------------------------------------
397	// Methods that read whole patterns matching certain kinds of tokens
398	// or comments.
399
400	void Tokenizer::ConsumeString(char delimiter) {
401	while (true) {
402	switch (current_char_) {
403	case `'\0'`:
404	AddError(message: "Unexpected end of string.");
405	return;
406
407	case `'\n'`: {
408	if (!allow_multiline_strings_) {
409	AddError(message: "String literals cannot cross line boundaries.");
410	return;
411	}
412	NextChar();
413	break;
414	}
415
416	case `'\\'`: {
417	// An escape sequence.
418	NextChar();
419	if (TryConsumeOne<Escape>()) {
420	// Valid escape sequence.
421	} else if (TryConsumeOne<OctalDigit>()) {
422	// Possibly followed by two more octal digits, but these will
423	// just be consumed by the main loop anyway so we don't need
424	// to do so explicitly here.
425	} else if (TryConsume(c: `'x'`)) {
426	if (!TryConsumeOne<HexDigit>()) {
427	AddError(message: "Expected hex digits for escape sequence.");
428	}
429	// Possibly followed by another hex digit, but again we don't care.
430	} else if (TryConsume(c: `'u'`)) {
431	if (!TryConsumeOne<HexDigit>() \|\| !TryConsumeOne<HexDigit>() \|\|
432	!TryConsumeOne<HexDigit>() \|\| !TryConsumeOne<HexDigit>()) {
433	AddError(message: "Expected four hex digits for \\u escape sequence.");
434	}
435	} else if (TryConsume(c: `'U'`)) {
436	// We expect 8 hex digits; but only the range up to 0x10ffff is
437	// legal.
438	if (!TryConsume(c: `'0'`) \|\| !TryConsume(c: `'0'`) \|\|
439	!(TryConsume(c: `'0'`) \|\| TryConsume(c: `'1'`)) \|\|
440	!TryConsumeOne<HexDigit>() \|\| !TryConsumeOne<HexDigit>() \|\|
441	!TryConsumeOne<HexDigit>() \|\| !TryConsumeOne<HexDigit>() \|\|
442	!TryConsumeOne<HexDigit>()) {
443	AddError(
444	message: "Expected eight hex digits up to 10ffff for \\U escape "
445	"sequence");
446	}
447	} else {
448	AddError(message: "Invalid escape sequence in string literal.");
449	}
450	break;
451	}
452
453	default: {
454	if (current_char_ == delimiter) {
455	NextChar();
456	return;
457	}
458	NextChar();
459	break;
460	}
461	}
462	}
463	}
464
465	Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
466	bool started_with_dot) {
467	bool is_float = false;
468
469	if (started_with_zero && (TryConsume(c: `'x'`) \|\| TryConsume(c: `'X'`))) {
470	// A hex number (started with "0x").
471	ConsumeOneOrMore<HexDigit>(error: "\"0x\" must be followed by hex digits.");
472
473	} else if (started_with_zero && LookingAt<Digit>()) {
474	// An octal number (had a leading zero).
475	ConsumeZeroOrMore<OctalDigit>();
476	if (LookingAt<Digit>()) {
477	AddError(message: "Numbers starting with leading zero must be in octal.");
478	ConsumeZeroOrMore<Digit>();
479	}
480
481	} else {
482	// A decimal number.
483	if (started_with_dot) {
484	is_float = true;
485	ConsumeZeroOrMore<Digit>();
486	} else {
487	ConsumeZeroOrMore<Digit>();
488
489	if (TryConsume(c: `'.'`)) {
490	is_float = true;
491	ConsumeZeroOrMore<Digit>();
492	}
493	}
494
495	if (TryConsume(c: `'e'`) \|\| TryConsume(c: `'E'`)) {
496	is_float = true;
497	TryConsume(c: `'-'`) \|\| TryConsume(c: `'+'`);
498	ConsumeOneOrMore<Digit>(error: "\"e\" must be followed by exponent.");
499	}
500
501	if (allow_f_after_float_ && (TryConsume(c: `'f'`) \|\| TryConsume(c: `'F'`))) {
502	is_float = true;
503	}
504	}
505
506	if (LookingAt<Letter>() && require_space_after_number_) {
507	AddError(message: "Need space between number and identifier.");
508	} else if (current_char_ == `'.'`) {
509	if (is_float) {
510	AddError(
511	message: "Already saw decimal point or exponent; can't have another one.");
512	} else {
513	AddError(message: "Hex and octal numbers must be integers.");
514	}
515	}
516
517	return is_float ? TYPE_FLOAT : TYPE_INTEGER;
518	}
519
520	void Tokenizer::ConsumeLineComment(std::string* content) {
521	if (content != NULL) RecordTo(target: content);
522
523	while (current_char_ != `'\0'` && current_char_ != `'\n'`) {
524	NextChar();
525	}
526	TryConsume(c: `'\n'`);
527
528	if (content != NULL) StopRecording();
529	}
530
531	void Tokenizer::ConsumeBlockComment(std::string* content) {
532	int start_line = line_;
533	int start_column = column_ - `2`;
534
535	if (content != NULL) RecordTo(target: content);
536
537	while (true) {
538	while (current_char_ != `'\0'` && current_char_ != `'*'` &&
539	current_char_ != `'/'` && current_char_ != `'\n'`) {
540	NextChar();
541	}
542
543	if (TryConsume(c: `'\n'`)) {
544	if (content != NULL) StopRecording();
545
546	// Consume leading whitespace and asterisk;
547	ConsumeZeroOrMore<WhitespaceNoNewline>();
548	if (TryConsume(c: `'*'`)) {
549	if (TryConsume(c: `'/'`)) {
550	// End of comment.
551	break;
552	}
553	}
554
555	if (content != NULL) RecordTo(target: content);
556	} else if (TryConsume(c: `'*'`) && TryConsume(c: `'/'`)) {
557	// End of comment.
558	if (content != NULL) {
559	StopRecording();
560	// Strip trailing "/".*
561	content->erase(pos: content->size() - `2`);
562	}
563	break;
564	} else if (TryConsume(c: `'/'`) && current_char_ == `'*'`) {
565	// Note: We didn't consume the '' because if there is a '/' after it*
566	// we want to interpret that as the end of the comment.
567	AddError(
568	message: "\"/*\" inside block comment. Block comments cannot be nested.");
569	} else if (current_char_ == `'\0'`) {
570	AddError(message: "End-of-file inside block comment.");
571	error_collector_->AddError(line: start_line, column: start_column,
572	message: " Comment started here.");
573	if (content != NULL) StopRecording();
574	break;
575	}
576	}
577	}
578
579	Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
580	if (comment_style_ == CPP_COMMENT_STYLE && TryConsume(c: `'/'`)) {
581	if (TryConsume(c: `'/'`)) {
582	return LINE_COMMENT;
583	} else if (TryConsume(c: `'*'`)) {
584	return BLOCK_COMMENT;
585	} else {
586	// Oops, it was just a slash. Return it.
587	current_.type = TYPE_SYMBOL;
588	current_.text = "/";
589	current_.line = line_;
590	current_.column = column_ - `1`;
591	current_.end_column = column_;
592	return SLASH_NOT_COMMENT;
593	}
594	} else if (comment_style_ == SH_COMMENT_STYLE && TryConsume(c: `'#'`)) {
595	return LINE_COMMENT;
596	} else {
597	return NO_COMMENT;
598	}
599	}
600
601	bool Tokenizer::TryConsumeWhitespace() {
602	if (report_newlines_) {
603	if (TryConsumeOne<WhitespaceNoNewline>()) {
604	ConsumeZeroOrMore<WhitespaceNoNewline>();
605	current_.type = TYPE_WHITESPACE;
606	return true;
607	}
608	return false;
609	}
610	if (TryConsumeOne<Whitespace>()) {
611	ConsumeZeroOrMore<Whitespace>();
612	current_.type = TYPE_WHITESPACE;
613	return report_whitespace_;
614	}
615	return false;
616	}
617
618	bool Tokenizer::TryConsumeNewline() {
619	if (!report_whitespace_ \|\| !report_newlines_) {
620	return false;
621	}
622	if (TryConsume(c: `'\n'`)) {
623	current_.type = TYPE_NEWLINE;
624	return true;
625	}
626	return false;
627	}
628
629	// -------------------------------------------------------------------
630
631	bool Tokenizer::Next() {
632	previous_ = current_;
633
634	while (!read_error_) {
635	StartToken();
636	bool report_token = TryConsumeWhitespace() \|\| TryConsumeNewline();
637	EndToken();
638	if (report_token) {
639	return true;
640	}
641
642	switch (TryConsumeCommentStart()) {
643	case LINE_COMMENT:
644	ConsumeLineComment(NULL);
645	continue;
646	case BLOCK_COMMENT:
647	ConsumeBlockComment(NULL);
648	continue;
649	case SLASH_NOT_COMMENT:
650	return true;
651	case NO_COMMENT:
652	break;
653	}
654
655	// Check for EOF before continuing.
656	if (read_error_) break;
657
658	if (LookingAt<Unprintable>() \|\| current_char_ == `'\0'`) {
659	AddError(message: "Invalid control characters encountered in text.");
660	NextChar();
661	// Skip more unprintable characters, too. But, remember that '\0' is
662	// also what current_char_ is set to after EOF / read error. We have
663	// to be careful not to go into an infinite loop of trying to consume
664	// it, so make sure to check read_error_ explicitly before consuming
665	// '\0'.
666	while (TryConsumeOne<Unprintable>() \|\|
667	(!read_error_ && TryConsume(c: `'\0'`))) {
668	// Ignore.
669	}
670
671	} else {
672	// Reading some sort of token.
673	StartToken();
674
675	if (TryConsumeOne<Letter>()) {
676	ConsumeZeroOrMore<Alphanumeric>();
677	current_.type = TYPE_IDENTIFIER;
678	} else if (TryConsume(c: `'0'`)) {
679	current_.type = ConsumeNumber(started_with_zero: true, started_with_dot: false);
680	} else if (TryConsume(c: `'.'`)) {
681	// This could be the beginning of a floating-point number, or it could
682	// just be a '.' symbol.
683
684	if (TryConsumeOne<Digit>()) {
685	// It's a floating-point number.
686	if (previous_.type == TYPE_IDENTIFIER &&
687	current_.line == previous_.line &&
688	current_.column == previous_.end_column) {
689	// We don't accept syntax like "blah.123".
690	error_collector_->AddError(
691	line: line_, column: column_ - `2`,
692	message: "Need space between identifier and decimal point.");
693	}
694	current_.type = ConsumeNumber(started_with_zero: false, started_with_dot: true);
695	} else {
696	current_.type = TYPE_SYMBOL;
697	}
698	} else if (TryConsumeOne<Digit>()) {
699	current_.type = ConsumeNumber(started_with_zero: false, started_with_dot: false);
700	} else if (TryConsume(c: `'\"'`)) {
701	ConsumeString(delimiter: `'\"'`);
702	current_.type = TYPE_STRING;
703	} else if (TryConsume(c: `'\''`)) {
704	ConsumeString(delimiter: `'\''`);
705	current_.type = TYPE_STRING;
706	} else {
707	// Check if the high order bit is set.
708	if (current_char_ & `0x80`) {
709	error_collector_->AddError(
710	line: line_, column: column_,
711	message: StringPrintf(format: "Interpreting non ascii codepoint %d.",
712	static_cast<unsigned char>(current_char_)));
713	}
714	NextChar();
715	current_.type = TYPE_SYMBOL;
716	}
717
718	EndToken();
719	return true;
720	}
721	}
722
723	// EOF
724	current_.type = TYPE_END;
725	current_.text.clear();
726	current_.line = line_;
727	current_.column = column_;
728	current_.end_column = column_;
729	return false;
730	}
731
732	namespace {
733
// Helper class for collecting comments and putting them in the right places.
//
// This basically just buffers the most recent comment until it can be decided
// exactly where that comment should be placed.  When Flush() is called, the
// current comment goes into either prev_trailing_comments or detached_comments.
// When the CommentCollector is destroyed, the last buffered comment goes into
// next_leading_comments.
//
// Any of the three output pointers may be null, in which case comments headed
// for that destination are dropped.
class CommentCollector {
 public:
  // Stores the three destinations and clears each one that is non-null.
  CommentCollector(std::string* prev_trailing_comments,
                   std::vector<std::string>* detached_comments,
                   std::string* next_leading_comments)
      : prev_trailing_comments_(prev_trailing_comments),
        detached_comments_(detached_comments),
        next_leading_comments_(next_leading_comments),
        has_comment_(false),
        is_line_comment_(false),
        can_attach_to_prev_(true) {
    if (prev_trailing_comments != nullptr) prev_trailing_comments->clear();
    if (detached_comments != nullptr) detached_comments->clear();
    if (next_leading_comments != nullptr) next_leading_comments->clear();
  }

  ~CommentCollector() {
    // Whatever is in the buffer is a leading comment.
    if (next_leading_comments_ != nullptr && has_comment_) {
      comment_buffer_.swap(*next_leading_comments_);
    }
  }

  // About to read a line comment.  Get the comment buffer pointer in order to
  // read into it.
  std::string* GetBufferForLineComment() {
    // We want to combine with previous line comments, but not block comments.
    if (has_comment_ && !is_line_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = true;
    return &comment_buffer_;
  }

  // About to read a block comment.  Get the comment buffer pointer in order to
  // read into it.  Any comment already buffered is flushed first.
  std::string* GetBufferForBlockComment() {
    if (has_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = false;
    return &comment_buffer_;
  }

  // Discards the buffered comment, if any.
  void ClearBuffer() {
    comment_buffer_.clear();
    has_comment_ = false;
  }

  // Called once we know that the comment buffer is complete and is not
  // connected to the next token.  The first flushed comment is appended to
  // the previous token's trailing comments, unless DetachFromPrev() was
  // called; every later one becomes a detached comment.
  void Flush() {
    if (has_comment_) {
      if (can_attach_to_prev_) {
        if (prev_trailing_comments_ != nullptr) {
          prev_trailing_comments_->append(comment_buffer_);
        }
        can_attach_to_prev_ = false;
      } else {
        if (detached_comments_ != nullptr) {
          detached_comments_->push_back(comment_buffer_);
        }
      }
      ClearBuffer();
    }
  }

  // From now on, flushed comments are detached rather than attached to the
  // previous token.
  void DetachFromPrev() { can_attach_to_prev_ = false; }

 private:
  std::string* prev_trailing_comments_;
  std::vector<std::string>* detached_comments_;
  std::string* next_leading_comments_;

  std::string comment_buffer_;

  // True if any comments were read into comment_buffer_.  This can be true even
  // if comment_buffer_ is empty, namely if the comment was "//".
  bool has_comment_;

  // Is the comment in the comment buffer a line comment?
  bool is_line_comment_;

  // Is it still possible that we could be reading a comment attached to the
  // previous token?
  bool can_attach_to_prev_;
};
830
831	} // namespace
832
833	bool Tokenizer::NextWithComments(std::string* prev_trailing_comments,
834	std::vector<std::string>* detached_comments,
835	std::string* next_leading_comments) {
836	CommentCollector collector(prev_trailing_comments, detached_comments,
837	next_leading_comments);
838
839	if (current_.type == TYPE_START) {
840	// Ignore unicode byte order mark(BOM) if it appears at the file
841	// beginning. Only UTF-8 BOM (0xEF 0xBB 0xBF) is accepted.
842	if (TryConsume(c: static_cast<char>(`0xEF`))) {
843	if (!TryConsume(c: static_cast<char>(`0xBB`)) \|\|
844	!TryConsume(c: static_cast<char>(`0xBF`))) {
845	AddError(
846	message: "Proto file starts with 0xEF but not UTF-8 BOM. "
847	"Only UTF-8 is accepted for proto file.");
848	return false;
849	}
850	}
851	collector.DetachFromPrev();
852	} else {
853	// A comment appearing on the same line must be attached to the previous
854	// declaration.
855	ConsumeZeroOrMore<WhitespaceNoNewline>();
856	switch (TryConsumeCommentStart()) {
857	case LINE_COMMENT:
858	ConsumeLineComment(content: collector.GetBufferForLineComment());
859
860	// Don't allow comments on subsequent lines to be attached to a trailing
861	// comment.
862	collector.Flush();
863	break;
864	case BLOCK_COMMENT:
865	ConsumeBlockComment(content: collector.GetBufferForBlockComment());
866
867	ConsumeZeroOrMore<WhitespaceNoNewline>();
868	if (!TryConsume(c: `'\n'`)) {
869	// Oops, the next token is on the same line. If we recorded a comment
870	// we really have no idea which token it should be attached to.
871	collector.ClearBuffer();
872	return Next();
873	}
874
875	// Don't allow comments on subsequent lines to be attached to a trailing
876	// comment.
877	collector.Flush();
878	break;
879	case SLASH_NOT_COMMENT:
880	return true;
881	case NO_COMMENT:
882	if (!TryConsume(c: `'\n'`)) {
883	// The next token is on the same line. There are no comments.
884	return Next();
885	}
886	break;
887	}
888	}
889
890	// OK, we are now on the line after* the previous token.*
891	while (true) {
892	ConsumeZeroOrMore<WhitespaceNoNewline>();
893
894	switch (TryConsumeCommentStart()) {
895	case LINE_COMMENT:
896	ConsumeLineComment(content: collector.GetBufferForLineComment());
897	break;
898	case BLOCK_COMMENT:
899	ConsumeBlockComment(content: collector.GetBufferForBlockComment());
900
901	// Consume the rest of the line so that we don't interpret it as a
902	// blank line the next time around the loop.
903	ConsumeZeroOrMore<WhitespaceNoNewline>();
904	TryConsume(c: `'\n'`);
905	break;
906	case SLASH_NOT_COMMENT:
907	return true;
908	case NO_COMMENT:
909	if (TryConsume(c: `'\n'`)) {
910	// Completely blank line.
911	collector.Flush();
912	collector.DetachFromPrev();
913	} else {
914	bool result = Next();
915	if (!result \|\| current_.text == "}" \|\| current_.text == "]" \|\|
916	current_.text == ")") {
917	// It looks like we're at the end of a scope. In this case it
918	// makes no sense to attach a comment to the following token.
919	collector.Flush();
920	}
921	return result;
922	}
923	break;
924	}
925	}
926	}
927
928	// -------------------------------------------------------------------
929	// Token-parsing helpers. Remember that these don't need to report
930	// errors since any errors should already have been reported while
931	// tokenizing. Also, these can assume that whatever text they
932	// are given is text that the tokenizer actually parsed as a token
933	// of the given type.
934
935	bool Tokenizer::ParseInteger(const std::string& text, uint64_t max_value,
936	uint64_t* output) {
937	// We can't just use strtoull() because (a) it accepts negative numbers,
938	// (b) We want additional range checks, (c) it reports overflows via errno.
939
940	#if 0
941	const char *str_begin = text.c_str();
942	if (str_begin == `'-'`) return* false;
943	char str_end = nullptr*;
944	errno = `0`;
945	*output = std::strtoull(str_begin, &str_end, `0`);
946	return (errno == `0` && str_end && str_end == `'\0'` && output <= max_value);
947	#endif
948
949	const char* ptr = text.c_str();
950	int base = `10`;
951	uint64_t overflow_if_mul_base = (kuint64max / `10`) + `1`;
952	if (ptr[`0`] == `'0'`) {
953	if (ptr[`1`] == `'x'` \|\| ptr[`1`] == `'X'`) {
954	// This is hex.
955	base = `16`;
956	overflow_if_mul_base = (kuint64max / `16`) + `1`;
957	ptr += `2`;
958	} else {
959	// This is octal.
960	base = `8`;
961	overflow_if_mul_base = (kuint64max / `8`) + `1`;
962	}
963	}
964
965	uint64_t result = `0`;
966	// For all the leading '0's, and also the first non-zero character, we
967	// don't need to multiply.
968	while (*ptr != `'\0'`) {
969	int digit = DigitValue(digit: *ptr++);
970	if (digit >= base) {
971	// The token provided by Tokenizer is invalid. i.e., 099 is an invalid
972	// token, but Tokenizer still think it's integer.
973	return false;
974	}
975	if (digit != `0`) {
976	result = digit;
977	break;
978	}
979	}
980	for (; *ptr != `'\0'`; ptr++) {
981	int digit = DigitValue(digit: *ptr);
982	if (digit < `0` \|\| digit >= base) {
983	// The token provided by Tokenizer is invalid. i.e., 099 is an invalid
984	// token, but Tokenizer still think it's integer.
985	return false;
986	}
987	if (result >= overflow_if_mul_base) {
988	// We know the multiply we're about to do will overflow, so exit now.
989	return false;
990	}
991	// We know that result base won't overflow, but adding digit might...*
992	result = result * base + digit;
993	// C++ guarantees defined "wrap" semantics when unsigned integer
994	// operations overflow, making this a fast way to check if adding
995	// digit made result overflow, and thus, wrap around.
996	if (result < static_cast<uint64_t>(base)) return false;
997	}
998	if (result > max_value) return false;
999
1000	*output = result;
1001	return true;
1002	}
1003
1004	double Tokenizer::ParseFloat(const std::string& text) {
1005	const char* start = text.c_str();
1006	char* end;
1007	double result = NoLocaleStrtod(str: start, endptr: &end);
1008
1009	// "1e" is not a valid float, but if the tokenizer reads it, it will
1010	// report an error but still return it as a valid token. We need to
1011	// accept anything the tokenizer could possibly return, error or not.
1012	if (end == `'e'` \|\| end == `'E'`) {
1013	++end;
1014	if (end == `'-'` \|\| end == `'+'`) ++end;
1015	}
1016
1017	// If the Tokenizer had allow_f_after_float_ enabled, the float may be
1018	// suffixed with the letter 'f'.
1019	if (end == `'f'` \|\| end == `'F'`) {
1020	++end;
1021	}
1022
1023	GOOGLE_LOG_IF(DFATAL,
1024	static_cast<size_t>(end - start) != text.size() \|\| *start == `'-'`)
1025	<< " Tokenizer::ParseFloat() passed text that could not have been"
1026	" tokenized as a float: "
1027	<< CEscape(src: text);
1028	return result;
1029	}
1030
1031	// Helper to append a Unicode code point to a string as UTF8, without bringing
1032	// in any external dependencies.
1033	static void AppendUTF8(uint32_t code_point, std::string* output) {
1034	uint32_t tmp = `0`;
1035	int len = `0`;
1036	if (code_point <= `0x7f`) {
1037	tmp = code_point;
1038	len = `1`;
1039	} else if (code_point <= `0x07ff`) {
1040	tmp = `0x0000c080` \| ((code_point & `0x07c0`) << `2`) \| (code_point & `0x003f`);
1041	len = `2`;
1042	} else if (code_point <= `0xffff`) {
1043	tmp = `0x00e08080` \| ((code_point & `0xf000`) << `4`) \|
1044	((code_point & `0x0fc0`) << `2`) \| (code_point & `0x003f`);
1045	len = `3`;
1046	} else if (code_point <= `0x10ffff`) {
1047	tmp = `0xf0808080` \| ((code_point & `0x1c0000`) << `6`) \|
1048	((code_point & `0x03f000`) << `4`) \| ((code_point & `0x000fc0`) << `2`) \|
1049	(code_point & `0x003f`);
1050	len = `4`;
1051	} else {
1052	// Unicode code points end at 0x10FFFF, so this is out-of-range.
1053	// ConsumeString permits hex values up to 0x1FFFFF, and FetchUnicodePoint
1054	// doesn't perform a range check.
1055	StringAppendF(dst: output, format: "\\U%08x", code_point);
1056	return;
1057	}
1058	tmp = ghtonl(x: tmp);
1059	output->append(s: reinterpret_cast<const char>(&tmp) + sizeof*(tmp) - len, n: len);
1060	}
1061
1062	// Try to read <len> hex digits from ptr, and stuff the numeric result into
1063	// result. Returns true if that many digits were successfully consumed.*
1064	static bool ReadHexDigits(const char* ptr, int len, uint32_t* result) {
1065	*result = `0`;
1066	if (len == `0`) return false;
1067	for (const char* end = ptr + len; ptr < end; ++ptr) {
1068	if (ptr == `'\0'`) return* false;
1069	result = (result << `4`) + DigitValue(digit: *ptr);
1070	}
1071	return true;
1072	}
1073
// Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail
// surrogate. These numbers are in a reserved range of Unicode code points, so
// if we encounter such a pair we know how to parse it and convert it into a
// single code point.
//
// Both ranges are half-open: the "Min" value is the first surrogate of its
// kind and the "Max" value is one past the last.  The head range ends exactly
// where the trail range begins.
static const uint32_t kMinHeadSurrogate = 0xd800;
static const uint32_t kMinTrailSurrogate = 0xdc00;
static const uint32_t kMaxHeadSurrogate = kMinTrailSurrogate;
static const uint32_t kMaxTrailSurrogate = 0xe000;
1083
1084	static inline bool IsHeadSurrogate(uint32_t code_point) {
1085	return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
1086	}
1087
1088	static inline bool IsTrailSurrogate(uint32_t code_point) {
1089	return (code_point >= kMinTrailSurrogate) &&
1090	(code_point < kMaxTrailSurrogate);
1091	}
1092
1093	// Combine a head and trail surrogate into a single Unicode code point.
1094	static uint32_t AssembleUTF16(uint32_t head_surrogate,
1095	uint32_t trail_surrogate) {
1096	GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
1097	GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
1098	return `0x10000` + (((head_surrogate - kMinHeadSurrogate) << `10`) \|
1099	(trail_surrogate - kMinTrailSurrogate));
1100	}
1101
// Convert the escape sequence parameter to a number of expected hex digits:
// 4 for 'u', 8 for 'U', and 0 for any other character.
static inline int UnicodeLength(char key) {
  switch (key) {
    case 'u':
      return 4;
    case 'U':
      return 8;
    default:
      return 0;
  }
}
1108
1109	// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
1110	// to parse that sequence. On success, returns a pointer to the first char
1111	// beyond that sequence, and fills in code_point. On failure, returns ptr*
1112	// itself.
1113	static const char* FetchUnicodePoint(const char* ptr, uint32_t* code_point) {
1114	const char* p = ptr;
1115	// Fetch the code point.
1116	const int len = UnicodeLength(key: *p++);
1117	if (!ReadHexDigits(ptr: p, len, result: code_point)) return ptr;
1118	p += len;
1119
1120	// Check if the code point we read is a "head surrogate." If so, then we
1121	// expect it to be immediately followed by another code point which is a valid
1122	// "trail surrogate," and together they form a UTF-16 pair which decodes into
1123	// a single Unicode point. Trail surrogates may only use \u, not \U.
1124	if (IsHeadSurrogate(code_point: code_point) && p == `'\\'` && *(p + `1`) == `'u'`) {
1125	uint32_t trail_surrogate;
1126	if (ReadHexDigits(ptr: p + `2`, len: `4`, result: &trail_surrogate) &&
1127	IsTrailSurrogate(code_point: trail_surrogate)) {
1128	code_point = AssembleUTF16(head_surrogate: code_point, trail_surrogate);
1129	p += `6`;
1130	}
1131	// If this failed, then we just emit the head surrogate as a code point.
1132	// It's bogus, but so is the string.
1133	}
1134
1135	return p;
1136	}
1137
1138	// The text string must begin and end with single or double quote
1139	// characters.
1140	void Tokenizer::ParseStringAppend(const std::string& text,
1141	std::string* output) {
1142	// Reminder: text[0] is always a quote character. (If text is
1143	// empty, it's invalid, so we'll just return).
1144	const size_t text_size = text.size();
1145	if (text_size == `0`) {
1146	GOOGLE_LOG(DFATAL) << " Tokenizer::ParseStringAppend() passed text that could not"
1147	" have been tokenized as a string: "
1148	<< CEscape(src: text);
1149	return;
1150	}
1151
1152	// Reserve room for new string. The branch is necessary because if
1153	// there is already space available the reserve() call might
1154	// downsize the output.
1155	const size_t new_len = text_size + output->size();
1156	if (new_len > output->capacity()) {
1157	output->reserve(res_arg: new_len);
1158	}
1159
1160	// Loop through the string copying characters to "output" and
1161	// interpreting escape sequences. Note that any invalid escape
1162	// sequences or other errors were already reported while tokenizing.
1163	// In this case we do not need to produce valid results.
1164	for (const char* ptr = text.c_str() + `1`; *ptr != `'\0'`; ptr++) {
1165	if (*ptr == `'\\'` && ptr[`1`] != `'\0'`) {
1166	// An escape sequence.
1167	++ptr;
1168
1169	if (OctalDigit::InClass(c: *ptr)) {
1170	// An octal escape. May one, two, or three digits.
1171	int code = DigitValue(digit: *ptr);
1172	if (OctalDigit::InClass(c: ptr[`1`])) {
1173	++ptr;
1174	code = code * `8` + DigitValue(digit: *ptr);
1175	}
1176	if (OctalDigit::InClass(c: ptr[`1`])) {
1177	++ptr;
1178	code = code * `8` + DigitValue(digit: *ptr);
1179	}
1180	output->push_back(c: static_cast<char>(code));
1181
1182	} else if (*ptr == `'x'`) {
1183	// A hex escape. May zero, one, or two digits. (The zero case
1184	// will have been caught as an error earlier.)
1185	int code = `0`;
1186	if (HexDigit::InClass(c: ptr[`1`])) {
1187	++ptr;
1188	code = DigitValue(digit: *ptr);
1189	}
1190	if (HexDigit::InClass(c: ptr[`1`])) {
1191	++ptr;
1192	code = code * `16` + DigitValue(digit: *ptr);
1193	}
1194	output->push_back(c: static_cast<char>(code));
1195
1196	} else if (ptr == `'u'` \|\| ptr == `'U'`) {
1197	uint32_t unicode;
1198	const char* end = FetchUnicodePoint(ptr, code_point: &unicode);
1199	if (end == ptr) {
1200	// Failure: Just dump out what we saw, don't try to parse it.
1201	output->push_back(c: *ptr);
1202	} else {
1203	AppendUTF8(code_point: unicode, output);
1204	ptr = end - `1`; // Because we're about to ++ptr.
1205	}
1206	} else {
1207	// Some other escape code.
1208	output->push_back(c: TranslateEscape(c: *ptr));
1209	}
1210
1211	} else if (*ptr == text [`0`] && ptr[`1`] == `'\0'`) {
1212	// Ignore final quote matching the starting quote.
1213	} else {
1214	output->push_back(c: *ptr);
1215	}
1216	}
1217	}
1218
// Returns true if every character of s satisfies CharacterClass::InClass().
// An empty string trivially qualifies.
template <typename CharacterClass>
static bool AllInClass(const std::string& s) {
  for (std::string::size_type i = 0; i < s.size(); ++i) {
    const bool member = CharacterClass::InClass(s[i]);
    if (!member) {
      return false;
    }
  }
  return true;
}
1226
1227	bool Tokenizer::IsIdentifier(const std::string& text) {
1228	// Mirrors IDENTIFIER definition in Tokenizer::Next() above.
1229	if (text.size() == `0`) return false;
1230	if (!Letter::InClass(c: text.at(n: `0`))) return false;
1231	if (!AllInClass<Alphanumeric>(s: text.substr(pos: `1`))) return false;
1232	return true;
1233	}
1234
1235	} // namespace io
1236	} // namespace protobuf
1237	} // namespace google
1238
1239	#include <google/protobuf/port_undef.inc>
1240

// Source: Velox/build/_deps/protobuf-src/src/google/protobuf/io/tokenizer.cc