tokenizer.h source code [Velox/build/_deps/protobuf-src/src/google/protobuf/io/tokenizer.h]

1	// Protocol Buffers - Google's data interchange format
2	// Copyright 2008 Google Inc. All rights reserved.
3	// https://developers.google.com/protocol-buffers/
4	//
5	// Redistribution and use in source and binary forms, with or without
6	// modification, are permitted provided that the following conditions are
7	// met:
8	//
9	//     * Redistributions of source code must retain the above copyright
10	// notice, this list of conditions and the following disclaimer.
11	//     * Redistributions in binary form must reproduce the above
12	// copyright notice, this list of conditions and the following disclaimer
13	// in the documentation and/or other materials provided with the
14	// distribution.
15	//     * Neither the name of Google Inc. nor the names of its
16	// contributors may be used to endorse or promote products derived from
17	// this software without specific prior written permission.
18	//
19	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31	// Author: kenton@google.com (Kenton Varda)
32	// Based on original Protocol Buffers design by
33	// Sanjay Ghemawat, Jeff Dean, and others.
34	//
35	// Class for parsing tokenized text from a ZeroCopyInputStream.
36
37	#ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
38	#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
39
40
41	#include <string>
42	#include <vector>
43
44	#include <google/protobuf/stubs/common.h>
45	#include <google/protobuf/stubs/logging.h>
46
47	// Must be included last.
48	#include <google/protobuf/port_def.inc>
49
50	namespace google {
51	namespace protobuf {
52	namespace io {
53
54	class ZeroCopyInputStream; // zero_copy_stream.h
55
56	// Defined in this file.
57	class ErrorCollector;
58	class Tokenizer;
59
// A "column number", as used by the proto compiler, is the number of bytes
// that precede a given byte on its line, with one exception: a tab character
// advances the count to the next multiple of 8. Column numbers are
// zero-based, whereas many user interfaces display one-based columns.
using ColumnNumber = int;
65
66	// Abstract interface for an object which collects the errors that occur
67	// during parsing. A typical implementation might simply print the errors
68	// to stdout.
69	class PROTOBUF_EXPORT ErrorCollector {
70	public:
71	inline ErrorCollector() {}
72	virtual ~ErrorCollector();
73
74	// Indicates that there was an error in the input at the given line and
75	// column numbers. The numbers are zero-based, so you may want to add
76	// 1 to each before printing them.
77	virtual void AddError(int line, ColumnNumber column,
78	const std::string& message) = `0`;
79
80	// Indicates that there was a warning in the input at the given line and
81	// column numbers. The numbers are zero-based, so you may want to add
82	// 1 to each before printing them.
83	virtual void AddWarning(int / line /, ColumnNumber / column /,
84	const std::string& / message /) {}
85
86	private:
87	GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
88	};
89
90	// This class converts a stream of raw text into a stream of tokens for
91	// the protocol definition parser to parse. The tokens recognized are
92	// similar to those that make up the C language; see the TokenType enum for
93	// precise descriptions. Whitespace and comments are skipped. By default,
94	// C- and C++-style comments are recognized, but other styles can be used by
95	// calling set_comment_style().
96	class PROTOBUF_EXPORT Tokenizer {
97	public:
98	// Construct a Tokenizer that reads and tokenizes text from the given
99	// input stream and writes errors to the given error_collector.
100	// The caller keeps ownership of input and error_collector.
101	Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
102	~Tokenizer();
103
104	enum TokenType {
105	TYPE_START, // Next() has not yet been called.
106	TYPE_END, // End of input reached. "text" is empty.
107
108	TYPE_IDENTIFIER, // A sequence of letters, digits, and underscores, not
109	// starting with a digit. It is an error for a number
110	// to be followed by an identifier with no space in
111	// between.
112	TYPE_INTEGER, // A sequence of digits representing an integer. Normally
113	// the digits are decimal, but a prefix of "0x" indicates
114	// a hex number and a leading zero indicates octal, just
115	// like with C numeric literals. A leading negative sign
116	// is NOT included in the token; it's up to the parser to
117	// interpret the unary minus operator on its own.
118	TYPE_FLOAT, // A floating point literal, with a fractional part and/or
119	// an exponent. Always in decimal. Again, never
120	// negative.
121	TYPE_STRING, // A quoted sequence of escaped characters. Either single
122	// or double quotes can be used, but they must match.
123	// A string literal cannot cross a line break.
124	TYPE_SYMBOL, // Any other printable character, like '!' or '+'.
125	// Symbols are always a single character, so "!+$%" is
126	// four tokens.
127	TYPE_WHITESPACE, // A sequence of whitespace. This token type is only
128	// produced if report_whitespace() is true. It is not
129	// reported for whitespace within comments or strings.
130	TYPE_NEWLINE, // A newline (\n). This token type is only
131	// produced if report_whitespace() is true and
132	// report_newlines() is true. It is not reported for
133	// newlines in comments or strings.
134	};
135
136	// Structure representing a token read from the token stream.
137	struct Token {
138	TokenType type;
139	std::string text; // The exact text of the token as it appeared in
140	// the input. e.g. tokens of TYPE_STRING will still
141	// be escaped and in quotes.
142
143	// "line" and "column" specify the position of the first character of
144	// the token within the input stream. They are zero-based.
145	int line;
146	ColumnNumber column;
147	ColumnNumber end_column;
148	};
149
150	// Get the current token. This is updated when Next() is called. Before
151	// the first call to Next(), current() has type TYPE_START and no contents.
152	const Token& current();
153
154	// Return the previous token -- i.e. what current() returned before the
155	// previous call to Next().
156	const Token& previous();
157
158	// Advance to the next token. Returns false if the end of the input is
159	// reached.
160	bool Next();
161
162	// Like Next(), but also collects comments which appear between the previous
163	// and next tokens.
164	//
165	// Comments which appear to be attached to the previous token are stored
166	// in prev_tailing_comments. Comments which appear to be attached to the*
167	// next token are stored in next_leading_comments. Comments appearing in*
168	// between which do not appear to be attached to either will be added to
169	// detached_comments. Any of these parameters can be NULL to simply discard
170	// the comments.
171	//
172	// A series of line comments appearing on consecutive lines, with no other
173	// tokens appearing on those lines, will be treated as a single comment.
174	//
175	// Only the comment content is returned; comment markers (e.g. //) are
176	// stripped out. For block comments, leading whitespace and an asterisk will
177	// be stripped from the beginning of each line other than the first. Newlines
178	// are included in the output.
179	//
180	// Examples:
181	//
182	// optional int32 foo = 1; // Comment attached to foo.
183	// // Comment attached to bar.
184	// optional int32 bar = 2;
185	//
186	// optional string baz = 3;
187	// // Comment attached to baz.
188	// // Another line attached to baz.
189	//
190	// // Comment attached to qux.
191	// //
192	// // Another line attached to qux.
193	// optional double qux = 4;
194	//
195	// // Detached comment. This is not attached to qux or corge
196	// // because there are blank lines separating it from both.
197	//
198	// optional string corge = 5;
199	// / Block comment attached*
200	// to corge. Leading asterisks*
201	// will be removed. /
202	// / Block comment attached to*
203	// grault. /
204	// optional int32 grault = 6;
205	bool NextWithComments(std::string* prev_trailing_comments,
206	std::vector<std::string>* detached_comments,
207	std::string* next_leading_comments);
208
209	// Parse helpers ---------------------------------------------------
210
211	// Parses a TYPE_FLOAT token. This never fails, so long as the text actually
212	// comes from a TYPE_FLOAT token parsed by Tokenizer. If it doesn't, the
213	// result is undefined (possibly an assert failure).
214	static double ParseFloat(const std::string& text);
215
216	// Parses a TYPE_STRING token. This never fails, so long as the text actually
217	// comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the
218	// result is undefined (possibly an assert failure).
219	static void ParseString(const std::string& text, std::string* output);
220
221	// Identical to ParseString, but appends to output.
222	static void ParseStringAppend(const std::string& text, std::string* output);
223
224	// Parses a TYPE_INTEGER token. Returns false if the result would be
225	// greater than max_value. Otherwise, returns true and sets output to the*
226	// result. If the text is not from a Token of type TYPE_INTEGER originally
227	// parsed by a Tokenizer, the result is undefined (possibly an assert
228	// failure).
229	static bool ParseInteger(const std::string& text, uint64_t max_value,
230	uint64_t* output);
231
232	// Options ---------------------------------------------------------
233
234	// Set true to allow floats to be suffixed with the letter 'f'. Tokens
235	// which would otherwise be integers but which have the 'f' suffix will be
236	// forced to be interpreted as floats. For all other purposes, the 'f' is
237	// ignored.
238	void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }
239
240	// Valid values for set_comment_style().
241	enum CommentStyle {
242	// Line comments begin with "//", block comments are delimited by "/" and*
243	// "/".*
244	CPP_COMMENT_STYLE,
245	// Line comments begin with "#". No way to write block comments.
246	SH_COMMENT_STYLE
247	};
248
249	// Sets the comment style.
250	void set_comment_style(CommentStyle style) { comment_style_ = style; }
251
252	// Whether to require whitespace between a number and a field name.
253	// Default is true. Do not use this; for Google-internal cleanup only.
254	void set_require_space_after_number(bool require) {
255	require_space_after_number_ = require;
256	}
257
258	// Whether to allow string literals to span multiple lines. Default is false.
259	// Do not use this; for Google-internal cleanup only.
260	void set_allow_multiline_strings(bool allow) {
261	allow_multiline_strings_ = allow;
262	}
263
264	// If true, whitespace tokens are reported by Next().
265	// Note: `set_report_whitespace(false)` implies `set_report_newlines(false)`.
266	bool report_whitespace() const;
267	void set_report_whitespace(bool report);
268
269	// If true, newline tokens are reported by Next().
270	// Note: `set_report_newlines(true)` implies `set_report_whitespace(true)`.
271	bool report_newlines() const;
272	void set_report_newlines(bool report);
273
274	// External helper: validate an identifier.
275	static bool IsIdentifier(const std::string& text);
276
277	// -----------------------------------------------------------------
278	private:
279	GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);
280
281	Token current_; // Returned by current().
282	Token previous_; // Returned by previous().
283
284	ZeroCopyInputStream* input_;
285	ErrorCollector* error_collector_;
286
287	char current_char_; // == buffer_[buffer_pos_], updated by NextChar().
288	const char* buffer_; // Current buffer returned from input_.
289	int buffer_size_; // Size of buffer_.
290	int buffer_pos_; // Current position within the buffer.
291	bool read_error_; // Did we previously encounter a read error?
292
293	// Line and column number of current_char_ within the whole input stream.
294	int line_;
295	ColumnNumber column_;
296
297	// String to which text should be appended as we advance through it.
298	// Call RecordTo(&str) to start recording and StopRecording() to stop.
299	// E.g. StartToken() calls RecordTo(&current_.text). record_start_ is the
300	// position within the current buffer where recording started.
301	std::string* record_target_;
302	int record_start_;
303
304	// Options.
305	bool allow_f_after_float_;
306	CommentStyle comment_style_;
307	bool require_space_after_number_;
308	bool allow_multiline_strings_;
309	bool report_whitespace_ = false;
310	bool report_newlines_ = false;
311
312	// Since we count columns we need to interpret tabs somehow. We'll take
313	// the standard 8-character definition for lack of any way to do better.
314	// This must match the documentation of ColumnNumber.
315	static const int kTabWidth = `8`;
316
317	// -----------------------------------------------------------------
318	// Helper methods.
319
320	// Consume this character and advance to the next one.
321	void NextChar();
322
323	// Read a new buffer from the input.
324	void Refresh();
325
326	inline void RecordTo(std::string* target);
327	inline void StopRecording();
328
329	// Called when the current character is the first character of a new
330	// token (not including whitespace or comments).
331	inline void StartToken();
332	// Called when the current character is the first character after the
333	// end of the last token. After this returns, current_.text will
334	// contain all text consumed since StartToken() was called.
335	inline void EndToken();
336
337	// Convenience method to add an error at the current line and column.
338	void AddError(const std::string& message) {
339	error_collector_->AddError(line: line_, column: column_, message);
340	}
341
342	// -----------------------------------------------------------------
343	// The following four methods are used to consume tokens of specific
344	// types. They are actually used to consume all characters after
345	// the first, since the calling function consumes the first character
346	// in order to decide what kind of token is being read.
347
348	// Read and consume a string, ending when the given delimiter is
349	// consumed.
350	void ConsumeString(char delimiter);
351
352	// Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
353	// depending on what was read. This needs to know if the first
354	// character was a zero in order to correctly recognize hex and octal
355	// numbers.
356	// It also needs to know if the first character was a . to parse floating
357	// point correctly.
358	TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
359
360	// Consume the rest of a line.
361	void ConsumeLineComment(std::string* content);
362	// Consume until "/".*
363	void ConsumeBlockComment(std::string* content);
364
365	enum NextCommentStatus {
366	// Started a line comment.
367	LINE_COMMENT,
368
369	// Started a block comment.
370	BLOCK_COMMENT,
371
372	// Consumed a slash, then realized it wasn't a comment. current_ has
373	// been filled in with a slash token. The caller should return it.
374	SLASH_NOT_COMMENT,
375
376	// We do not appear to be starting a comment here.
377	NO_COMMENT
378	};
379
380	// If we're at the start of a new comment, consume it and return what kind
381	// of comment it is.
382	NextCommentStatus TryConsumeCommentStart();
383
384	// If we're looking at a TYPE_WHITESPACE token and `report_whitespace_` is
385	// true, consume it and return true.
386	bool TryConsumeWhitespace();
387
388	// If we're looking at a TYPE_NEWLINE token and `report_newlines_` is true,
389	// consume it and return true.
390	bool TryConsumeNewline();
391
392	// -----------------------------------------------------------------
393	// These helper methods make the parsing code more readable. The
394	// "character classes" referred to are defined at the top of the .cc file.
395	// Basically it is a C++ class with one method:
396	// static bool InClass(char c);
397	// The method returns true if c is a member of this "class", like "Letter"
398	// or "Digit".
399
400	// Returns true if the current character is of the given character
401	// class, but does not consume anything.
402	template <typename CharacterClass>
403	inline bool LookingAt();
404
405	// If the current character is in the given class, consume it and return
406	// true. Otherwise return false.
407	// e.g. TryConsumeOne<Letter>()
408	template <typename CharacterClass>
409	inline bool TryConsumeOne();
410
411	// Like above, but try to consume the specific character indicated.
412	inline bool TryConsume(char c);
413
414	// Consume zero or more of the given character class.
415	template <typename CharacterClass>
416	inline void ConsumeZeroOrMore();
417
418	// Consume one or more of the given character class or log the given
419	// error message.
420	// e.g. ConsumeOneOrMore<Digit>("Expected digits.");
421	template <typename CharacterClass>
422	inline void ConsumeOneOrMore(const char* error);
423	};
424
425	// inline methods ====================================================
426	inline const Tokenizer::Token& Tokenizer::current() { return current_; }
427
428	inline const Tokenizer::Token& Tokenizer::previous() { return previous_; }
429
430	inline void Tokenizer::ParseString(const std::string& text,
431	std::string* output) {
432	output->clear();
433	ParseStringAppend(text, output);
434	}
435
436	} // namespace io
437	} // namespace protobuf
438	} // namespace google
439
440	#include <google/protobuf/port_undef.inc>
441
442	#endif // GOOGLE_PROTOBUF_IO_TOKENIZER_H__
443

Browse the source code of Velox/build/_deps/protobuf-src/src/google/protobuf/io/tokenizer.h