lexer.c++ source code [ClickHouse/contrib/capnproto/c++/src/capnp/compiler/lexer.c++]

1	// Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
2	// Licensed under the MIT License:
3	//
4	// Permission is hereby granted, free of charge, to any person obtaining a copy
5	// of this software and associated documentation files (the "Software"), to deal
6	// in the Software without restriction, including without limitation the rights
7	// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8	// copies of the Software, and to permit persons to whom the Software is
9	// furnished to do so, subject to the following conditions:
10	//
11	// The above copyright notice and this permission notice shall be included in
12	// all copies or substantial portions of the Software.
13	//
14	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17	// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18	// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19	// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20	// THE SOFTWARE.
21
22	#include "lexer.h"
23	#include <kj/parse/char.h>
24	#include <kj/debug.h>
25
26	namespace capnp {
27	namespace compiler {
28
29	namespace p = kj::parse;
30
31	bool lex(kj::ArrayPtr<const char> input, LexedStatements::Builder result,
32	ErrorReporter& errorReporter) {
33	Lexer lexer(Orphanage::getForMessageContaining(result), errorReporter);
34
35	auto parser = p::sequence(lexer.getParsers().statementSequence, p::endOfInput);
36
37	Lexer::ParserInput parserInput(input.begin(), input.end());
38	kj::Maybe<kj::Array<Orphan<Statement>>> parseOutput = parser (parserInput);
39
40	KJ_IF_MAYBE(output, parseOutput) {
41	auto l = result.initStatements(output->size());
42	for (uint i = `0`; i < output->size(); i++) {
43	l.adoptWithCaveats(i, kj::mv((*output)[i]));
44	}
45	return true;
46	} else {
47	uint32_t best = parserInput.getBest();
48	errorReporter.addError(best, best, kj::str("Parse error."));
49	return false;
50	}
51	}
52
53	bool lex(kj::ArrayPtr<const char> input, LexedTokens::Builder result,
54	ErrorReporter& errorReporter) {
55	Lexer lexer(Orphanage::getForMessageContaining(result), errorReporter);
56
57	auto parser = p::sequence(lexer.getParsers().tokenSequence, p::endOfInput);
58
59	Lexer::ParserInput parserInput(input.begin(), input.end());
60	kj::Maybe<kj::Array<Orphan<Token>>> parseOutput = parser (parserInput);
61
62	KJ_IF_MAYBE(output, parseOutput) {
63	auto l = result.initTokens(output->size());
64	for (uint i = `0`; i < output->size(); i++) {
65	l.adoptWithCaveats(i, kj::mv((*output)[i]));
66	}
67	return true;
68	} else {
69	uint32_t best = parserInput.getBest();
70	errorReporter.addError(best, best, kj::str("Parse error."));
71	return false;
72	}
73	}
74
75	namespace {
76
77	typedef p::Span<uint32_t> Location;
78
79	Token::Builder initTok(Orphan<Token>& t, const Location& loc) {
80	auto builder = t.get();
81	builder.setStartByte(loc.begin());
82	builder.setEndByte(loc.end());
83	return builder;
84	}
85
86	void buildTokenSequenceList(List<List<Token>>::Builder builder,
87	kj::Array<kj::Array<Orphan<Token>>>&& items) {
88	for (uint i = `0`; i < items.size(); i++) {
89	auto& item = items [i];
90	auto itemBuilder = builder.init(i, item.size());
91	for (uint j = `0`; j < item.size(); j++) {
92	itemBuilder.adoptWithCaveats(j, kj::mv(item [j]));
93	}
94	}
95	}
96
97	void attachDocComment(Statement::Builder statement, kj::Array<kj::String>&& comment) {
98	size_t size = `0`;
99	for (auto& line: comment) {
100	size += line.size() + `1`; // include newline
101	}
102	Text::Builder builder = statement.initDocComment(size);
103	char* pos = builder.begin();
104	for (auto& line: comment) {
105	memcpy(pos, line.begin(), line.size());
106	pos += line.size();
107	*pos++ = `'\n'`;
108	}
109	KJ_ASSERT(pos == builder.end());
110	}
111
112	constexpr auto discardComment =
113	sequence(p::exactChar<`'#'`>(), p::discard(p::many(p::discard(p::anyOfChars("\n").invert()))),
114	p::oneOf(p::exactChar<`'\n'`>(), p::endOfInput));
115	constexpr auto saveComment =
116	sequence(p::exactChar<`'#'`>(), p::discard(p::optional(p::exactChar<`' '`>())),
117	p::charsToString(p::many(p::anyOfChars("\n").invert())),
118	p::oneOf(p::exactChar<`'\n'`>(), p::endOfInput));
119
120	constexpr auto utf8Bom =
121	sequence(p::exactChar<`'\xef'`>(), p::exactChar<`'\xbb'`>(), p::exactChar<`'\xbf'`>());
122
123	constexpr auto bomsAndWhitespace =
124	sequence(p::discardWhitespace,
125	p::discard(p::many(sequence(utf8Bom, p::discardWhitespace))));
126
127	constexpr auto commentsAndWhitespace =
128	sequence(bomsAndWhitespace,
129	p::discard(p::many(sequence(discardComment, bomsAndWhitespace))));
130
131	constexpr auto discardLineWhitespace =
132	p::discard(p::many(p::discard(p::whitespaceChar.invert().orAny("\r\n").invert())));
133	constexpr auto newline = p::oneOf(
134	p::exactChar<`'\n'`>(),
135	sequence(p::exactChar<`'\r'`>(), p::discard(p::optional(p::exactChar<`'\n'`>()))));
136
137	constexpr auto docComment = p::optional(p::sequence(
138	discardLineWhitespace,
139	p::discard(p::optional(newline)),
140	p::oneOrMore(p::sequence(discardLineWhitespace, saveComment))));
141	// Parses a set of comment lines preceded by at most one newline and with no intervening blank
142	// lines.
143
144	} // namespace
145
146	Lexer::Lexer(Orphanage orphanageParam, ErrorReporter& errorReporter)
147	: orphanage (orphanageParam) {
148
149	// Note that because passing an lvalue to a parser constructor uses it by-referencee, it's safe
150	// for us to use parsers.tokenSequence even though we haven't yet constructed it.
151	auto& tokenSequence = parsers.tokenSequence;
152
153	auto& commaDelimitedList = arena.copy(p::transform(
154	p::sequence(tokenSequence, p::many(p::sequence(p::exactChar<`','`>(), tokenSequence))),
155	[](kj::Array<Orphan<Token>>&& first, kj::Array<kj::Array<Orphan<Token>>>&& rest)
156	-> kj::Array<kj::Array<Orphan<Token>>> {
157	if (first == nullptr && rest == nullptr) {
158	// Completely empty list.
159	return nullptr;
160	} else {
161	uint restSize = rest.size();
162	if (restSize > `0` && rest [restSize - `1`] == nullptr) {
163	// Allow for trailing commas by shortening the list by one item if the final token is
164	// nullptr
165	restSize--;
166	}
167	auto result = kj::heapArrayBuilder<kj::Array<Orphan<Token>>>(`1` + restSize); // first+rest
168	result.add(kj::mv(first));
169	for (uint i = `0`; i < restSize ; i++) {
170	result.add(kj::mv(rest [i]));
171	}
172	return result.finish();
173	}
174	}));
175
176	auto& token = arena.copy(p::oneOf(
177	p::transformWithLocation(p::identifier,
178	[this](Location loc, kj::String name) -> Orphan<Token> {
179	auto t = orphanage.newOrphan<Token>();
180	initTok(t, loc).setIdentifier(name);
181	return t;
182	}),
183	p::transformWithLocation(p::doubleQuotedString,
184	[this](Location loc, kj::String text) -> Orphan<Token> {
185	auto t = orphanage.newOrphan<Token>();
186	initTok(t, loc).setStringLiteral(text);
187	return t;
188	}),
189	p::transformWithLocation(p::doubleQuotedHexBinary,
190	[this](Location loc, kj::Array<byte> data) -> Orphan<Token> {
191	auto t = orphanage.newOrphan<Token>();
192	initTok(t, loc).setBinaryLiteral(data);
193	return t;
194	}),
195	p::transformWithLocation(p::integer,
196	[this](Location loc, uint64_t i) -> Orphan<Token> {
197	auto t = orphanage.newOrphan<Token>();
198	initTok(t, loc).setIntegerLiteral(i);
199	return t;
200	}),
201	p::transformWithLocation(p::number,
202	[this](Location loc, double x) -> Orphan<Token> {
203	auto t = orphanage.newOrphan<Token>();
204	initTok(t, loc).setFloatLiteral(x);
205	return t;
206	}),
207	p::transformWithLocation(
208	p::charsToString(p::oneOrMore(p::anyOfChars("!$%&*+-./:<=>?@^\|~"))),
209	[this](Location loc, kj::String text) -> Orphan<Token> {
210	auto t = orphanage.newOrphan<Token>();
211	initTok(t, loc).setOperator(text);
212	return t;
213	}),
214	p::transformWithLocation(
215	sequence(p::exactChar<`'('`>(), commaDelimitedList, p::exactChar<`')'`>()),
216	[this](Location loc, kj::Array<kj::Array<Orphan<Token>>>&& items) -> Orphan<Token> {
217	auto t = orphanage.newOrphan<Token>();
218	buildTokenSequenceList(
219	initTok(t, loc).initParenthesizedList(items.size()), kj::mv(items));
220	return t;
221	}),
222	p::transformWithLocation(
223	sequence(p::exactChar<`'['`>(), commaDelimitedList, p::exactChar<`']'`>()),
224	[this](Location loc, kj::Array<kj::Array<Orphan<Token>>>&& items) -> Orphan<Token> {
225	auto t = orphanage.newOrphan<Token>();
226	buildTokenSequenceList(
227	initTok(t, loc).initBracketedList(items.size()), kj::mv(items));
228	return t;
229	}),
230	p::transformOrReject(p::transformWithLocation(
231	p::oneOf(sequence(p::exactChar<`'\xff'`>(), p::exactChar<`'\xfe'`>()),
232	sequence(p::exactChar<`'\xfe'`>(), p::exactChar<`'\xff'`>()),
233	sequence(p::exactChar<`'\x00'`>())),
234	[&errorReporter](Location loc) -> kj::Maybe<Orphan<Token>> {
235	errorReporter.addError(loc.begin(), loc.end(),
236	"Non-UTF-8 input detected. Cap'n Proto schema files must be UTF-8 text.");
237	return nullptr;
238	}), [](kj::Maybe<Orphan<Token>> param) { return param; })));
239	parsers.tokenSequence = arena.copy(p::sequence(
240	commentsAndWhitespace, p::many(p::sequence(token, commentsAndWhitespace))));
241
242	auto& statementSequence = parsers.statementSequence;
243
244	auto& statementEnd = arena.copy(p::oneOf(
245	transform(p::sequence(p::exactChar<`';'`>(), docComment),
246	[this](kj::Maybe<kj::Array<kj::String>>&& comment) -> Orphan<Statement> {
247	auto result = orphanage.newOrphan<Statement>();
248	auto builder = result.get();
249	KJ_IF_MAYBE(c, comment) {
250	attachDocComment(builder, kj::mv(*c));
251	}
252	builder.setLine();
253	return result;
254	}),
255	transform(
256	p::sequence(p::exactChar<`'{'`>(), docComment, statementSequence, p::exactChar<`'}'`>(),
257	docComment),
258	[this](kj::Maybe<kj::Array<kj::String>>&& comment,
259	kj::Array<Orphan<Statement>>&& statements,
260	kj::Maybe<kj::Array<kj::String>>&& lateComment)
261	-> Orphan<Statement> {
262	auto result = orphanage.newOrphan<Statement>();
263	auto builder = result.get();
264	KJ_IF_MAYBE(c, comment) {
265	attachDocComment(builder, kj::mv(*c));
266	} else KJ_IF_MAYBE(c, lateComment) {
267	attachDocComment(builder, kj::mv(*c));
268	}
269	auto list = builder.initBlock(statements.size());
270	for (uint i = `0`; i < statements.size(); i++) {
271	list.adoptWithCaveats(i, kj::mv(statements [i]));
272	}
273	return result;
274	})
275	));
276
277	auto& statement = arena.copy(p::transformWithLocation(p::sequence(tokenSequence, statementEnd),
278	[](Location loc, kj::Array<Orphan<Token>>&& tokens, Orphan<Statement>&& statement) {
279	auto builder = statement.get();
280	auto tokensBuilder = builder.initTokens(tokens.size());
281	for (uint i = `0`; i < tokens.size(); i++) {
282	tokensBuilder.adoptWithCaveats(i, kj::mv(tokens [i]));
283	}
284	builder.setStartByte(loc.begin());
285	builder.setEndByte(loc.end());
286	return kj::mv(statement);
287	}));
288
289	parsers.statementSequence = arena.copy(sequence(
290	commentsAndWhitespace, many(sequence(statement, commentsAndWhitespace))));
291
292	parsers.token = token;
293	parsers.statement = statement;
294	parsers.emptySpace = commentsAndWhitespace;
295	}
296
297	Lexer::~Lexer() noexcept(false) {}
298
299	} // namespace compiler
300	} // namespace capnp
301

Browse the source code of ClickHouse/contrib/capnproto/c++/src/capnp/compiler/lexer.c++