1// Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
2// Licensed under the MIT License:
3//
4// Permission is hereby granted, free of charge, to any person obtaining a copy
5// of this software and associated documentation files (the "Software"), to deal
6// in the Software without restriction, including without limitation the rights
7// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8// copies of the Software, and to permit persons to whom the Software is
9// furnished to do so, subject to the following conditions:
10//
11// The above copyright notice and this permission notice shall be included in
12// all copies or substantial portions of the Software.
13//
14// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20// THE SOFTWARE.
21
22#include "lexer.h"
23#include <kj/parse/char.h>
24#include <kj/debug.h>
25
26namespace capnp {
27namespace compiler {
28
29namespace p = kj::parse;
30
31bool lex(kj::ArrayPtr<const char> input, LexedStatements::Builder result,
32 ErrorReporter& errorReporter) {
33 Lexer lexer(Orphanage::getForMessageContaining(result), errorReporter);
34
35 auto parser = p::sequence(lexer.getParsers().statementSequence, p::endOfInput);
36
37 Lexer::ParserInput parserInput(input.begin(), input.end());
38 kj::Maybe<kj::Array<Orphan<Statement>>> parseOutput = parser(parserInput);
39
40 KJ_IF_MAYBE(output, parseOutput) {
41 auto l = result.initStatements(output->size());
42 for (uint i = 0; i < output->size(); i++) {
43 l.adoptWithCaveats(i, kj::mv((*output)[i]));
44 }
45 return true;
46 } else {
47 uint32_t best = parserInput.getBest();
48 errorReporter.addError(best, best, kj::str("Parse error."));
49 return false;
50 }
51}
52
53bool lex(kj::ArrayPtr<const char> input, LexedTokens::Builder result,
54 ErrorReporter& errorReporter) {
55 Lexer lexer(Orphanage::getForMessageContaining(result), errorReporter);
56
57 auto parser = p::sequence(lexer.getParsers().tokenSequence, p::endOfInput);
58
59 Lexer::ParserInput parserInput(input.begin(), input.end());
60 kj::Maybe<kj::Array<Orphan<Token>>> parseOutput = parser(parserInput);
61
62 KJ_IF_MAYBE(output, parseOutput) {
63 auto l = result.initTokens(output->size());
64 for (uint i = 0; i < output->size(); i++) {
65 l.adoptWithCaveats(i, kj::mv((*output)[i]));
66 }
67 return true;
68 } else {
69 uint32_t best = parserInput.getBest();
70 errorReporter.addError(best, best, kj::str("Parse error."));
71 return false;
72 }
73}
74
75namespace {
76
77typedef p::Span<uint32_t> Location;
78
79Token::Builder initTok(Orphan<Token>& t, const Location& loc) {
80 auto builder = t.get();
81 builder.setStartByte(loc.begin());
82 builder.setEndByte(loc.end());
83 return builder;
84}
85
86void buildTokenSequenceList(List<List<Token>>::Builder builder,
87 kj::Array<kj::Array<Orphan<Token>>>&& items) {
88 for (uint i = 0; i < items.size(); i++) {
89 auto& item = items[i];
90 auto itemBuilder = builder.init(i, item.size());
91 for (uint j = 0; j < item.size(); j++) {
92 itemBuilder.adoptWithCaveats(j, kj::mv(item[j]));
93 }
94 }
95}
96
97void attachDocComment(Statement::Builder statement, kj::Array<kj::String>&& comment) {
98 size_t size = 0;
99 for (auto& line: comment) {
100 size += line.size() + 1; // include newline
101 }
102 Text::Builder builder = statement.initDocComment(size);
103 char* pos = builder.begin();
104 for (auto& line: comment) {
105 memcpy(pos, line.begin(), line.size());
106 pos += line.size();
107 *pos++ = '\n';
108 }
109 KJ_ASSERT(pos == builder.end());
110}
111
112constexpr auto discardComment =
113 sequence(p::exactChar<'#'>(), p::discard(p::many(p::discard(p::anyOfChars("\n").invert()))),
114 p::oneOf(p::exactChar<'\n'>(), p::endOfInput));
115constexpr auto saveComment =
116 sequence(p::exactChar<'#'>(), p::discard(p::optional(p::exactChar<' '>())),
117 p::charsToString(p::many(p::anyOfChars("\n").invert())),
118 p::oneOf(p::exactChar<'\n'>(), p::endOfInput));
119
120constexpr auto utf8Bom =
121 sequence(p::exactChar<'\xef'>(), p::exactChar<'\xbb'>(), p::exactChar<'\xbf'>());
122
123constexpr auto bomsAndWhitespace =
124 sequence(p::discardWhitespace,
125 p::discard(p::many(sequence(utf8Bom, p::discardWhitespace))));
126
127constexpr auto commentsAndWhitespace =
128 sequence(bomsAndWhitespace,
129 p::discard(p::many(sequence(discardComment, bomsAndWhitespace))));
130
131constexpr auto discardLineWhitespace =
132 p::discard(p::many(p::discard(p::whitespaceChar.invert().orAny("\r\n").invert())));
133constexpr auto newline = p::oneOf(
134 p::exactChar<'\n'>(),
135 sequence(p::exactChar<'\r'>(), p::discard(p::optional(p::exactChar<'\n'>()))));
136
137constexpr auto docComment = p::optional(p::sequence(
138 discardLineWhitespace,
139 p::discard(p::optional(newline)),
140 p::oneOrMore(p::sequence(discardLineWhitespace, saveComment))));
141// Parses a set of comment lines preceded by at most one newline and with no intervening blank
142// lines.
143
144} // namespace
145
146Lexer::Lexer(Orphanage orphanageParam, ErrorReporter& errorReporter)
147 : orphanage(orphanageParam) {
148
149 // Note that because passing an lvalue to a parser constructor uses it by-referencee, it's safe
150 // for us to use parsers.tokenSequence even though we haven't yet constructed it.
151 auto& tokenSequence = parsers.tokenSequence;
152
153 auto& commaDelimitedList = arena.copy(p::transform(
154 p::sequence(tokenSequence, p::many(p::sequence(p::exactChar<','>(), tokenSequence))),
155 [](kj::Array<Orphan<Token>>&& first, kj::Array<kj::Array<Orphan<Token>>>&& rest)
156 -> kj::Array<kj::Array<Orphan<Token>>> {
157 if (first == nullptr && rest == nullptr) {
158 // Completely empty list.
159 return nullptr;
160 } else {
161 uint restSize = rest.size();
162 if (restSize > 0 && rest[restSize - 1] == nullptr) {
163 // Allow for trailing commas by shortening the list by one item if the final token is
164 // nullptr
165 restSize--;
166 }
167 auto result = kj::heapArrayBuilder<kj::Array<Orphan<Token>>>(1 + restSize); // first+rest
168 result.add(kj::mv(first));
169 for (uint i = 0; i < restSize ; i++) {
170 result.add(kj::mv(rest[i]));
171 }
172 return result.finish();
173 }
174 }));
175
176 auto& token = arena.copy(p::oneOf(
177 p::transformWithLocation(p::identifier,
178 [this](Location loc, kj::String name) -> Orphan<Token> {
179 auto t = orphanage.newOrphan<Token>();
180 initTok(t, loc).setIdentifier(name);
181 return t;
182 }),
183 p::transformWithLocation(p::doubleQuotedString,
184 [this](Location loc, kj::String text) -> Orphan<Token> {
185 auto t = orphanage.newOrphan<Token>();
186 initTok(t, loc).setStringLiteral(text);
187 return t;
188 }),
189 p::transformWithLocation(p::doubleQuotedHexBinary,
190 [this](Location loc, kj::Array<byte> data) -> Orphan<Token> {
191 auto t = orphanage.newOrphan<Token>();
192 initTok(t, loc).setBinaryLiteral(data);
193 return t;
194 }),
195 p::transformWithLocation(p::integer,
196 [this](Location loc, uint64_t i) -> Orphan<Token> {
197 auto t = orphanage.newOrphan<Token>();
198 initTok(t, loc).setIntegerLiteral(i);
199 return t;
200 }),
201 p::transformWithLocation(p::number,
202 [this](Location loc, double x) -> Orphan<Token> {
203 auto t = orphanage.newOrphan<Token>();
204 initTok(t, loc).setFloatLiteral(x);
205 return t;
206 }),
207 p::transformWithLocation(
208 p::charsToString(p::oneOrMore(p::anyOfChars("!$%&*+-./:<=>?@^|~"))),
209 [this](Location loc, kj::String text) -> Orphan<Token> {
210 auto t = orphanage.newOrphan<Token>();
211 initTok(t, loc).setOperator(text);
212 return t;
213 }),
214 p::transformWithLocation(
215 sequence(p::exactChar<'('>(), commaDelimitedList, p::exactChar<')'>()),
216 [this](Location loc, kj::Array<kj::Array<Orphan<Token>>>&& items) -> Orphan<Token> {
217 auto t = orphanage.newOrphan<Token>();
218 buildTokenSequenceList(
219 initTok(t, loc).initParenthesizedList(items.size()), kj::mv(items));
220 return t;
221 }),
222 p::transformWithLocation(
223 sequence(p::exactChar<'['>(), commaDelimitedList, p::exactChar<']'>()),
224 [this](Location loc, kj::Array<kj::Array<Orphan<Token>>>&& items) -> Orphan<Token> {
225 auto t = orphanage.newOrphan<Token>();
226 buildTokenSequenceList(
227 initTok(t, loc).initBracketedList(items.size()), kj::mv(items));
228 return t;
229 }),
230 p::transformOrReject(p::transformWithLocation(
231 p::oneOf(sequence(p::exactChar<'\xff'>(), p::exactChar<'\xfe'>()),
232 sequence(p::exactChar<'\xfe'>(), p::exactChar<'\xff'>()),
233 sequence(p::exactChar<'\x00'>())),
234 [&errorReporter](Location loc) -> kj::Maybe<Orphan<Token>> {
235 errorReporter.addError(loc.begin(), loc.end(),
236 "Non-UTF-8 input detected. Cap'n Proto schema files must be UTF-8 text.");
237 return nullptr;
238 }), [](kj::Maybe<Orphan<Token>> param) { return param; })));
239 parsers.tokenSequence = arena.copy(p::sequence(
240 commentsAndWhitespace, p::many(p::sequence(token, commentsAndWhitespace))));
241
242 auto& statementSequence = parsers.statementSequence;
243
244 auto& statementEnd = arena.copy(p::oneOf(
245 transform(p::sequence(p::exactChar<';'>(), docComment),
246 [this](kj::Maybe<kj::Array<kj::String>>&& comment) -> Orphan<Statement> {
247 auto result = orphanage.newOrphan<Statement>();
248 auto builder = result.get();
249 KJ_IF_MAYBE(c, comment) {
250 attachDocComment(builder, kj::mv(*c));
251 }
252 builder.setLine();
253 return result;
254 }),
255 transform(
256 p::sequence(p::exactChar<'{'>(), docComment, statementSequence, p::exactChar<'}'>(),
257 docComment),
258 [this](kj::Maybe<kj::Array<kj::String>>&& comment,
259 kj::Array<Orphan<Statement>>&& statements,
260 kj::Maybe<kj::Array<kj::String>>&& lateComment)
261 -> Orphan<Statement> {
262 auto result = orphanage.newOrphan<Statement>();
263 auto builder = result.get();
264 KJ_IF_MAYBE(c, comment) {
265 attachDocComment(builder, kj::mv(*c));
266 } else KJ_IF_MAYBE(c, lateComment) {
267 attachDocComment(builder, kj::mv(*c));
268 }
269 auto list = builder.initBlock(statements.size());
270 for (uint i = 0; i < statements.size(); i++) {
271 list.adoptWithCaveats(i, kj::mv(statements[i]));
272 }
273 return result;
274 })
275 ));
276
277 auto& statement = arena.copy(p::transformWithLocation(p::sequence(tokenSequence, statementEnd),
278 [](Location loc, kj::Array<Orphan<Token>>&& tokens, Orphan<Statement>&& statement) {
279 auto builder = statement.get();
280 auto tokensBuilder = builder.initTokens(tokens.size());
281 for (uint i = 0; i < tokens.size(); i++) {
282 tokensBuilder.adoptWithCaveats(i, kj::mv(tokens[i]));
283 }
284 builder.setStartByte(loc.begin());
285 builder.setEndByte(loc.end());
286 return kj::mv(statement);
287 }));
288
289 parsers.statementSequence = arena.copy(sequence(
290 commentsAndWhitespace, many(sequence(statement, commentsAndWhitespace))));
291
292 parsers.token = token;
293 parsers.statement = statement;
294 parsers.emptySpace = commentsAndWhitespace;
295}
296
297Lexer::~Lexer() noexcept(false) {}
298
299} // namespace compiler
300} // namespace capnp
301