1 | // Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors |
2 | // Licensed under the MIT License: |
3 | // |
4 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
5 | // of this software and associated documentation files (the "Software"), to deal |
6 | // in the Software without restriction, including without limitation the rights |
7 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
8 | // copies of the Software, and to permit persons to whom the Software is |
9 | // furnished to do so, subject to the following conditions: |
10 | // |
11 | // The above copyright notice and this permission notice shall be included in |
12 | // all copies or substantial portions of the Software. |
13 | // |
14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
17 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
18 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
19 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
20 | // THE SOFTWARE. |
21 | |
22 | #include "lexer.h" |
23 | #include <kj/parse/char.h> |
24 | #include <kj/debug.h> |
25 | |
26 | namespace capnp { |
27 | namespace compiler { |
28 | |
29 | namespace p = kj::parse; |
30 | |
31 | bool lex(kj::ArrayPtr<const char> input, LexedStatements::Builder result, |
32 | ErrorReporter& errorReporter) { |
33 | Lexer lexer(Orphanage::getForMessageContaining(result), errorReporter); |
34 | |
35 | auto parser = p::sequence(lexer.getParsers().statementSequence, p::endOfInput); |
36 | |
37 | Lexer::ParserInput parserInput(input.begin(), input.end()); |
38 | kj::Maybe<kj::Array<Orphan<Statement>>> parseOutput = parser(parserInput); |
39 | |
40 | KJ_IF_MAYBE(output, parseOutput) { |
41 | auto l = result.initStatements(output->size()); |
42 | for (uint i = 0; i < output->size(); i++) { |
43 | l.adoptWithCaveats(i, kj::mv((*output)[i])); |
44 | } |
45 | return true; |
46 | } else { |
47 | uint32_t best = parserInput.getBest(); |
48 | errorReporter.addError(best, best, kj::str("Parse error." )); |
49 | return false; |
50 | } |
51 | } |
52 | |
53 | bool lex(kj::ArrayPtr<const char> input, LexedTokens::Builder result, |
54 | ErrorReporter& errorReporter) { |
55 | Lexer lexer(Orphanage::getForMessageContaining(result), errorReporter); |
56 | |
57 | auto parser = p::sequence(lexer.getParsers().tokenSequence, p::endOfInput); |
58 | |
59 | Lexer::ParserInput parserInput(input.begin(), input.end()); |
60 | kj::Maybe<kj::Array<Orphan<Token>>> parseOutput = parser(parserInput); |
61 | |
62 | KJ_IF_MAYBE(output, parseOutput) { |
63 | auto l = result.initTokens(output->size()); |
64 | for (uint i = 0; i < output->size(); i++) { |
65 | l.adoptWithCaveats(i, kj::mv((*output)[i])); |
66 | } |
67 | return true; |
68 | } else { |
69 | uint32_t best = parserInput.getBest(); |
70 | errorReporter.addError(best, best, kj::str("Parse error." )); |
71 | return false; |
72 | } |
73 | } |
74 | |
75 | namespace { |
76 | |
77 | typedef p::Span<uint32_t> Location; |
78 | |
79 | Token::Builder initTok(Orphan<Token>& t, const Location& loc) { |
80 | auto builder = t.get(); |
81 | builder.setStartByte(loc.begin()); |
82 | builder.setEndByte(loc.end()); |
83 | return builder; |
84 | } |
85 | |
86 | void buildTokenSequenceList(List<List<Token>>::Builder builder, |
87 | kj::Array<kj::Array<Orphan<Token>>>&& items) { |
88 | for (uint i = 0; i < items.size(); i++) { |
89 | auto& item = items[i]; |
90 | auto itemBuilder = builder.init(i, item.size()); |
91 | for (uint j = 0; j < item.size(); j++) { |
92 | itemBuilder.adoptWithCaveats(j, kj::mv(item[j])); |
93 | } |
94 | } |
95 | } |
96 | |
97 | void (Statement::Builder statement, kj::Array<kj::String>&& ) { |
98 | size_t size = 0; |
99 | for (auto& line: comment) { |
100 | size += line.size() + 1; // include newline |
101 | } |
102 | Text::Builder builder = statement.initDocComment(size); |
103 | char* pos = builder.begin(); |
104 | for (auto& line: comment) { |
105 | memcpy(pos, line.begin(), line.size()); |
106 | pos += line.size(); |
107 | *pos++ = '\n'; |
108 | } |
109 | KJ_ASSERT(pos == builder.end()); |
110 | } |
111 | |
112 | constexpr auto = |
113 | sequence(p::exactChar<'#'>(), p::discard(p::many(p::discard(p::anyOfChars("\n" ).invert()))), |
114 | p::oneOf(p::exactChar<'\n'>(), p::endOfInput)); |
115 | constexpr auto = |
116 | sequence(p::exactChar<'#'>(), p::discard(p::optional(p::exactChar<' '>())), |
117 | p::charsToString(p::many(p::anyOfChars("\n" ).invert())), |
118 | p::oneOf(p::exactChar<'\n'>(), p::endOfInput)); |
119 | |
120 | constexpr auto utf8Bom = |
121 | sequence(p::exactChar<'\xef'>(), p::exactChar<'\xbb'>(), p::exactChar<'\xbf'>()); |
122 | |
123 | constexpr auto bomsAndWhitespace = |
124 | sequence(p::discardWhitespace, |
125 | p::discard(p::many(sequence(utf8Bom, p::discardWhitespace)))); |
126 | |
127 | constexpr auto commentsAndWhitespace = |
128 | sequence(bomsAndWhitespace, |
129 | p::discard(p::many(sequence(discardComment, bomsAndWhitespace)))); |
130 | |
131 | constexpr auto discardLineWhitespace = |
132 | p::discard(p::many(p::discard(p::whitespaceChar.invert().orAny("\r\n" ).invert()))); |
133 | constexpr auto newline = p::oneOf( |
134 | p::exactChar<'\n'>(), |
135 | sequence(p::exactChar<'\r'>(), p::discard(p::optional(p::exactChar<'\n'>())))); |
136 | |
137 | constexpr auto = p::optional(p::sequence( |
138 | discardLineWhitespace, |
139 | p::discard(p::optional(newline)), |
140 | p::oneOrMore(p::sequence(discardLineWhitespace, saveComment)))); |
141 | // Parses a set of comment lines preceded by at most one newline and with no intervening blank |
142 | // lines. |
143 | |
144 | } // namespace |
145 | |
146 | Lexer::Lexer(Orphanage orphanageParam, ErrorReporter& errorReporter) |
147 | : orphanage(orphanageParam) { |
148 | |
149 | // Note that because passing an lvalue to a parser constructor uses it by-referencee, it's safe |
150 | // for us to use parsers.tokenSequence even though we haven't yet constructed it. |
151 | auto& tokenSequence = parsers.tokenSequence; |
152 | |
153 | auto& commaDelimitedList = arena.copy(p::transform( |
154 | p::sequence(tokenSequence, p::many(p::sequence(p::exactChar<','>(), tokenSequence))), |
155 | [](kj::Array<Orphan<Token>>&& first, kj::Array<kj::Array<Orphan<Token>>>&& rest) |
156 | -> kj::Array<kj::Array<Orphan<Token>>> { |
157 | if (first == nullptr && rest == nullptr) { |
158 | // Completely empty list. |
159 | return nullptr; |
160 | } else { |
161 | uint restSize = rest.size(); |
162 | if (restSize > 0 && rest[restSize - 1] == nullptr) { |
163 | // Allow for trailing commas by shortening the list by one item if the final token is |
164 | // nullptr |
165 | restSize--; |
166 | } |
167 | auto result = kj::heapArrayBuilder<kj::Array<Orphan<Token>>>(1 + restSize); // first+rest |
168 | result.add(kj::mv(first)); |
169 | for (uint i = 0; i < restSize ; i++) { |
170 | result.add(kj::mv(rest[i])); |
171 | } |
172 | return result.finish(); |
173 | } |
174 | })); |
175 | |
176 | auto& token = arena.copy(p::oneOf( |
177 | p::transformWithLocation(p::identifier, |
178 | [this](Location loc, kj::String name) -> Orphan<Token> { |
179 | auto t = orphanage.newOrphan<Token>(); |
180 | initTok(t, loc).setIdentifier(name); |
181 | return t; |
182 | }), |
183 | p::transformWithLocation(p::doubleQuotedString, |
184 | [this](Location loc, kj::String text) -> Orphan<Token> { |
185 | auto t = orphanage.newOrphan<Token>(); |
186 | initTok(t, loc).setStringLiteral(text); |
187 | return t; |
188 | }), |
189 | p::transformWithLocation(p::doubleQuotedHexBinary, |
190 | [this](Location loc, kj::Array<byte> data) -> Orphan<Token> { |
191 | auto t = orphanage.newOrphan<Token>(); |
192 | initTok(t, loc).setBinaryLiteral(data); |
193 | return t; |
194 | }), |
195 | p::transformWithLocation(p::integer, |
196 | [this](Location loc, uint64_t i) -> Orphan<Token> { |
197 | auto t = orphanage.newOrphan<Token>(); |
198 | initTok(t, loc).setIntegerLiteral(i); |
199 | return t; |
200 | }), |
201 | p::transformWithLocation(p::number, |
202 | [this](Location loc, double x) -> Orphan<Token> { |
203 | auto t = orphanage.newOrphan<Token>(); |
204 | initTok(t, loc).setFloatLiteral(x); |
205 | return t; |
206 | }), |
207 | p::transformWithLocation( |
208 | p::charsToString(p::oneOrMore(p::anyOfChars("!$%&*+-./:<=>?@^|~" ))), |
209 | [this](Location loc, kj::String text) -> Orphan<Token> { |
210 | auto t = orphanage.newOrphan<Token>(); |
211 | initTok(t, loc).setOperator(text); |
212 | return t; |
213 | }), |
214 | p::transformWithLocation( |
215 | sequence(p::exactChar<'('>(), commaDelimitedList, p::exactChar<')'>()), |
216 | [this](Location loc, kj::Array<kj::Array<Orphan<Token>>>&& items) -> Orphan<Token> { |
217 | auto t = orphanage.newOrphan<Token>(); |
218 | buildTokenSequenceList( |
219 | initTok(t, loc).initParenthesizedList(items.size()), kj::mv(items)); |
220 | return t; |
221 | }), |
222 | p::transformWithLocation( |
223 | sequence(p::exactChar<'['>(), commaDelimitedList, p::exactChar<']'>()), |
224 | [this](Location loc, kj::Array<kj::Array<Orphan<Token>>>&& items) -> Orphan<Token> { |
225 | auto t = orphanage.newOrphan<Token>(); |
226 | buildTokenSequenceList( |
227 | initTok(t, loc).initBracketedList(items.size()), kj::mv(items)); |
228 | return t; |
229 | }), |
230 | p::transformOrReject(p::transformWithLocation( |
231 | p::oneOf(sequence(p::exactChar<'\xff'>(), p::exactChar<'\xfe'>()), |
232 | sequence(p::exactChar<'\xfe'>(), p::exactChar<'\xff'>()), |
233 | sequence(p::exactChar<'\x00'>())), |
234 | [&errorReporter](Location loc) -> kj::Maybe<Orphan<Token>> { |
235 | errorReporter.addError(loc.begin(), loc.end(), |
236 | "Non-UTF-8 input detected. Cap'n Proto schema files must be UTF-8 text." ); |
237 | return nullptr; |
238 | }), [](kj::Maybe<Orphan<Token>> param) { return param; }))); |
239 | parsers.tokenSequence = arena.copy(p::sequence( |
240 | commentsAndWhitespace, p::many(p::sequence(token, commentsAndWhitespace)))); |
241 | |
242 | auto& statementSequence = parsers.statementSequence; |
243 | |
244 | auto& statementEnd = arena.copy(p::oneOf( |
245 | transform(p::sequence(p::exactChar<';'>(), docComment), |
246 | [this](kj::Maybe<kj::Array<kj::String>>&& ) -> Orphan<Statement> { |
247 | auto result = orphanage.newOrphan<Statement>(); |
248 | auto builder = result.get(); |
249 | KJ_IF_MAYBE(c, comment) { |
250 | attachDocComment(builder, kj::mv(*c)); |
251 | } |
252 | builder.setLine(); |
253 | return result; |
254 | }), |
255 | transform( |
256 | p::sequence(p::exactChar<'{'>(), docComment, statementSequence, p::exactChar<'}'>(), |
257 | docComment), |
258 | [this](kj::Maybe<kj::Array<kj::String>>&& , |
259 | kj::Array<Orphan<Statement>>&& statements, |
260 | kj::Maybe<kj::Array<kj::String>>&& ) |
261 | -> Orphan<Statement> { |
262 | auto result = orphanage.newOrphan<Statement>(); |
263 | auto builder = result.get(); |
264 | KJ_IF_MAYBE(c, comment) { |
265 | attachDocComment(builder, kj::mv(*c)); |
266 | } else KJ_IF_MAYBE(c, lateComment) { |
267 | attachDocComment(builder, kj::mv(*c)); |
268 | } |
269 | auto list = builder.initBlock(statements.size()); |
270 | for (uint i = 0; i < statements.size(); i++) { |
271 | list.adoptWithCaveats(i, kj::mv(statements[i])); |
272 | } |
273 | return result; |
274 | }) |
275 | )); |
276 | |
277 | auto& statement = arena.copy(p::transformWithLocation(p::sequence(tokenSequence, statementEnd), |
278 | [](Location loc, kj::Array<Orphan<Token>>&& tokens, Orphan<Statement>&& statement) { |
279 | auto builder = statement.get(); |
280 | auto tokensBuilder = builder.initTokens(tokens.size()); |
281 | for (uint i = 0; i < tokens.size(); i++) { |
282 | tokensBuilder.adoptWithCaveats(i, kj::mv(tokens[i])); |
283 | } |
284 | builder.setStartByte(loc.begin()); |
285 | builder.setEndByte(loc.end()); |
286 | return kj::mv(statement); |
287 | })); |
288 | |
289 | parsers.statementSequence = arena.copy(sequence( |
290 | commentsAndWhitespace, many(sequence(statement, commentsAndWhitespace)))); |
291 | |
292 | parsers.token = token; |
293 | parsers.statement = statement; |
294 | parsers.emptySpace = commentsAndWhitespace; |
295 | } |
296 | |
297 | Lexer::~Lexer() noexcept(false) {} |
298 | |
299 | } // namespace compiler |
300 | } // namespace capnp |
301 | |