| 1 | // Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors | 
|---|
| 2 | // Licensed under the MIT License: | 
|---|
| 3 | // | 
|---|
| 4 | // Permission is hereby granted, free of charge, to any person obtaining a copy | 
|---|
| 5 | // of this software and associated documentation files (the "Software"), to deal | 
|---|
| 6 | // in the Software without restriction, including without limitation the rights | 
|---|
| 7 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | 
|---|
| 8 | // copies of the Software, and to permit persons to whom the Software is | 
|---|
| 9 | // furnished to do so, subject to the following conditions: | 
|---|
| 10 | // | 
|---|
| 11 | // The above copyright notice and this permission notice shall be included in | 
|---|
| 12 | // all copies or substantial portions of the Software. | 
|---|
| 13 | // | 
|---|
| 14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | 
|---|
| 15 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | 
|---|
| 16 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | 
|---|
| 17 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | 
|---|
| 18 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | 
|---|
| 19 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | 
|---|
| 20 | // THE SOFTWARE. | 
|---|
| 21 |  | 
|---|
| 22 | #include "lexer.h" | 
|---|
| 23 | #include <kj/parse/char.h> | 
|---|
| 24 | #include <kj/debug.h> | 
|---|
| 25 |  | 
|---|
| 26 | namespace capnp { | 
|---|
| 27 | namespace compiler { | 
|---|
| 28 |  | 
|---|
| 29 | namespace p = kj::parse; | 
|---|
| 30 |  | 
|---|
| 31 | bool lex(kj::ArrayPtr<const char> input, LexedStatements::Builder result, | 
|---|
| 32 | ErrorReporter& errorReporter) { | 
|---|
| 33 | Lexer lexer(Orphanage::getForMessageContaining(result), errorReporter); | 
|---|
| 34 |  | 
|---|
| 35 | auto parser = p::sequence(lexer.getParsers().statementSequence, p::endOfInput); | 
|---|
| 36 |  | 
|---|
| 37 | Lexer::ParserInput parserInput(input.begin(), input.end()); | 
|---|
| 38 | kj::Maybe<kj::Array<Orphan<Statement>>> parseOutput = parser(parserInput); | 
|---|
| 39 |  | 
|---|
| 40 | KJ_IF_MAYBE(output, parseOutput) { | 
|---|
| 41 | auto l = result.initStatements(output->size()); | 
|---|
| 42 | for (uint i = 0; i < output->size(); i++) { | 
|---|
| 43 | l.adoptWithCaveats(i, kj::mv((*output)[i])); | 
|---|
| 44 | } | 
|---|
| 45 | return true; | 
|---|
| 46 | } else { | 
|---|
| 47 | uint32_t best = parserInput.getBest(); | 
|---|
| 48 | errorReporter.addError(best, best, kj::str( "Parse error.")); | 
|---|
| 49 | return false; | 
|---|
| 50 | } | 
|---|
| 51 | } | 
|---|
| 52 |  | 
|---|
| 53 | bool lex(kj::ArrayPtr<const char> input, LexedTokens::Builder result, | 
|---|
| 54 | ErrorReporter& errorReporter) { | 
|---|
| 55 | Lexer lexer(Orphanage::getForMessageContaining(result), errorReporter); | 
|---|
| 56 |  | 
|---|
| 57 | auto parser = p::sequence(lexer.getParsers().tokenSequence, p::endOfInput); | 
|---|
| 58 |  | 
|---|
| 59 | Lexer::ParserInput parserInput(input.begin(), input.end()); | 
|---|
| 60 | kj::Maybe<kj::Array<Orphan<Token>>> parseOutput = parser(parserInput); | 
|---|
| 61 |  | 
|---|
| 62 | KJ_IF_MAYBE(output, parseOutput) { | 
|---|
| 63 | auto l = result.initTokens(output->size()); | 
|---|
| 64 | for (uint i = 0; i < output->size(); i++) { | 
|---|
| 65 | l.adoptWithCaveats(i, kj::mv((*output)[i])); | 
|---|
| 66 | } | 
|---|
| 67 | return true; | 
|---|
| 68 | } else { | 
|---|
| 69 | uint32_t best = parserInput.getBest(); | 
|---|
| 70 | errorReporter.addError(best, best, kj::str( "Parse error.")); | 
|---|
| 71 | return false; | 
|---|
| 72 | } | 
|---|
| 73 | } | 
|---|
| 74 |  | 
|---|
| 75 | namespace { | 
|---|
| 76 |  | 
|---|
| 77 | typedef p::Span<uint32_t> Location; | 
|---|
| 78 |  | 
|---|
| 79 | Token::Builder initTok(Orphan<Token>& t, const Location& loc) { | 
|---|
| 80 | auto builder = t.get(); | 
|---|
| 81 | builder.setStartByte(loc.begin()); | 
|---|
| 82 | builder.setEndByte(loc.end()); | 
|---|
| 83 | return builder; | 
|---|
| 84 | } | 
|---|
| 85 |  | 
|---|
| 86 | void buildTokenSequenceList(List<List<Token>>::Builder builder, | 
|---|
| 87 | kj::Array<kj::Array<Orphan<Token>>>&& items) { | 
|---|
| 88 | for (uint i = 0; i < items.size(); i++) { | 
|---|
| 89 | auto& item = items[i]; | 
|---|
| 90 | auto itemBuilder = builder.init(i, item.size()); | 
|---|
| 91 | for (uint j = 0; j < item.size(); j++) { | 
|---|
| 92 | itemBuilder.adoptWithCaveats(j, kj::mv(item[j])); | 
|---|
| 93 | } | 
|---|
| 94 | } | 
|---|
| 95 | } | 
|---|
| 96 |  | 
|---|
| 97 | void (Statement::Builder statement, kj::Array<kj::String>&& ) { | 
|---|
| 98 | size_t size = 0; | 
|---|
| 99 | for (auto& line: comment) { | 
|---|
| 100 | size += line.size() + 1;  // include newline | 
|---|
| 101 | } | 
|---|
| 102 | Text::Builder builder = statement.initDocComment(size); | 
|---|
| 103 | char* pos = builder.begin(); | 
|---|
| 104 | for (auto& line: comment) { | 
|---|
| 105 | memcpy(pos, line.begin(), line.size()); | 
|---|
| 106 | pos += line.size(); | 
|---|
| 107 | *pos++ = '\n'; | 
|---|
| 108 | } | 
|---|
| 109 | KJ_ASSERT(pos == builder.end()); | 
|---|
| 110 | } | 
|---|
| 111 |  | 
|---|
| 112 | constexpr auto  = | 
|---|
| 113 | sequence(p::exactChar<'#'>(), p::discard(p::many(p::discard(p::anyOfChars( "\n").invert()))), | 
|---|
| 114 | p::oneOf(p::exactChar<'\n'>(), p::endOfInput)); | 
|---|
| 115 | constexpr auto  = | 
|---|
| 116 | sequence(p::exactChar<'#'>(), p::discard(p::optional(p::exactChar<' '>())), | 
|---|
| 117 | p::charsToString(p::many(p::anyOfChars( "\n").invert())), | 
|---|
| 118 | p::oneOf(p::exactChar<'\n'>(), p::endOfInput)); | 
|---|
| 119 |  | 
|---|
| 120 | constexpr auto utf8Bom = | 
|---|
| 121 | sequence(p::exactChar<'\xef'>(), p::exactChar<'\xbb'>(), p::exactChar<'\xbf'>()); | 
|---|
| 122 |  | 
|---|
| 123 | constexpr auto bomsAndWhitespace = | 
|---|
| 124 | sequence(p::discardWhitespace, | 
|---|
| 125 | p::discard(p::many(sequence(utf8Bom, p::discardWhitespace)))); | 
|---|
| 126 |  | 
|---|
| 127 | constexpr auto commentsAndWhitespace = | 
|---|
| 128 | sequence(bomsAndWhitespace, | 
|---|
| 129 | p::discard(p::many(sequence(discardComment, bomsAndWhitespace)))); | 
|---|
| 130 |  | 
|---|
| 131 | constexpr auto discardLineWhitespace = | 
|---|
| 132 | p::discard(p::many(p::discard(p::whitespaceChar.invert().orAny( "\r\n").invert()))); | 
|---|
| 133 | constexpr auto newline = p::oneOf( | 
|---|
| 134 | p::exactChar<'\n'>(), | 
|---|
| 135 | sequence(p::exactChar<'\r'>(), p::discard(p::optional(p::exactChar<'\n'>())))); | 
|---|
| 136 |  | 
|---|
| 137 | constexpr auto  = p::optional(p::sequence( | 
|---|
| 138 | discardLineWhitespace, | 
|---|
| 139 | p::discard(p::optional(newline)), | 
|---|
| 140 | p::oneOrMore(p::sequence(discardLineWhitespace, saveComment)))); | 
|---|
| 141 | // Parses a set of comment lines preceded by at most one newline and with no intervening blank | 
|---|
| 142 | // lines. | 
|---|
| 143 |  | 
|---|
| 144 | }  // namespace | 
|---|
| 145 |  | 
|---|
| 146 | Lexer::Lexer(Orphanage orphanageParam, ErrorReporter& errorReporter) | 
|---|
| 147 | : orphanage(orphanageParam) { | 
|---|
| 148 |  | 
|---|
| 149 | // Note that because passing an lvalue to a parser constructor uses it by-referencee, it's safe | 
|---|
| 150 | // for us to use parsers.tokenSequence even though we haven't yet constructed it. | 
|---|
| 151 | auto& tokenSequence = parsers.tokenSequence; | 
|---|
| 152 |  | 
|---|
| 153 | auto& commaDelimitedList = arena.copy(p::transform( | 
|---|
| 154 | p::sequence(tokenSequence, p::many(p::sequence(p::exactChar<','>(), tokenSequence))), | 
|---|
| 155 | [](kj::Array<Orphan<Token>>&& first, kj::Array<kj::Array<Orphan<Token>>>&& rest) | 
|---|
| 156 | -> kj::Array<kj::Array<Orphan<Token>>> { | 
|---|
| 157 | if (first == nullptr && rest == nullptr) { | 
|---|
| 158 | // Completely empty list. | 
|---|
| 159 | return nullptr; | 
|---|
| 160 | } else { | 
|---|
| 161 | uint restSize = rest.size(); | 
|---|
| 162 | if (restSize > 0 && rest[restSize - 1] == nullptr) { | 
|---|
| 163 | // Allow for trailing commas by shortening the list by one item if the final token is | 
|---|
| 164 | // nullptr | 
|---|
| 165 | restSize--; | 
|---|
| 166 | } | 
|---|
| 167 | auto result = kj::heapArrayBuilder<kj::Array<Orphan<Token>>>(1 + restSize); // first+rest | 
|---|
| 168 | result.add(kj::mv(first)); | 
|---|
| 169 | for (uint i = 0; i < restSize ; i++) { | 
|---|
| 170 | result.add(kj::mv(rest[i])); | 
|---|
| 171 | } | 
|---|
| 172 | return result.finish(); | 
|---|
| 173 | } | 
|---|
| 174 | })); | 
|---|
| 175 |  | 
|---|
| 176 | auto& token = arena.copy(p::oneOf( | 
|---|
| 177 | p::transformWithLocation(p::identifier, | 
|---|
| 178 | [this](Location loc, kj::String name) -> Orphan<Token> { | 
|---|
| 179 | auto t = orphanage.newOrphan<Token>(); | 
|---|
| 180 | initTok(t, loc).setIdentifier(name); | 
|---|
| 181 | return t; | 
|---|
| 182 | }), | 
|---|
| 183 | p::transformWithLocation(p::doubleQuotedString, | 
|---|
| 184 | [this](Location loc, kj::String text) -> Orphan<Token> { | 
|---|
| 185 | auto t = orphanage.newOrphan<Token>(); | 
|---|
| 186 | initTok(t, loc).setStringLiteral(text); | 
|---|
| 187 | return t; | 
|---|
| 188 | }), | 
|---|
| 189 | p::transformWithLocation(p::doubleQuotedHexBinary, | 
|---|
| 190 | [this](Location loc, kj::Array<byte> data) -> Orphan<Token> { | 
|---|
| 191 | auto t = orphanage.newOrphan<Token>(); | 
|---|
| 192 | initTok(t, loc).setBinaryLiteral(data); | 
|---|
| 193 | return t; | 
|---|
| 194 | }), | 
|---|
| 195 | p::transformWithLocation(p::integer, | 
|---|
| 196 | [this](Location loc, uint64_t i) -> Orphan<Token> { | 
|---|
| 197 | auto t = orphanage.newOrphan<Token>(); | 
|---|
| 198 | initTok(t, loc).setIntegerLiteral(i); | 
|---|
| 199 | return t; | 
|---|
| 200 | }), | 
|---|
| 201 | p::transformWithLocation(p::number, | 
|---|
| 202 | [this](Location loc, double x) -> Orphan<Token> { | 
|---|
| 203 | auto t = orphanage.newOrphan<Token>(); | 
|---|
| 204 | initTok(t, loc).setFloatLiteral(x); | 
|---|
| 205 | return t; | 
|---|
| 206 | }), | 
|---|
| 207 | p::transformWithLocation( | 
|---|
| 208 | p::charsToString(p::oneOrMore(p::anyOfChars( "!$%&*+-./:<=>?@^|~"))), | 
|---|
| 209 | [this](Location loc, kj::String text) -> Orphan<Token> { | 
|---|
| 210 | auto t = orphanage.newOrphan<Token>(); | 
|---|
| 211 | initTok(t, loc).setOperator(text); | 
|---|
| 212 | return t; | 
|---|
| 213 | }), | 
|---|
| 214 | p::transformWithLocation( | 
|---|
| 215 | sequence(p::exactChar<'('>(), commaDelimitedList, p::exactChar<')'>()), | 
|---|
| 216 | [this](Location loc, kj::Array<kj::Array<Orphan<Token>>>&& items) -> Orphan<Token> { | 
|---|
| 217 | auto t = orphanage.newOrphan<Token>(); | 
|---|
| 218 | buildTokenSequenceList( | 
|---|
| 219 | initTok(t, loc).initParenthesizedList(items.size()), kj::mv(items)); | 
|---|
| 220 | return t; | 
|---|
| 221 | }), | 
|---|
| 222 | p::transformWithLocation( | 
|---|
| 223 | sequence(p::exactChar<'['>(), commaDelimitedList, p::exactChar<']'>()), | 
|---|
| 224 | [this](Location loc, kj::Array<kj::Array<Orphan<Token>>>&& items) -> Orphan<Token> { | 
|---|
| 225 | auto t = orphanage.newOrphan<Token>(); | 
|---|
| 226 | buildTokenSequenceList( | 
|---|
| 227 | initTok(t, loc).initBracketedList(items.size()), kj::mv(items)); | 
|---|
| 228 | return t; | 
|---|
| 229 | }), | 
|---|
| 230 | p::transformOrReject(p::transformWithLocation( | 
|---|
| 231 | p::oneOf(sequence(p::exactChar<'\xff'>(), p::exactChar<'\xfe'>()), | 
|---|
| 232 | sequence(p::exactChar<'\xfe'>(), p::exactChar<'\xff'>()), | 
|---|
| 233 | sequence(p::exactChar<'\x00'>())), | 
|---|
| 234 | [&errorReporter](Location loc) -> kj::Maybe<Orphan<Token>> { | 
|---|
| 235 | errorReporter.addError(loc.begin(), loc.end(), | 
|---|
| 236 | "Non-UTF-8 input detected. Cap'n Proto schema files must be UTF-8 text."); | 
|---|
| 237 | return nullptr; | 
|---|
| 238 | }), [](kj::Maybe<Orphan<Token>> param) { return param; }))); | 
|---|
| 239 | parsers.tokenSequence = arena.copy(p::sequence( | 
|---|
| 240 | commentsAndWhitespace, p::many(p::sequence(token, commentsAndWhitespace)))); | 
|---|
| 241 |  | 
|---|
| 242 | auto& statementSequence = parsers.statementSequence; | 
|---|
| 243 |  | 
|---|
| 244 | auto& statementEnd = arena.copy(p::oneOf( | 
|---|
| 245 | transform(p::sequence(p::exactChar<';'>(), docComment), | 
|---|
| 246 | [this](kj::Maybe<kj::Array<kj::String>>&& ) -> Orphan<Statement> { | 
|---|
| 247 | auto result = orphanage.newOrphan<Statement>(); | 
|---|
| 248 | auto builder = result.get(); | 
|---|
| 249 | KJ_IF_MAYBE(c, comment) { | 
|---|
| 250 | attachDocComment(builder, kj::mv(*c)); | 
|---|
| 251 | } | 
|---|
| 252 | builder.setLine(); | 
|---|
| 253 | return result; | 
|---|
| 254 | }), | 
|---|
| 255 | transform( | 
|---|
| 256 | p::sequence(p::exactChar<'{'>(), docComment, statementSequence, p::exactChar<'}'>(), | 
|---|
| 257 | docComment), | 
|---|
| 258 | [this](kj::Maybe<kj::Array<kj::String>>&& , | 
|---|
| 259 | kj::Array<Orphan<Statement>>&& statements, | 
|---|
| 260 | kj::Maybe<kj::Array<kj::String>>&& ) | 
|---|
| 261 | -> Orphan<Statement> { | 
|---|
| 262 | auto result = orphanage.newOrphan<Statement>(); | 
|---|
| 263 | auto builder = result.get(); | 
|---|
| 264 | KJ_IF_MAYBE(c, comment) { | 
|---|
| 265 | attachDocComment(builder, kj::mv(*c)); | 
|---|
| 266 | } else KJ_IF_MAYBE(c, lateComment) { | 
|---|
| 267 | attachDocComment(builder, kj::mv(*c)); | 
|---|
| 268 | } | 
|---|
| 269 | auto list = builder.initBlock(statements.size()); | 
|---|
| 270 | for (uint i = 0; i < statements.size(); i++) { | 
|---|
| 271 | list.adoptWithCaveats(i, kj::mv(statements[i])); | 
|---|
| 272 | } | 
|---|
| 273 | return result; | 
|---|
| 274 | }) | 
|---|
| 275 | )); | 
|---|
| 276 |  | 
|---|
| 277 | auto& statement = arena.copy(p::transformWithLocation(p::sequence(tokenSequence, statementEnd), | 
|---|
| 278 | [](Location loc, kj::Array<Orphan<Token>>&& tokens, Orphan<Statement>&& statement) { | 
|---|
| 279 | auto builder = statement.get(); | 
|---|
| 280 | auto tokensBuilder = builder.initTokens(tokens.size()); | 
|---|
| 281 | for (uint i = 0; i < tokens.size(); i++) { | 
|---|
| 282 | tokensBuilder.adoptWithCaveats(i, kj::mv(tokens[i])); | 
|---|
| 283 | } | 
|---|
| 284 | builder.setStartByte(loc.begin()); | 
|---|
| 285 | builder.setEndByte(loc.end()); | 
|---|
| 286 | return kj::mv(statement); | 
|---|
| 287 | })); | 
|---|
| 288 |  | 
|---|
| 289 | parsers.statementSequence = arena.copy(sequence( | 
|---|
| 290 | commentsAndWhitespace, many(sequence(statement, commentsAndWhitespace)))); | 
|---|
| 291 |  | 
|---|
| 292 | parsers.token = token; | 
|---|
| 293 | parsers.statement = statement; | 
|---|
| 294 | parsers.emptySpace = commentsAndWhitespace; | 
|---|
| 295 | } | 
|---|
| 296 |  | 
|---|
// Destructor with an empty body, defined out of line here. It is explicitly marked
// noexcept(false) instead of taking the implicit noexcept default -- presumably because
// destroying a member may throw; TODO confirm against the members declared in lexer.h.
Lexer::~Lexer() noexcept(false) {}
|---|
| 298 |  | 
|---|
| 299 | }  // namespace compiler | 
|---|
| 300 | }  // namespace capnp | 
|---|
| 301 |  | 
|---|