| 1 | // Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors |
| 2 | // Licensed under the MIT License: |
| 3 | // |
| 4 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
| 5 | // of this software and associated documentation files (the "Software"), to deal |
| 6 | // in the Software without restriction, including without limitation the rights |
| 7 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 8 | // copies of the Software, and to permit persons to whom the Software is |
| 9 | // furnished to do so, subject to the following conditions: |
| 10 | // |
| 11 | // The above copyright notice and this permission notice shall be included in |
| 12 | // all copies or substantial portions of the Software. |
| 13 | // |
| 14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 15 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 16 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 17 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 18 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 19 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 20 | // THE SOFTWARE. |
| 21 | |
| 22 | // This file contains parsers useful for character stream inputs, including parsers to parse |
| 23 | // common kinds of tokens like identifiers, numbers, and quoted strings. |
| 24 | |
| 25 | #pragma once |
| 26 | |
| 27 | #if defined(__GNUC__) && !KJ_HEADER_WARNINGS |
| 28 | #pragma GCC system_header |
| 29 | #endif |
| 30 | |
| 31 | #include "common.h" |
| 32 | #include "../string.h" |
| 33 | #include <inttypes.h> |
| 34 | |
| 35 | namespace kj { |
| 36 | namespace parse { |
| 37 | |
| 38 | // ======================================================================================= |
| 39 | // Exact char/string. |
| 40 | |
| 41 | class ExactString_ { |
| 42 | public: |
| 43 | constexpr inline ExactString_(const char* str): str(str) {} |
| 44 | |
| 45 | template <typename Input> |
| 46 | Maybe<Tuple<>> operator()(Input& input) const { |
| 47 | const char* ptr = str; |
| 48 | |
| 49 | while (*ptr != '\0') { |
| 50 | if (input.atEnd() || input.current() != *ptr) return nullptr; |
| 51 | input.next(); |
| 52 | ++ptr; |
| 53 | } |
| 54 | |
| 55 | return Tuple<>(); |
| 56 | } |
| 57 | |
| 58 | private: |
| 59 | const char* str; |
| 60 | }; |
| 61 | |
| 62 | constexpr inline ExactString_ exactString(const char* str) { |
| 63 | return ExactString_(str); |
| 64 | } |
| 65 | |
| 66 | template <char c> |
| 67 | constexpr ExactlyConst_<char, c> exactChar() { |
| 68 | // Returns a parser that matches exactly the character given by the template argument (returning |
| 69 | // no result). |
| 70 | return ExactlyConst_<char, c>(); |
| 71 | } |
| 72 | |
| 73 | // ======================================================================================= |
| 74 | // Char ranges / sets |
| 75 | |
| 76 | class CharGroup_ { |
| 77 | public: |
| 78 | constexpr inline CharGroup_(): bits{0, 0, 0, 0} {} |
| 79 | |
| 80 | constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const { |
| 81 | return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )), |
| 82 | bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)), |
| 83 | bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)), |
| 84 | bits[3] | (oneBits(last - 191) & ~oneBits(first - 192))); |
| 85 | } |
| 86 | |
| 87 | constexpr inline CharGroup_ orAny(const char* chars) const { |
| 88 | return *chars == 0 ? *this : orChar(*chars).orAny(chars + 1); |
| 89 | } |
| 90 | |
| 91 | constexpr inline CharGroup_ orChar(unsigned char c) const { |
| 92 | return CharGroup_(bits[0] | bit(c), |
| 93 | bits[1] | bit(c - 64), |
| 94 | bits[2] | bit(c - 128), |
| 95 | bits[3] | bit(c - 256)); |
| 96 | } |
| 97 | |
| 98 | constexpr inline CharGroup_ orGroup(CharGroup_ other) const { |
| 99 | return CharGroup_(bits[0] | other.bits[0], |
| 100 | bits[1] | other.bits[1], |
| 101 | bits[2] | other.bits[2], |
| 102 | bits[3] | other.bits[3]); |
| 103 | } |
| 104 | |
| 105 | constexpr inline CharGroup_ invert() const { |
| 106 | return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]); |
| 107 | } |
| 108 | |
| 109 | constexpr inline bool contains(unsigned char c) const { |
| 110 | return (bits[c / 64] & (1ll << (c % 64))) != 0; |
| 111 | } |
| 112 | |
| 113 | inline bool containsAll(ArrayPtr<const char> text) const { |
| 114 | for (char c: text) { |
| 115 | if (!contains(c)) return false; |
| 116 | } |
| 117 | return true; |
| 118 | } |
| 119 | |
| 120 | template <typename Input> |
| 121 | Maybe<char> operator()(Input& input) const { |
| 122 | if (input.atEnd()) return nullptr; |
| 123 | unsigned char c = input.current(); |
| 124 | if (contains(c)) { |
| 125 | input.next(); |
| 126 | return c; |
| 127 | } else { |
| 128 | return nullptr; |
| 129 | } |
| 130 | } |
| 131 | |
| 132 | private: |
| 133 | typedef unsigned long long Bits64; |
| 134 | |
| 135 | constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {} |
| 136 | Bits64 bits[4]; |
| 137 | |
| 138 | static constexpr inline Bits64 oneBits(int count) { |
| 139 | return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1); |
| 140 | } |
| 141 | static constexpr inline Bits64 bit(int index) { |
| 142 | return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index); |
| 143 | } |
| 144 | }; |
| 145 | |
| 146 | constexpr inline CharGroup_ charRange(char first, char last) { |
| 147 | // Create a parser which accepts any character in the range from `first` to `last`, inclusive. |
| 148 | // For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the |
| 149 | // character matched. |
| 150 | // |
| 151 | // The returned object has methods which can be used to match more characters. The following |
| 152 | // produces a parser which accepts any letter as well as '_', '+', '-', and '.'. |
| 153 | // |
| 154 | // charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.") |
| 155 | // |
| 156 | // You can also use `.invert()` to match the opposite set of characters. |
| 157 | |
| 158 | return CharGroup_().orRange(first, last); |
| 159 | } |
| 160 | |
| 161 | #if _MSC_VER |
| 162 | #define anyOfChars(chars) CharGroup_().orAny(chars) |
| 163 | // TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from |
| 164 | // building the compiler or schema parser. We don't know why this happens, but Harris found that |
| 165 | // this horrible, horrible hack makes things work. This is awful, but it's better than nothing. |
| 166 | // Hopefully, MSVC will get fixed soon and we'll be able to remove this. |
| 167 | #else |
| 168 | constexpr inline CharGroup_ anyOfChars(const char* chars) { |
| 169 | // Returns a parser that accepts any of the characters in the given string (which should usually |
| 170 | // be a literal). The returned parser is of the same type as returned by `charRange()` -- see |
| 171 | // that function for more info. |
| 172 | |
| 173 | return CharGroup_().orAny(chars); |
| 174 | } |
| 175 | #endif |
| 176 | |
| 177 | // ======================================================================================= |
| 178 | |
| 179 | namespace _ { // private |
| 180 | |
| 181 | struct ArrayToString { |
| 182 | inline String operator()(const Array<char>& arr) const { |
| 183 | return heapString(arr); |
| 184 | } |
| 185 | }; |
| 186 | |
| 187 | } // namespace _ (private) |
| 188 | |
| 189 | template <typename SubParser> |
| 190 | constexpr inline auto charsToString(SubParser&& subParser) |
| 191 | -> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) { |
| 192 | // Wraps a parser that returns Array<char> such that it returns String instead. |
| 193 | return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString()); |
| 194 | } |
| 195 | |
| 196 | // ======================================================================================= |
| 197 | // Basic character classes. |
| 198 | |
| 199 | constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z'); |
| 200 | constexpr auto digit = charRange('0', '9'); |
| 201 | constexpr auto alphaNumeric = alpha.orGroup(digit); |
| 202 | constexpr auto nameStart = alpha.orChar('_'); |
| 203 | constexpr auto nameChar = alphaNumeric.orChar('_'); |
| 204 | constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F'); |
| 205 | constexpr auto octDigit = charRange('0', '7'); |
| 206 | constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v" ); |
| 207 | constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert(); |
| 208 | |
| 209 | constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v" )); |
| 210 | |
| 211 | constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v" )))); |
| 212 | // Like discard(whitespace) but avoids some memory allocation. |
| 213 | |
| 214 | // ======================================================================================= |
| 215 | // Identifiers |
| 216 | |
| 217 | namespace _ { // private |
| 218 | |
| 219 | struct IdentifierToString { |
| 220 | inline String operator()(char first, const Array<char>& rest) const { |
| 221 | if (rest.size() == 0) return heapString(&first, 1); |
| 222 | String result = heapString(rest.size() + 1); |
| 223 | result[0] = first; |
| 224 | memcpy(result.begin() + 1, rest.begin(), rest.size()); |
| 225 | return result; |
| 226 | } |
| 227 | }; |
| 228 | |
| 229 | } // namespace _ (private) |
| 230 | |
| 231 | constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString()); |
| 232 | // Parses an identifier (e.g. a C variable name). |
| 233 | |
| 234 | // ======================================================================================= |
| 235 | // Integers |
| 236 | |
| 237 | namespace _ { // private |
| 238 | |
| 239 | inline char parseDigit(char c) { |
| 240 | if (c < 'A') return c - '0'; |
| 241 | if (c < 'a') return c - 'A' + 10; |
| 242 | return c - 'a' + 10; |
| 243 | } |
| 244 | |
| 245 | template <uint base> |
| 246 | struct ParseInteger { |
| 247 | inline uint64_t operator()(const Array<char>& digits) const { |
| 248 | return operator()('0', digits); |
| 249 | } |
| 250 | uint64_t operator()(char first, const Array<char>& digits) const { |
| 251 | uint64_t result = parseDigit(first); |
| 252 | for (char digit: digits) { |
| 253 | result = result * base + parseDigit(digit); |
| 254 | } |
| 255 | return result; |
| 256 | } |
| 257 | }; |
| 258 | |
| 259 | |
| 260 | } // namespace _ (private) |
| 261 | |
| 262 | constexpr auto integer = sequence( |
| 263 | oneOf( |
| 264 | transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()), |
| 265 | transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()), |
| 266 | transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())), |
| 267 | notLookingAt(alpha.orAny("_." ))); |
| 268 | |
| 269 | // ======================================================================================= |
| 270 | // Numbers (i.e. floats) |
| 271 | |
| 272 | namespace _ { // private |
| 273 | |
| 274 | struct ParseFloat { |
| 275 | double operator()(const Array<char>& digits, |
| 276 | const Maybe<Array<char>>& fraction, |
| 277 | const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const; |
| 278 | }; |
| 279 | |
| 280 | } // namespace _ (private) |
| 281 | |
| 282 | constexpr auto number = transform( |
| 283 | sequence( |
| 284 | oneOrMore(digit), |
| 285 | optional(sequence(exactChar<'.'>(), many(digit))), |
| 286 | optional(sequence(discard(anyOfChars("eE" )), optional(anyOfChars("+-" )), many(digit))), |
| 287 | notLookingAt(alpha.orAny("_." ))), |
| 288 | _::ParseFloat()); |
| 289 | |
| 290 | // ======================================================================================= |
| 291 | // Quoted strings |
| 292 | |
| 293 | namespace _ { // private |
| 294 | |
| 295 | struct InterpretEscape { |
| 296 | char operator()(char c) const { |
| 297 | switch (c) { |
| 298 | case 'a': return '\a'; |
| 299 | case 'b': return '\b'; |
| 300 | case 'f': return '\f'; |
| 301 | case 'n': return '\n'; |
| 302 | case 'r': return '\r'; |
| 303 | case 't': return '\t'; |
| 304 | case 'v': return '\v'; |
| 305 | default: return c; |
| 306 | } |
| 307 | } |
| 308 | }; |
| 309 | |
| 310 | struct ParseHexEscape { |
| 311 | inline char operator()(char first, char second) const { |
| 312 | return (parseDigit(first) << 4) | parseDigit(second); |
| 313 | } |
| 314 | }; |
| 315 | |
| 316 | struct ParseHexByte { |
| 317 | inline byte operator()(char first, char second) const { |
| 318 | return (parseDigit(first) << 4) | parseDigit(second); |
| 319 | } |
| 320 | }; |
| 321 | |
| 322 | struct ParseOctEscape { |
| 323 | inline char operator()(char first, Maybe<char> second, Maybe<char> third) const { |
| 324 | char result = first - '0'; |
| 325 | KJ_IF_MAYBE(digit1, second) { |
| 326 | result = (result << 3) | (*digit1 - '0'); |
| 327 | KJ_IF_MAYBE(digit2, third) { |
| 328 | result = (result << 3) | (*digit2 - '0'); |
| 329 | } |
| 330 | } |
| 331 | return result; |
| 332 | } |
| 333 | }; |
| 334 | |
| 335 | } // namespace _ (private) |
| 336 | |
| 337 | constexpr auto escapeSequence = |
| 338 | sequence(exactChar<'\\'>(), oneOf( |
| 339 | transform(anyOfChars("abfnrtv'\"\\\?" ), _::InterpretEscape()), |
| 340 | transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()), |
| 341 | transform(sequence(octDigit, optional(octDigit), optional(octDigit)), |
| 342 | _::ParseOctEscape()))); |
| 343 | // A parser that parses a C-string-style escape sequence (starting with a backslash). Returns |
| 344 | // a char. |
| 345 | |
| 346 | constexpr auto doubleQuotedString = charsToString(sequence( |
| 347 | exactChar<'\"'>(), |
| 348 | many(oneOf(anyOfChars("\\\n\"" ).invert(), escapeSequence)), |
| 349 | exactChar<'\"'>())); |
| 350 | // Parses a C-style double-quoted string. |
| 351 | |
| 352 | constexpr auto singleQuotedString = charsToString(sequence( |
| 353 | exactChar<'\''>(), |
| 354 | many(oneOf(anyOfChars("\\\n\'" ).invert(), escapeSequence)), |
| 355 | exactChar<'\''>())); |
| 356 | // Parses a C-style single-quoted string. |
| 357 | |
| 358 | constexpr auto doubleQuotedHexBinary = sequence( |
| 359 | exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(), |
| 360 | oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())), |
| 361 | discardWhitespace, |
| 362 | exactChar<'\"'>()); |
| 363 | // Parses a double-quoted hex binary literal. Returns Array<byte>. |
| 364 | |
| 365 | } // namespace parse |
| 366 | } // namespace kj |
| 367 | |