1 | // Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors |
2 | // Licensed under the MIT License: |
3 | // |
4 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
5 | // of this software and associated documentation files (the "Software"), to deal |
6 | // in the Software without restriction, including without limitation the rights |
7 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
8 | // copies of the Software, and to permit persons to whom the Software is |
9 | // furnished to do so, subject to the following conditions: |
10 | // |
11 | // The above copyright notice and this permission notice shall be included in |
12 | // all copies or substantial portions of the Software. |
13 | // |
14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
17 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
18 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
19 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
20 | // THE SOFTWARE. |
21 | |
22 | // This file contains parsers useful for character stream inputs, including parsers to parse |
23 | // common kinds of tokens like identifiers, numbers, and quoted strings. |
24 | |
25 | #pragma once |
26 | |
27 | #if defined(__GNUC__) && !KJ_HEADER_WARNINGS |
28 | #pragma GCC system_header |
29 | #endif |
30 | |
31 | #include "common.h" |
32 | #include "../string.h" |
33 | #include <inttypes.h> |
34 | |
35 | namespace kj { |
36 | namespace parse { |
37 | |
38 | // ======================================================================================= |
39 | // Exact char/string. |
40 | |
41 | class ExactString_ { |
42 | public: |
43 | constexpr inline ExactString_(const char* str): str(str) {} |
44 | |
45 | template <typename Input> |
46 | Maybe<Tuple<>> operator()(Input& input) const { |
47 | const char* ptr = str; |
48 | |
49 | while (*ptr != '\0') { |
50 | if (input.atEnd() || input.current() != *ptr) return nullptr; |
51 | input.next(); |
52 | ++ptr; |
53 | } |
54 | |
55 | return Tuple<>(); |
56 | } |
57 | |
58 | private: |
59 | const char* str; |
60 | }; |
61 | |
62 | constexpr inline ExactString_ exactString(const char* str) { |
63 | return ExactString_(str); |
64 | } |
65 | |
66 | template <char c> |
67 | constexpr ExactlyConst_<char, c> exactChar() { |
68 | // Returns a parser that matches exactly the character given by the template argument (returning |
69 | // no result). |
70 | return ExactlyConst_<char, c>(); |
71 | } |
72 | |
73 | // ======================================================================================= |
74 | // Char ranges / sets |
75 | |
76 | class CharGroup_ { |
77 | public: |
78 | constexpr inline CharGroup_(): bits{0, 0, 0, 0} {} |
79 | |
80 | constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const { |
81 | return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )), |
82 | bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)), |
83 | bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)), |
84 | bits[3] | (oneBits(last - 191) & ~oneBits(first - 192))); |
85 | } |
86 | |
87 | constexpr inline CharGroup_ orAny(const char* chars) const { |
88 | return *chars == 0 ? *this : orChar(*chars).orAny(chars + 1); |
89 | } |
90 | |
91 | constexpr inline CharGroup_ orChar(unsigned char c) const { |
92 | return CharGroup_(bits[0] | bit(c), |
93 | bits[1] | bit(c - 64), |
94 | bits[2] | bit(c - 128), |
95 | bits[3] | bit(c - 256)); |
96 | } |
97 | |
98 | constexpr inline CharGroup_ orGroup(CharGroup_ other) const { |
99 | return CharGroup_(bits[0] | other.bits[0], |
100 | bits[1] | other.bits[1], |
101 | bits[2] | other.bits[2], |
102 | bits[3] | other.bits[3]); |
103 | } |
104 | |
105 | constexpr inline CharGroup_ invert() const { |
106 | return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]); |
107 | } |
108 | |
109 | constexpr inline bool contains(unsigned char c) const { |
110 | return (bits[c / 64] & (1ll << (c % 64))) != 0; |
111 | } |
112 | |
113 | inline bool containsAll(ArrayPtr<const char> text) const { |
114 | for (char c: text) { |
115 | if (!contains(c)) return false; |
116 | } |
117 | return true; |
118 | } |
119 | |
120 | template <typename Input> |
121 | Maybe<char> operator()(Input& input) const { |
122 | if (input.atEnd()) return nullptr; |
123 | unsigned char c = input.current(); |
124 | if (contains(c)) { |
125 | input.next(); |
126 | return c; |
127 | } else { |
128 | return nullptr; |
129 | } |
130 | } |
131 | |
132 | private: |
133 | typedef unsigned long long Bits64; |
134 | |
135 | constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {} |
136 | Bits64 bits[4]; |
137 | |
138 | static constexpr inline Bits64 oneBits(int count) { |
139 | return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1); |
140 | } |
141 | static constexpr inline Bits64 bit(int index) { |
142 | return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index); |
143 | } |
144 | }; |
145 | |
146 | constexpr inline CharGroup_ charRange(char first, char last) { |
147 | // Create a parser which accepts any character in the range from `first` to `last`, inclusive. |
148 | // For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the |
149 | // character matched. |
150 | // |
151 | // The returned object has methods which can be used to match more characters. The following |
152 | // produces a parser which accepts any letter as well as '_', '+', '-', and '.'. |
153 | // |
154 | // charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.") |
155 | // |
156 | // You can also use `.invert()` to match the opposite set of characters. |
157 | |
158 | return CharGroup_().orRange(first, last); |
159 | } |
160 | |
161 | #if _MSC_VER |
162 | #define anyOfChars(chars) CharGroup_().orAny(chars) |
163 | // TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from |
164 | // building the compiler or schema parser. We don't know why this happens, but Harris found that |
165 | // this horrible, horrible hack makes things work. This is awful, but it's better than nothing. |
166 | // Hopefully, MSVC will get fixed soon and we'll be able to remove this. |
167 | #else |
168 | constexpr inline CharGroup_ anyOfChars(const char* chars) { |
169 | // Returns a parser that accepts any of the characters in the given string (which should usually |
170 | // be a literal). The returned parser is of the same type as returned by `charRange()` -- see |
171 | // that function for more info. |
172 | |
173 | return CharGroup_().orAny(chars); |
174 | } |
175 | #endif |
176 | |
177 | // ======================================================================================= |
178 | |
179 | namespace _ { // private |
180 | |
181 | struct ArrayToString { |
182 | inline String operator()(const Array<char>& arr) const { |
183 | return heapString(arr); |
184 | } |
185 | }; |
186 | |
187 | } // namespace _ (private) |
188 | |
189 | template <typename SubParser> |
190 | constexpr inline auto charsToString(SubParser&& subParser) |
191 | -> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) { |
192 | // Wraps a parser that returns Array<char> such that it returns String instead. |
193 | return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString()); |
194 | } |
195 | |
196 | // ======================================================================================= |
197 | // Basic character classes. |
198 | |
199 | constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z'); |
200 | constexpr auto digit = charRange('0', '9'); |
201 | constexpr auto alphaNumeric = alpha.orGroup(digit); |
202 | constexpr auto nameStart = alpha.orChar('_'); |
203 | constexpr auto nameChar = alphaNumeric.orChar('_'); |
204 | constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F'); |
205 | constexpr auto octDigit = charRange('0', '7'); |
206 | constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v" ); |
207 | constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert(); |
208 | |
209 | constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v" )); |
210 | |
211 | constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v" )))); |
212 | // Like discard(whitespace) but avoids some memory allocation. |
213 | |
214 | // ======================================================================================= |
215 | // Identifiers |
216 | |
217 | namespace _ { // private |
218 | |
219 | struct IdentifierToString { |
220 | inline String operator()(char first, const Array<char>& rest) const { |
221 | if (rest.size() == 0) return heapString(&first, 1); |
222 | String result = heapString(rest.size() + 1); |
223 | result[0] = first; |
224 | memcpy(result.begin() + 1, rest.begin(), rest.size()); |
225 | return result; |
226 | } |
227 | }; |
228 | |
229 | } // namespace _ (private) |
230 | |
231 | constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString()); |
232 | // Parses an identifier (e.g. a C variable name). |
233 | |
234 | // ======================================================================================= |
235 | // Integers |
236 | |
237 | namespace _ { // private |
238 | |
239 | inline char parseDigit(char c) { |
240 | if (c < 'A') return c - '0'; |
241 | if (c < 'a') return c - 'A' + 10; |
242 | return c - 'a' + 10; |
243 | } |
244 | |
245 | template <uint base> |
246 | struct ParseInteger { |
247 | inline uint64_t operator()(const Array<char>& digits) const { |
248 | return operator()('0', digits); |
249 | } |
250 | uint64_t operator()(char first, const Array<char>& digits) const { |
251 | uint64_t result = parseDigit(first); |
252 | for (char digit: digits) { |
253 | result = result * base + parseDigit(digit); |
254 | } |
255 | return result; |
256 | } |
257 | }; |
258 | |
259 | |
260 | } // namespace _ (private) |
261 | |
262 | constexpr auto integer = sequence( |
263 | oneOf( |
264 | transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()), |
265 | transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()), |
266 | transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())), |
267 | notLookingAt(alpha.orAny("_." ))); |
268 | |
269 | // ======================================================================================= |
270 | // Numbers (i.e. floats) |
271 | |
272 | namespace _ { // private |
273 | |
274 | struct ParseFloat { |
275 | double operator()(const Array<char>& digits, |
276 | const Maybe<Array<char>>& fraction, |
277 | const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const; |
278 | }; |
279 | |
280 | } // namespace _ (private) |
281 | |
282 | constexpr auto number = transform( |
283 | sequence( |
284 | oneOrMore(digit), |
285 | optional(sequence(exactChar<'.'>(), many(digit))), |
286 | optional(sequence(discard(anyOfChars("eE" )), optional(anyOfChars("+-" )), many(digit))), |
287 | notLookingAt(alpha.orAny("_." ))), |
288 | _::ParseFloat()); |
289 | |
290 | // ======================================================================================= |
291 | // Quoted strings |
292 | |
293 | namespace _ { // private |
294 | |
295 | struct InterpretEscape { |
296 | char operator()(char c) const { |
297 | switch (c) { |
298 | case 'a': return '\a'; |
299 | case 'b': return '\b'; |
300 | case 'f': return '\f'; |
301 | case 'n': return '\n'; |
302 | case 'r': return '\r'; |
303 | case 't': return '\t'; |
304 | case 'v': return '\v'; |
305 | default: return c; |
306 | } |
307 | } |
308 | }; |
309 | |
310 | struct ParseHexEscape { |
311 | inline char operator()(char first, char second) const { |
312 | return (parseDigit(first) << 4) | parseDigit(second); |
313 | } |
314 | }; |
315 | |
316 | struct ParseHexByte { |
317 | inline byte operator()(char first, char second) const { |
318 | return (parseDigit(first) << 4) | parseDigit(second); |
319 | } |
320 | }; |
321 | |
322 | struct ParseOctEscape { |
323 | inline char operator()(char first, Maybe<char> second, Maybe<char> third) const { |
324 | char result = first - '0'; |
325 | KJ_IF_MAYBE(digit1, second) { |
326 | result = (result << 3) | (*digit1 - '0'); |
327 | KJ_IF_MAYBE(digit2, third) { |
328 | result = (result << 3) | (*digit2 - '0'); |
329 | } |
330 | } |
331 | return result; |
332 | } |
333 | }; |
334 | |
335 | } // namespace _ (private) |
336 | |
337 | constexpr auto escapeSequence = |
338 | sequence(exactChar<'\\'>(), oneOf( |
339 | transform(anyOfChars("abfnrtv'\"\\\?" ), _::InterpretEscape()), |
340 | transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()), |
341 | transform(sequence(octDigit, optional(octDigit), optional(octDigit)), |
342 | _::ParseOctEscape()))); |
343 | // A parser that parses a C-string-style escape sequence (starting with a backslash). Returns |
344 | // a char. |
345 | |
346 | constexpr auto doubleQuotedString = charsToString(sequence( |
347 | exactChar<'\"'>(), |
348 | many(oneOf(anyOfChars("\\\n\"" ).invert(), escapeSequence)), |
349 | exactChar<'\"'>())); |
350 | // Parses a C-style double-quoted string. |
351 | |
352 | constexpr auto singleQuotedString = charsToString(sequence( |
353 | exactChar<'\''>(), |
354 | many(oneOf(anyOfChars("\\\n\'" ).invert(), escapeSequence)), |
355 | exactChar<'\''>())); |
356 | // Parses a C-style single-quoted string. |
357 | |
358 | constexpr auto doubleQuotedHexBinary = sequence( |
359 | exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(), |
360 | oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())), |
361 | discardWhitespace, |
362 | exactChar<'\"'>()); |
363 | // Parses a double-quoted hex binary literal. Returns Array<byte>. |
364 | |
365 | } // namespace parse |
366 | } // namespace kj |
367 | |