1// Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
2// Licensed under the MIT License:
3//
4// Permission is hereby granted, free of charge, to any person obtaining a copy
5// of this software and associated documentation files (the "Software"), to deal
6// in the Software without restriction, including without limitation the rights
7// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8// copies of the Software, and to permit persons to whom the Software is
9// furnished to do so, subject to the following conditions:
10//
11// The above copyright notice and this permission notice shall be included in
12// all copies or substantial portions of the Software.
13//
14// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20// THE SOFTWARE.
21
22// This file contains parsers useful for character stream inputs, including parsers to parse
23// common kinds of tokens like identifiers, numbers, and quoted strings.
24
25#pragma once
26
27#if defined(__GNUC__) && !KJ_HEADER_WARNINGS
28#pragma GCC system_header
29#endif
30
31#include "common.h"
32#include "../string.h"
33#include <inttypes.h>
34
35namespace kj {
36namespace parse {
37
38// =======================================================================================
39// Exact char/string.
40
41class ExactString_ {
42public:
43 constexpr inline ExactString_(const char* str): str(str) {}
44
45 template <typename Input>
46 Maybe<Tuple<>> operator()(Input& input) const {
47 const char* ptr = str;
48
49 while (*ptr != '\0') {
50 if (input.atEnd() || input.current() != *ptr) return nullptr;
51 input.next();
52 ++ptr;
53 }
54
55 return Tuple<>();
56 }
57
58private:
59 const char* str;
60};
61
62constexpr inline ExactString_ exactString(const char* str) {
63 return ExactString_(str);
64}
65
66template <char c>
67constexpr ExactlyConst_<char, c> exactChar() {
68 // Returns a parser that matches exactly the character given by the template argument (returning
69 // no result).
70 return ExactlyConst_<char, c>();
71}
72
73// =======================================================================================
74// Char ranges / sets
75
76class CharGroup_ {
77public:
78 constexpr inline CharGroup_(): bits{0, 0, 0, 0} {}
79
80 constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const {
81 return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )),
82 bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)),
83 bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)),
84 bits[3] | (oneBits(last - 191) & ~oneBits(first - 192)));
85 }
86
87 constexpr inline CharGroup_ orAny(const char* chars) const {
88 return *chars == 0 ? *this : orChar(*chars).orAny(chars + 1);
89 }
90
91 constexpr inline CharGroup_ orChar(unsigned char c) const {
92 return CharGroup_(bits[0] | bit(c),
93 bits[1] | bit(c - 64),
94 bits[2] | bit(c - 128),
95 bits[3] | bit(c - 256));
96 }
97
98 constexpr inline CharGroup_ orGroup(CharGroup_ other) const {
99 return CharGroup_(bits[0] | other.bits[0],
100 bits[1] | other.bits[1],
101 bits[2] | other.bits[2],
102 bits[3] | other.bits[3]);
103 }
104
105 constexpr inline CharGroup_ invert() const {
106 return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]);
107 }
108
109 constexpr inline bool contains(unsigned char c) const {
110 return (bits[c / 64] & (1ll << (c % 64))) != 0;
111 }
112
113 inline bool containsAll(ArrayPtr<const char> text) const {
114 for (char c: text) {
115 if (!contains(c)) return false;
116 }
117 return true;
118 }
119
120 template <typename Input>
121 Maybe<char> operator()(Input& input) const {
122 if (input.atEnd()) return nullptr;
123 unsigned char c = input.current();
124 if (contains(c)) {
125 input.next();
126 return c;
127 } else {
128 return nullptr;
129 }
130 }
131
132private:
133 typedef unsigned long long Bits64;
134
135 constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {}
136 Bits64 bits[4];
137
138 static constexpr inline Bits64 oneBits(int count) {
139 return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1);
140 }
141 static constexpr inline Bits64 bit(int index) {
142 return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index);
143 }
144};
145
146constexpr inline CharGroup_ charRange(char first, char last) {
147 // Create a parser which accepts any character in the range from `first` to `last`, inclusive.
148 // For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the
149 // character matched.
150 //
151 // The returned object has methods which can be used to match more characters. The following
152 // produces a parser which accepts any letter as well as '_', '+', '-', and '.'.
153 //
154 // charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.")
155 //
156 // You can also use `.invert()` to match the opposite set of characters.
157
158 return CharGroup_().orRange(first, last);
159}
160
161#if _MSC_VER
162#define anyOfChars(chars) CharGroup_().orAny(chars)
163// TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from
164// building the compiler or schema parser. We don't know why this happens, but Harris found that
165// this horrible, horrible hack makes things work. This is awful, but it's better than nothing.
166// Hopefully, MSVC will get fixed soon and we'll be able to remove this.
167#else
168constexpr inline CharGroup_ anyOfChars(const char* chars) {
169 // Returns a parser that accepts any of the characters in the given string (which should usually
170 // be a literal). The returned parser is of the same type as returned by `charRange()` -- see
171 // that function for more info.
172
173 return CharGroup_().orAny(chars);
174}
175#endif
176
177// =======================================================================================
178
179namespace _ { // private
180
181struct ArrayToString {
182 inline String operator()(const Array<char>& arr) const {
183 return heapString(arr);
184 }
185};
186
187} // namespace _ (private)
188
189template <typename SubParser>
190constexpr inline auto charsToString(SubParser&& subParser)
191 -> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) {
192 // Wraps a parser that returns Array<char> such that it returns String instead.
193 return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString());
194}
195
196// =======================================================================================
197// Basic character classes.
198
199constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z');
200constexpr auto digit = charRange('0', '9');
201constexpr auto alphaNumeric = alpha.orGroup(digit);
202constexpr auto nameStart = alpha.orChar('_');
203constexpr auto nameChar = alphaNumeric.orChar('_');
204constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F');
205constexpr auto octDigit = charRange('0', '7');
206constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v");
207constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert();
208
209constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v"));
210
211constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v"))));
212// Like discard(whitespace) but avoids some memory allocation.
213
214// =======================================================================================
215// Identifiers
216
217namespace _ { // private
218
219struct IdentifierToString {
220 inline String operator()(char first, const Array<char>& rest) const {
221 if (rest.size() == 0) return heapString(&first, 1);
222 String result = heapString(rest.size() + 1);
223 result[0] = first;
224 memcpy(result.begin() + 1, rest.begin(), rest.size());
225 return result;
226 }
227};
228
229} // namespace _ (private)
230
231constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString());
232// Parses an identifier (e.g. a C variable name).
233
234// =======================================================================================
235// Integers
236
237namespace _ { // private
238
239inline char parseDigit(char c) {
240 if (c < 'A') return c - '0';
241 if (c < 'a') return c - 'A' + 10;
242 return c - 'a' + 10;
243}
244
245template <uint base>
246struct ParseInteger {
247 inline uint64_t operator()(const Array<char>& digits) const {
248 return operator()('0', digits);
249 }
250 uint64_t operator()(char first, const Array<char>& digits) const {
251 uint64_t result = parseDigit(first);
252 for (char digit: digits) {
253 result = result * base + parseDigit(digit);
254 }
255 return result;
256 }
257};
258
259
260} // namespace _ (private)
261
262constexpr auto integer = sequence(
263 oneOf(
264 transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()),
265 transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()),
266 transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())),
267 notLookingAt(alpha.orAny("_.")));
268
269// =======================================================================================
270// Numbers (i.e. floats)
271
272namespace _ { // private
273
274struct ParseFloat {
275 double operator()(const Array<char>& digits,
276 const Maybe<Array<char>>& fraction,
277 const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const;
278};
279
280} // namespace _ (private)
281
282constexpr auto number = transform(
283 sequence(
284 oneOrMore(digit),
285 optional(sequence(exactChar<'.'>(), many(digit))),
286 optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))),
287 notLookingAt(alpha.orAny("_."))),
288 _::ParseFloat());
289
290// =======================================================================================
291// Quoted strings
292
293namespace _ { // private
294
295struct InterpretEscape {
296 char operator()(char c) const {
297 switch (c) {
298 case 'a': return '\a';
299 case 'b': return '\b';
300 case 'f': return '\f';
301 case 'n': return '\n';
302 case 'r': return '\r';
303 case 't': return '\t';
304 case 'v': return '\v';
305 default: return c;
306 }
307 }
308};
309
310struct ParseHexEscape {
311 inline char operator()(char first, char second) const {
312 return (parseDigit(first) << 4) | parseDigit(second);
313 }
314};
315
316struct ParseHexByte {
317 inline byte operator()(char first, char second) const {
318 return (parseDigit(first) << 4) | parseDigit(second);
319 }
320};
321
322struct ParseOctEscape {
323 inline char operator()(char first, Maybe<char> second, Maybe<char> third) const {
324 char result = first - '0';
325 KJ_IF_MAYBE(digit1, second) {
326 result = (result << 3) | (*digit1 - '0');
327 KJ_IF_MAYBE(digit2, third) {
328 result = (result << 3) | (*digit2 - '0');
329 }
330 }
331 return result;
332 }
333};
334
335} // namespace _ (private)
336
337constexpr auto escapeSequence =
338 sequence(exactChar<'\\'>(), oneOf(
339 transform(anyOfChars("abfnrtv'\"\\\?"), _::InterpretEscape()),
340 transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()),
341 transform(sequence(octDigit, optional(octDigit), optional(octDigit)),
342 _::ParseOctEscape())));
343// A parser that parses a C-string-style escape sequence (starting with a backslash). Returns
344// a char.
345
346constexpr auto doubleQuotedString = charsToString(sequence(
347 exactChar<'\"'>(),
348 many(oneOf(anyOfChars("\\\n\"").invert(), escapeSequence)),
349 exactChar<'\"'>()));
350// Parses a C-style double-quoted string.
351
352constexpr auto singleQuotedString = charsToString(sequence(
353 exactChar<'\''>(),
354 many(oneOf(anyOfChars("\\\n\'").invert(), escapeSequence)),
355 exactChar<'\''>()));
356// Parses a C-style single-quoted string.
357
358constexpr auto doubleQuotedHexBinary = sequence(
359 exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(),
360 oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())),
361 discardWhitespace,
362 exactChar<'\"'>());
363// Parses a double-quoted hex binary literal. Returns Array<byte>.
364
365} // namespace parse
366} // namespace kj
367