char.h source code [ClickHouse/contrib/capnproto/c++/src/kj/parse/char.h]

1	// Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
2	// Licensed under the MIT License:
3	//
4	// Permission is hereby granted, free of charge, to any person obtaining a copy
5	// of this software and associated documentation files (the "Software"), to deal
6	// in the Software without restriction, including without limitation the rights
7	// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8	// copies of the Software, and to permit persons to whom the Software is
9	// furnished to do so, subject to the following conditions:
10	//
11	// The above copyright notice and this permission notice shall be included in
12	// all copies or substantial portions of the Software.
13	//
14	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17	// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18	// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19	// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20	// THE SOFTWARE.
21
22	// This file contains parsers useful for character stream inputs, including parsers to parse
23	// common kinds of tokens like identifiers, numbers, and quoted strings.
24
25	#pragma once
26
27	#if defined(__GNUC__) && !KJ_HEADER_WARNINGS
28	#pragma GCC system_header
29	#endif
30
31	#include "common.h"
32	#include "../string.h"
33	#include <inttypes.h>
34
35	namespace kj {
36	namespace parse {
37
38	// =======================================================================================
39	// Exact char/string.
40
41	class ExactString_ {
42	public:
43	constexpr inline ExactString_(const char* str): str(str) {}
44
45	template <typename Input>
46	Maybe<Tuple<>> operator()(Input& input) const {
47	const char* ptr = str;
48
49	while (*ptr != `'\0'`) {
50	if (input.atEnd() \|\| input.current() != ptr) return* nullptr;
51	input.next();
52	++ptr;
53	}
54
55	return Tuple<>();
56	}
57
58	private:
59	const char* str;
60	};
61
62	constexpr inline ExactString_ exactString(const char* str) {
63	return ExactString_(str);
64	}
65
66	template <char c>
67	constexpr ExactlyConst_<char, c> exactChar() {
68	// Returns a parser that matches exactly the character given by the template argument (returning
69	// no result).
70	return ExactlyConst_<char, c>();
71	}
72
73	// =======================================================================================
74	// Char ranges / sets
75
76	class CharGroup_ {
77	public:
78	constexpr inline CharGroup_(): bits{`0`, `0`, `0`, `0`} {}
79
80	constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const {
81	return CharGroup_(bits[`0`] \| (oneBits(last + `1`) & ~oneBits(first )),
82	bits[`1`] \| (oneBits(last - `63`) & ~oneBits(first - `64`)),
83	bits[`2`] \| (oneBits(last - `127`) & ~oneBits(first - `128`)),
84	bits[`3`] \| (oneBits(last - `191`) & ~oneBits(first - `192`)));
85	}
86
87	constexpr inline CharGroup_ orAny(const char* chars) const {
88	return chars == `0` ? this : orChar(*chars).orAny(chars + `1`);
89	}
90
91	constexpr inline CharGroup_ orChar(unsigned char c) const {
92	return CharGroup_(bits[`0`] \| bit(c),
93	bits[`1`] \| bit(c - `64`),
94	bits[`2`] \| bit(c - `128`),
95	bits[`3`] \| bit(c - `256`));
96	}
97
98	constexpr inline CharGroup_ orGroup(CharGroup_ other) const {
99	return CharGroup_(bits[`0`] \| other.bits[`0`],
100	bits[`1`] \| other.bits[`1`],
101	bits[`2`] \| other.bits[`2`],
102	bits[`3`] \| other.bits[`3`]);
103	}
104
105	constexpr inline CharGroup_ invert() const {
106	return CharGroup_(~bits[`0`], ~bits[`1`], ~bits[`2`], ~bits[`3`]);
107	}
108
109	constexpr inline bool contains(unsigned char c) const {
110	return (bits[c / `64`] & (`1ll` << (c % `64`))) != `0`;
111	}
112
113	inline bool containsAll(ArrayPtr<const char> text) const {
114	for (char c: text) {
115	if (!contains(c)) return false;
116	}
117	return true;
118	}
119
120	template <typename Input>
121	Maybe<char> operator()(Input& input) const {
122	if (input.atEnd()) return nullptr;
123	unsigned char c = input.current();
124	if (contains(c)) {
125	input.next();
126	return c;
127	} else {
128	return nullptr;
129	}
130	}
131
132	private:
133	typedef unsigned long long Bits64;
134
135	constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {}
136	Bits64 bits[`4`];
137
138	static constexpr inline Bits64 oneBits(int count) {
139	return count <= `0` ? `0ll` : count >= `64` ? -`1ll` : ((`1ll` << count) - `1`);
140	}
141	static constexpr inline Bits64 bit(int index) {
142	return index < `0` ? `0` : index >= `64` ? `0` : (`1ll` << index);
143	}
144	};
145
146	constexpr inline CharGroup_ charRange(char first, char last) {
147	// Create a parser which accepts any character in the range from `first` to `last`, inclusive.
148	// For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the
149	// character matched.
150	//
151	// The returned object has methods which can be used to match more characters. The following
152	// produces a parser which accepts any letter as well as '_', '+', '-', and '.'.
153	//
154	// charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.")
155	//
156	// You can also use `.invert()` to match the opposite set of characters.
157
158	return CharGroup_().orRange(first, last);
159	}
160
161	#if _MSC_VER
162	#define anyOfChars(chars) CharGroup_().orAny(chars)
163	// TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from
164	// building the compiler or schema parser. We don't know why this happens, but Harris found that
165	// this horrible, horrible hack makes things work. This is awful, but it's better than nothing.
166	// Hopefully, MSVC will get fixed soon and we'll be able to remove this.
167	#else
168	constexpr inline CharGroup_ anyOfChars(const char* chars) {
169	// Returns a parser that accepts any of the characters in the given string (which should usually
170	// be a literal). The returned parser is of the same type as returned by `charRange()` -- see
171	// that function for more info.
172
173	return CharGroup_().orAny(chars);
174	}
175	#endif
176
177	// =======================================================================================
178
179	namespace _ { // private
180
181	struct ArrayToString {
182	inline String operator()(const Array<char>& arr) const {
183	return heapString(arr);
184	}
185	};
186
187	} // namespace _ (private)
188
189	template <typename SubParser>
190	constexpr inline auto charsToString(SubParser&& subParser)
191	-> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString ())) {
192	// Wraps a parser that returns Array<char> such that it returns String instead.
193	return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString ());
194	}
195
196	// =======================================================================================
197	// Basic character classes.
198
199	constexpr auto alpha = charRange(`'a'`, `'z'`).orRange(`'A'`, `'Z'`);
200	constexpr auto digit = charRange(`'0'`, `'9'`);
201	constexpr auto alphaNumeric = alpha.orGroup(digit);
202	constexpr auto nameStart = alpha.orChar(`'_'`);
203	constexpr auto nameChar = alphaNumeric.orChar(`'_'`);
204	constexpr auto hexDigit = charRange(`'0'`, `'9'`).orRange(`'a'`, `'f'`).orRange(`'A'`, `'F'`);
205	constexpr auto octDigit = charRange(`'0'`, `'7'`);
206	constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v");
207	constexpr auto controlChar = charRange(`0`, `0x1f`).invert().orGroup(whitespaceChar).invert();
208
209	constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v"));
210
211	constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v"))));
212	// Like discard(whitespace) but avoids some memory allocation.
213
214	// =======================================================================================
215	// Identifiers
216
217	namespace _ { // private
218
219	struct IdentifierToString {
220	inline String operator()(char first, const Array<char>& rest) const {
221	if (rest.size() == `0`) return heapString(&first, `1`);
222	String result = heapString(rest.size() + `1`);
223	result [`0`] = first;
224	memcpy(result.begin() + `1`, rest.begin(), rest.size());
225	return result;
226	}
227	};
228
229	} // namespace _ (private)
230
231	constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString ());
232	// Parses an identifier (e.g. a C variable name).
233
234	// =======================================================================================
235	// Integers
236
237	namespace _ { // private
238
239	inline char parseDigit(char c) {
240	if (c < `'A'`) return c - `'0'`;
241	if (c < `'a'`) return c - `'A'` + `10`;
242	return c - `'a'` + `10`;
243	}
244
245	template <uint base>
246	struct ParseInteger {
247	inline uint64_t operator()(const Array<char>& digits) const {
248	return operator()(`'0'`, digits);
249	}
250	uint64_t operator()(char first, const Array<char>& digits) const {
251	uint64_t result = parseDigit(first);
252	for (char digit: digits) {
253	result = result * base + parseDigit(digit);
254	}
255	return result;
256	}
257	};
258
259
260	} // namespace _ (private)
261
262	constexpr auto integer = sequence(
263	oneOf(
264	transform(sequence(exactChar<`'0'`>(), exactChar<`'x'`>(), oneOrMore(hexDigit)), _::ParseInteger<`16`>()),
265	transform(sequence(exactChar<`'0'`>(), many(octDigit)), _::ParseInteger<`8`>()),
266	transform(sequence(charRange(`'1'`, `'9'`), many(digit)), _::ParseInteger<`10`>())),
267	notLookingAt(alpha.orAny("_.")));
268
269	// =======================================================================================
270	// Numbers (i.e. floats)
271
272	namespace _ { // private
273
274	struct ParseFloat {
275	double operator()(const Array<char>& digits,
276	const Maybe<Array<char>>& fraction,
277	const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const;
278	};
279
280	} // namespace _ (private)
281
282	constexpr auto number = transform(
283	sequence(
284	oneOrMore(digit),
285	optional(sequence(exactChar<`'.'`>(), many(digit))),
286	optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))),
287	notLookingAt(alpha.orAny("_."))),
288	_::ParseFloat ());
289
290	// =======================================================================================
291	// Quoted strings
292
293	namespace _ { // private
294
295	struct InterpretEscape {
296	char operator()(char c) const {
297	switch (c) {
298	case `'a'`: return `'\a'`;
299	case `'b'`: return `'\b'`;
300	case `'f'`: return `'\f'`;
301	case `'n'`: return `'\n'`;
302	case `'r'`: return `'\r'`;
303	case `'t'`: return `'\t'`;
304	case `'v'`: return `'\v'`;
305	default: return c;
306	}
307	}
308	};
309
310	struct ParseHexEscape {
311	inline char operator()(char first, char second) const {
312	return (parseDigit(first) << `4`) \| parseDigit(second);
313	}
314	};
315
316	struct ParseHexByte {
317	inline byte operator()(char first, char second) const {
318	return (parseDigit(first) << `4`) \| parseDigit(second);
319	}
320	};
321
322	struct ParseOctEscape {
323	inline char operator()(char first, Maybe<char> second, Maybe<char> third) const {
324	char result = first - `'0'`;
325	KJ_IF_MAYBE(digit1, second) {
326	result = (result << `3`) \| (*digit1 - `'0'`);
327	KJ_IF_MAYBE(digit2, third) {
328	result = (result << `3`) \| (*digit2 - `'0'`);
329	}
330	}
331	return result;
332	}
333	};
334
335	} // namespace _ (private)
336
337	constexpr auto escapeSequence =
338	sequence(exactChar<`'\\'`>(), oneOf(
339	transform(anyOfChars("abfnrtv'\"\\\?"), _::InterpretEscape ()),
340	transform(sequence(exactChar<`'x'`>(), hexDigit, hexDigit), _::ParseHexEscape ()),
341	transform(sequence(octDigit, optional(octDigit), optional(octDigit)),
342	_::ParseOctEscape ())));
343	// A parser that parses a C-string-style escape sequence (starting with a backslash). Returns
344	// a char.
345
346	constexpr auto doubleQuotedString = charsToString(sequence(
347	exactChar<`'\"'`>(),
348	many(oneOf(anyOfChars("\\\n\"").invert(), escapeSequence)),
349	exactChar<`'\"'`>()));
350	// Parses a C-style double-quoted string.
351
352	constexpr auto singleQuotedString = charsToString(sequence(
353	exactChar<`'\''`>(),
354	many(oneOf(anyOfChars("\\\n\'").invert(), escapeSequence)),
355	exactChar<`'\''`>()));
356	// Parses a C-style single-quoted string.
357
358	constexpr auto doubleQuotedHexBinary = sequence(
359	exactChar<`'0'`>(), exactChar<`'x'`>(), exactChar<`'\"'`>(),
360	oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte ())),
361	discardWhitespace,
362	exactChar<`'\"'`>());
363	// Parses a double-quoted hex binary literal. Returns Array<byte>.
364
365	} // namespace parse
366	} // namespace kj
367

Browse the source code of ClickHouse/contrib/capnproto/c++/src/kj/parse/char.h