1 | #pragma once |
2 | |
3 | #include <stddef.h> |
4 | |
5 | |
6 | namespace DB |
7 | { |
8 | |
/** X-macro that lists every kind of token.
  * Invoke it with a one-argument macro M; M is applied to each token name, in the order given below.
  * The order is significant: enum TokenType is generated from this list,
  *  and Token::isError() treats every entry placed after EndOfStream as an error token.
  */
#define APPLY_FOR_TOKENS(M) \
    M(Whitespace) \
    M(Comment) \
    \
    M(BareWord)               /** Either keyword (SELECT) or identifier (column) */ \
    \
    M(Number)                 /** Always non-negative. No leading plus. 123 or something like 123.456e12, 0x123p12 */ \
    M(StringLiteral)          /** 'hello world', 'hello''world', 'hello\'world\\' */ \
    \
    M(QuotedIdentifier)       /** "x", `x` */ \
    \
    M(OpeningRoundBracket) \
    M(ClosingRoundBracket) \
    \
    M(OpeningSquareBracket) \
    M(ClosingSquareBracket) \
    \
    M(OpeningCurlyBrace) \
    M(ClosingCurlyBrace) \
    \
    M(Comma) \
    M(Semicolon) \
    M(Dot)                    /** Compound identifiers, like a.b or tuple access operator a.1, (x, y).2. */ \
                              /** Needs to be distinguished from a floating point number with omitted integer part: .1 */ \
    \
    M(Asterisk)               /** Could be used as multiplication operator or on its own: "SELECT *" */ \
    \
    M(Plus) \
    M(Minus) \
    M(Slash) \
    M(Percent) \
    M(Arrow)                  /** ->. Should be distinguished from minus operator. */ \
    M(QuestionMark) \
    M(Colon) \
    M(Equals) \
    M(NotEquals) \
    M(Less) \
    M(Greater) \
    M(LessOrEquals) \
    M(GreaterOrEquals) \
    M(Concatenation)          /** String concatenation operator: || */ \
    \
    /** Order is important. EndOfStream goes after all usual tokens, and special error tokens go after EndOfStream. */ \
    \
    M(EndOfStream) \
    \
    /** Something unrecognized. */ \
    M(Error) \
    /** Something is wrong and we have more information. */ \
    M(ErrorMultilineCommentIsNotClosed) \
    M(ErrorSingleQuoteIsNotClosed) \
    M(ErrorDoubleQuoteIsNotClosed) \
    M(ErrorBackQuoteIsNotClosed) \
    M(ErrorSingleExclamationMark) \
    M(ErrorSinglePipeMark) \
    M(ErrorWrongNumber) \
    M(ErrorMaxQuerySizeExceeded)
66 | |
67 | |
68 | enum class TokenType |
69 | { |
70 | #define M(TOKEN) TOKEN, |
71 | APPLY_FOR_TOKENS(M) |
72 | #undef M |
73 | }; |
74 | |
/// Maps a token type to a C string.
/// NOTE(review): the definition is not in this header; presumably the string is the enumerator's name — confirm.
const char * getTokenName(TokenType type);
/// Maps a token type to a C string describing the error.
/// NOTE(review): the definition is not in this header; behaviour for non-error token types is not visible here — confirm.
const char * getErrorTokenDescription(TokenType type);
77 | |
78 | |
79 | struct Token |
80 | { |
81 | TokenType type; |
82 | const char * begin; |
83 | const char * end; |
84 | |
85 | size_t size() const { return end - begin; } |
86 | |
87 | Token() = default; |
88 | Token(TokenType type_, const char * begin_, const char * end_) : type(type_), begin(begin_), end(end_) {} |
89 | |
90 | bool isSignificant() const { return type != TokenType::Whitespace && type != TokenType::Comment; } |
91 | bool isError() const { return type > TokenType::EndOfStream; } |
92 | bool isEnd() const { return type == TokenType::EndOfStream; } |
93 | }; |
94 | |
95 | |
96 | class Lexer |
97 | { |
98 | public: |
99 | Lexer(const char * begin_, const char * end_, size_t max_query_size_ = 0) |
100 | : begin(begin_), pos(begin_), end(end_), max_query_size(max_query_size_) {} |
101 | Token nextToken(); |
102 | |
103 | private: |
104 | const char * const begin; |
105 | const char * pos; |
106 | const char * const end; |
107 | |
108 | const size_t max_query_size; |
109 | |
110 | Token nextTokenImpl(); |
111 | |
112 | /// This is needed to disambiguate tuple access operator from floating point number (.1). |
113 | TokenType prev_significant_token_type = TokenType::Whitespace; /// No previous token. |
114 | }; |
115 | |
116 | } |
117 | |