| 1 | #pragma once |
| 2 | |
| 3 | #include <stddef.h> |
| 4 | |
| 5 | |
| 6 | namespace DB |
| 7 | { |
| 8 | |
/** X-macro that lists every token type as one M(Name) entry.
  * Expand it with a one-argument macro M to generate per-token code (for example, enumerators).
  *
  * Order is important: EndOfStream goes after all usual tokens,
  * and every error token goes after EndOfStream (Token::isError compares against EndOfStream).
  *
  * Only block comments may be used inside the list: a line comment would swallow the continuation lines.
  */
#define APPLY_FOR_TOKENS(M) \
    M(Whitespace) \
    M(Comment) \
    \
    M(BareWord) /** Either keyword (SELECT) or identifier (column) */ \
    \
    M(Number) /** Always non-negative. No leading plus. 123 or something like 123.456e12, 0x123p12 */ \
    M(StringLiteral) /** 'hello word', 'hello''word', 'hello\'word\\' */ \
    \
    M(QuotedIdentifier) /** "x", `x` */ \
    \
    M(OpeningRoundBracket) \
    M(ClosingRoundBracket) \
    \
    M(OpeningSquareBracket) \
    M(ClosingSquareBracket) \
    \
    M(OpeningCurlyBrace) \
    M(ClosingCurlyBrace) \
    \
    M(Comma) \
    M(Semicolon) \
    M(Dot) /** Compound identifiers, like a.b or tuple access operator a.1, (x, y).2. */ \
    /** Needs to be distinguished from a floating point number with omitted integer part: .1 */ \
    \
    M(Asterisk) /** Could be used as multiplication operator or on its own: "SELECT *" */ \
    \
    M(Plus) \
    M(Minus) \
    M(Slash) \
    M(Percent) \
    M(Arrow) /** ->. Should be distinguished from minus operator. */ \
    M(QuestionMark) \
    M(Colon) \
    M(Equals) \
    M(NotEquals) \
    M(Less) \
    M(Greater) \
    M(LessOrEquals) \
    M(GreaterOrEquals) \
    M(Concatenation) /** String concatenation operator: || */ \
    \
    /** Order is important. EndOfStream goes after all usual tokens, and special error tokens go after EndOfStream. */ \
    \
    M(EndOfStream) \
    \
    /** Something unrecognized. */ \
    M(Error) \
    /** Something is wrong and we have more information. */ \
    M(ErrorMultilineCommentIsNotClosed) \
    M(ErrorSingleQuoteIsNotClosed) \
    M(ErrorDoubleQuoteIsNotClosed) \
    M(ErrorBackQuoteIsNotClosed) \
    M(ErrorSingleExclamationMark) \
    M(ErrorSinglePipeMark) \
    M(ErrorWrongNumber) \
    M(ErrorMaxQuerySizeExceeded)
| 66 | |
| 67 | |
| 68 | enum class TokenType |
| 69 | { |
| 70 | #define M(TOKEN) TOKEN, |
| 71 | APPLY_FOR_TOKENS(M) |
| 72 | #undef M |
| 73 | }; |
| 74 | |
| 75 | const char * getTokenName(TokenType type); |
| 76 | const char * getErrorTokenDescription(TokenType type); |
| 77 | |
| 78 | |
| 79 | struct Token |
| 80 | { |
| 81 | TokenType type; |
| 82 | const char * begin; |
| 83 | const char * end; |
| 84 | |
| 85 | size_t size() const { return end - begin; } |
| 86 | |
| 87 | Token() = default; |
| 88 | Token(TokenType type_, const char * begin_, const char * end_) : type(type_), begin(begin_), end(end_) {} |
| 89 | |
| 90 | bool isSignificant() const { return type != TokenType::Whitespace && type != TokenType::Comment; } |
| 91 | bool isError() const { return type > TokenType::EndOfStream; } |
| 92 | bool isEnd() const { return type == TokenType::EndOfStream; } |
| 93 | }; |
| 94 | |
| 95 | |
| 96 | class Lexer |
| 97 | { |
| 98 | public: |
| 99 | Lexer(const char * begin_, const char * end_, size_t max_query_size_ = 0) |
| 100 | : begin(begin_), pos(begin_), end(end_), max_query_size(max_query_size_) {} |
| 101 | Token nextToken(); |
| 102 | |
| 103 | private: |
| 104 | const char * const begin; |
| 105 | const char * pos; |
| 106 | const char * const end; |
| 107 | |
| 108 | const size_t max_query_size; |
| 109 | |
| 110 | Token nextTokenImpl(); |
| 111 | |
| 112 | /// This is needed to disambiguate tuple access operator from floating point number (.1). |
| 113 | TokenType prev_significant_token_type = TokenType::Whitespace; /// No previous token. |
| 114 | }; |
| 115 | |
| 116 | } |
| 117 | |