1 | #include <Parsers/Lexer.h> |
2 | #include <Common/StringUtils/StringUtils.h> |
3 | #include <common/find_symbols.h> |
4 | |
5 | |
6 | namespace DB |
7 | { |
8 | |
9 | namespace |
10 | { |
11 | |
12 | /// This must be consistent with functions in ReadHelpers.h |
13 | template <char quote, TokenType success_token, TokenType error_token> |
14 | Token quotedString(const char *& pos, const char * const token_begin, const char * const end) |
15 | { |
16 | ++pos; |
17 | while (true) |
18 | { |
19 | pos = find_first_symbols<quote, '\\'>(pos, end); |
20 | if (pos >= end) |
21 | return Token(error_token, token_begin, end); |
22 | |
23 | if (*pos == quote) |
24 | { |
25 | ++pos; |
26 | if (pos < end && *pos == quote) |
27 | { |
28 | ++pos; |
29 | continue; |
30 | } |
31 | return Token(success_token, token_begin, pos); |
32 | } |
33 | |
34 | if (*pos == '\\') |
35 | { |
36 | ++pos; |
37 | if (pos >= end) |
38 | return Token(error_token, token_begin, end); |
39 | ++pos; |
40 | continue; |
41 | } |
42 | |
43 | __builtin_unreachable(); |
44 | } |
45 | } |
46 | |
47 | } |
48 | |
49 | |
50 | Token Lexer::nextToken() |
51 | { |
52 | Token res = nextTokenImpl(); |
53 | if (res.type != TokenType::EndOfStream && max_query_size && res.end > begin + max_query_size) |
54 | res.type = TokenType::ErrorMaxQuerySizeExceeded; |
55 | if (res.isSignificant()) |
56 | prev_significant_token_type = res.type; |
57 | return res; |
58 | } |
59 | |
60 | |
61 | Token Lexer::nextTokenImpl() |
62 | { |
63 | if (pos >= end) |
64 | return Token(TokenType::EndOfStream, end, end); |
65 | |
66 | const char * const token_begin = pos; |
67 | |
68 | auto = [&]() mutable |
69 | { |
70 | pos = find_first_symbols<'\n'>(pos, end); /// This means that newline in single-line comment cannot be escaped. |
71 | return Token(TokenType::Comment, token_begin, pos); |
72 | }; |
73 | |
74 | switch (*pos) |
75 | { |
76 | case ' ': [[fallthrough]]; |
77 | case '\t': [[fallthrough]]; |
78 | case '\n': [[fallthrough]]; |
79 | case '\r': [[fallthrough]]; |
80 | case '\f': [[fallthrough]]; |
81 | case '\v': |
82 | { |
83 | ++pos; |
84 | while (pos < end && isWhitespaceASCII(*pos)) |
85 | ++pos; |
86 | return Token(TokenType::Whitespace, token_begin, pos); |
87 | } |
88 | |
89 | case '0': [[fallthrough]]; |
90 | case '1': [[fallthrough]]; |
91 | case '2': [[fallthrough]]; |
92 | case '3': [[fallthrough]]; |
93 | case '4': [[fallthrough]]; |
94 | case '5': [[fallthrough]]; |
95 | case '6': [[fallthrough]]; |
96 | case '7': [[fallthrough]]; |
97 | case '8': [[fallthrough]]; |
98 | case '9': |
99 | { |
100 | /// The task is not to parse a number or check correctness, but only to skip it. |
101 | |
102 | /// Disambiguation: if previous token was dot, then we could parse only simple integer, |
103 | /// for chained tuple access operators (x.1.1) to work. |
104 | // Otherwise it will be tokenized as x . 1.1, not as x . 1 . 1 |
105 | if (prev_significant_token_type == TokenType::Dot) |
106 | { |
107 | ++pos; |
108 | while (pos < end && isNumericASCII(*pos)) |
109 | ++pos; |
110 | } |
111 | else |
112 | { |
113 | /// 0x, 0b |
114 | bool hex = false; |
115 | if (pos + 2 < end && *pos == '0' && (pos[1] == 'x' || pos[1] == 'b' || pos[1] == 'X' || pos[1] == 'B')) |
116 | { |
117 | if (pos[1] == 'x' || pos[1] == 'X') |
118 | hex = true; |
119 | pos += 2; |
120 | } |
121 | else |
122 | ++pos; |
123 | |
124 | while (pos < end && (hex ? isHexDigit(*pos) : isNumericASCII(*pos))) |
125 | ++pos; |
126 | |
127 | /// decimal point |
128 | if (pos < end && *pos == '.') |
129 | { |
130 | ++pos; |
131 | while (pos < end && (hex ? isHexDigit(*pos) : isNumericASCII(*pos))) |
132 | ++pos; |
133 | } |
134 | |
135 | /// exponentiation (base 10 or base 2) |
136 | if (pos + 1 < end && (hex ? (*pos == 'p' || *pos == 'P') : (*pos == 'e' || *pos == 'E'))) |
137 | { |
138 | ++pos; |
139 | |
140 | /// sign of exponent. It is always decimal. |
141 | if (pos + 1 < end && (*pos == '-' || *pos == '+')) |
142 | ++pos; |
143 | |
144 | while (pos < end && isNumericASCII(*pos)) |
145 | ++pos; |
146 | } |
147 | } |
148 | |
149 | /// word character cannot go just after number (SELECT 123FROM) |
150 | if (pos < end && isWordCharASCII(*pos)) |
151 | { |
152 | ++pos; |
153 | while (pos < end && isWordCharASCII(*pos)) |
154 | ++pos; |
155 | return Token(TokenType::ErrorWrongNumber, token_begin, pos); |
156 | } |
157 | |
158 | return Token(TokenType::Number, token_begin, pos); |
159 | } |
160 | |
161 | case '\'': |
162 | return quotedString<'\'', TokenType::StringLiteral, TokenType::ErrorSingleQuoteIsNotClosed>(pos, token_begin, end); |
163 | case '"': |
164 | return quotedString<'"', TokenType::QuotedIdentifier, TokenType::ErrorDoubleQuoteIsNotClosed>(pos, token_begin, end); |
165 | case '`': |
166 | return quotedString<'`', TokenType::QuotedIdentifier, TokenType::ErrorBackQuoteIsNotClosed>(pos, token_begin, end); |
167 | |
168 | case '(': |
169 | return Token(TokenType::OpeningRoundBracket, token_begin, ++pos); |
170 | case ')': |
171 | return Token(TokenType::ClosingRoundBracket, token_begin, ++pos); |
172 | case '[': |
173 | return Token(TokenType::OpeningSquareBracket, token_begin, ++pos); |
174 | case ']': |
175 | return Token(TokenType::ClosingSquareBracket, token_begin, ++pos); |
176 | case '{': |
177 | return Token(TokenType::OpeningCurlyBrace, token_begin, ++pos); |
178 | case '}': |
179 | return Token(TokenType::ClosingCurlyBrace, token_begin, ++pos); |
180 | case ',': |
181 | return Token(TokenType::Comma, token_begin, ++pos); |
182 | case ';': |
183 | return Token(TokenType::Semicolon, token_begin, ++pos); |
184 | |
185 | case '.': /// qualifier, tuple access operator or start of floating point number |
186 | { |
187 | /// Just after identifier or complex expression or number (for chained tuple access like x.1.1 to work properly). |
188 | if (pos > begin |
189 | && (!(pos + 1 < end && isNumericASCII(pos[1])) |
190 | || prev_significant_token_type == TokenType::ClosingRoundBracket |
191 | || prev_significant_token_type == TokenType::ClosingSquareBracket |
192 | || prev_significant_token_type == TokenType::BareWord |
193 | || prev_significant_token_type == TokenType::QuotedIdentifier |
194 | || prev_significant_token_type == TokenType::Number)) |
195 | return Token(TokenType::Dot, token_begin, ++pos); |
196 | |
197 | ++pos; |
198 | while (pos < end && isNumericASCII(*pos)) |
199 | ++pos; |
200 | |
201 | /// exponentiation |
202 | if (pos + 1 < end && (*pos == 'e' || *pos == 'E')) |
203 | { |
204 | ++pos; |
205 | |
206 | /// sign of exponent |
207 | if (pos + 1 < end && (*pos == '-' || *pos == '+')) |
208 | ++pos; |
209 | |
210 | while (pos < end && isNumericASCII(*pos)) |
211 | ++pos; |
212 | } |
213 | |
214 | return Token(TokenType::Number, token_begin, pos); |
215 | } |
216 | |
217 | case '+': |
218 | return Token(TokenType::Plus, token_begin, ++pos); |
219 | case '-': /// minus (-), arrow (->) or start of comment (--) |
220 | { |
221 | ++pos; |
222 | if (pos < end && *pos == '>') |
223 | return Token(TokenType::Arrow, token_begin, ++pos); |
224 | |
225 | if (pos < end && *pos == '-') |
226 | { |
227 | ++pos; |
228 | return commentUntilEndOfLine(); |
229 | } |
230 | |
231 | return Token(TokenType::Minus, token_begin, pos); |
232 | } |
233 | case '*': |
234 | ++pos; |
235 | return Token(TokenType::Asterisk, token_begin, pos); |
236 | case '/': /// division (/) or start of comment (//, /*) |
237 | { |
238 | ++pos; |
239 | if (pos < end && (*pos == '/' || *pos == '*')) |
240 | { |
241 | if (*pos == '/') |
242 | { |
243 | ++pos; |
244 | return commentUntilEndOfLine(); |
245 | } |
246 | else |
247 | { |
248 | ++pos; |
249 | while (pos + 2 <= end) |
250 | { |
251 | /// This means that nested multiline comments are not supported. |
252 | if (pos[0] == '*' && pos[1] == '/') |
253 | { |
254 | pos += 2; |
255 | return Token(TokenType::Comment, token_begin, pos); |
256 | } |
257 | ++pos; |
258 | } |
259 | return Token(TokenType::ErrorMultilineCommentIsNotClosed, token_begin, end); |
260 | } |
261 | } |
262 | return Token(TokenType::Slash, token_begin, pos); |
263 | } |
264 | case '%': |
265 | return Token(TokenType::Percent, token_begin, ++pos); |
266 | case '=': /// =, == |
267 | { |
268 | ++pos; |
269 | if (pos < end && *pos == '=') |
270 | ++pos; |
271 | return Token(TokenType::Equals, token_begin, pos); |
272 | } |
273 | case '!': /// != |
274 | { |
275 | ++pos; |
276 | if (pos < end && *pos == '=') |
277 | return Token(TokenType::NotEquals, token_begin, ++pos); |
278 | return Token(TokenType::ErrorSingleExclamationMark, token_begin, pos); |
279 | } |
280 | case '<': /// <, <=, <> |
281 | { |
282 | ++pos; |
283 | if (pos < end && *pos == '=') |
284 | return Token(TokenType::LessOrEquals, token_begin, ++pos); |
285 | if (pos < end && *pos == '>') |
286 | return Token(TokenType::NotEquals, token_begin, ++pos); |
287 | return Token(TokenType::Less, token_begin, pos); |
288 | } |
289 | case '>': /// >, >= |
290 | { |
291 | ++pos; |
292 | if (pos < end && *pos == '=') |
293 | return Token(TokenType::GreaterOrEquals, token_begin, ++pos); |
294 | return Token(TokenType::Greater, token_begin, pos); |
295 | } |
296 | case '?': |
297 | return Token(TokenType::QuestionMark, token_begin, ++pos); |
298 | case ':': |
299 | return Token(TokenType::Colon, token_begin, ++pos); |
300 | case '|': |
301 | { |
302 | ++pos; |
303 | if (pos < end && *pos == '|') |
304 | return Token(TokenType::Concatenation, token_begin, ++pos); |
305 | return Token(TokenType::ErrorSinglePipeMark, token_begin, pos); |
306 | } |
307 | |
308 | default: |
309 | if (isWordCharASCII(*pos)) |
310 | { |
311 | ++pos; |
312 | while (pos < end && isWordCharASCII(*pos)) |
313 | ++pos; |
314 | return Token(TokenType::BareWord, token_begin, pos); |
315 | } |
316 | else |
317 | return Token(TokenType::Error, token_begin, ++pos); |
318 | } |
319 | } |
320 | |
321 | |
322 | const char * getTokenName(TokenType type) |
323 | { |
324 | switch (type) |
325 | { |
326 | #define M(TOKEN) \ |
327 | case TokenType::TOKEN: return #TOKEN; |
328 | APPLY_FOR_TOKENS(M) |
329 | #undef M |
330 | } |
331 | |
332 | __builtin_unreachable(); |
333 | } |
334 | |
335 | |
336 | const char * getErrorTokenDescription(TokenType type) |
337 | { |
338 | switch (type) |
339 | { |
340 | case TokenType::Error: |
341 | return "Unrecognized token" ; |
342 | case TokenType::ErrorMultilineCommentIsNotClosed: |
343 | return "Multiline comment is not closed" ; |
344 | case TokenType::ErrorSingleQuoteIsNotClosed: |
345 | return "Single quoted string is not closed" ; |
346 | case TokenType::ErrorDoubleQuoteIsNotClosed: |
347 | return "Double quoted string is not closed" ; |
348 | case TokenType::ErrorBackQuoteIsNotClosed: |
349 | return "Back quoted string is not closed" ; |
350 | case TokenType::ErrorSingleExclamationMark: |
351 | return "Exclamation mark can only occur in != operator" ; |
352 | case TokenType::ErrorSinglePipeMark: |
353 | return "Pipe symbol could only occur in || operator" ; |
354 | case TokenType::ErrorWrongNumber: |
355 | return "Wrong number" ; |
356 | case TokenType::ErrorMaxQuerySizeExceeded: |
357 | return "Max query size exceeded" ; |
358 | default: |
359 | return "Not an error" ; |
360 | } |
361 | } |
362 | |
363 | } |
364 | |