#include <Parsers/Lexer.h>
#include <Common/StringUtils/StringUtils.h>
#include <common/find_symbols.h>

#include <cstddef>
4
5
6namespace DB
7{
8
9namespace
10{
11
12/// This must be consistent with functions in ReadHelpers.h
13template <char quote, TokenType success_token, TokenType error_token>
14Token quotedString(const char *& pos, const char * const token_begin, const char * const end)
15{
16 ++pos;
17 while (true)
18 {
19 pos = find_first_symbols<quote, '\\'>(pos, end);
20 if (pos >= end)
21 return Token(error_token, token_begin, end);
22
23 if (*pos == quote)
24 {
25 ++pos;
26 if (pos < end && *pos == quote)
27 {
28 ++pos;
29 continue;
30 }
31 return Token(success_token, token_begin, pos);
32 }
33
34 if (*pos == '\\')
35 {
36 ++pos;
37 if (pos >= end)
38 return Token(error_token, token_begin, end);
39 ++pos;
40 continue;
41 }
42
43 __builtin_unreachable();
44 }
45}
46
47}
48
49
50Token Lexer::nextToken()
51{
52 Token res = nextTokenImpl();
53 if (res.type != TokenType::EndOfStream && max_query_size && res.end > begin + max_query_size)
54 res.type = TokenType::ErrorMaxQuerySizeExceeded;
55 if (res.isSignificant())
56 prev_significant_token_type = res.type;
57 return res;
58}
59
60
61Token Lexer::nextTokenImpl()
62{
63 if (pos >= end)
64 return Token(TokenType::EndOfStream, end, end);
65
66 const char * const token_begin = pos;
67
68 auto commentUntilEndOfLine = [&]() mutable
69 {
70 pos = find_first_symbols<'\n'>(pos, end); /// This means that newline in single-line comment cannot be escaped.
71 return Token(TokenType::Comment, token_begin, pos);
72 };
73
74 switch (*pos)
75 {
76 case ' ': [[fallthrough]];
77 case '\t': [[fallthrough]];
78 case '\n': [[fallthrough]];
79 case '\r': [[fallthrough]];
80 case '\f': [[fallthrough]];
81 case '\v':
82 {
83 ++pos;
84 while (pos < end && isWhitespaceASCII(*pos))
85 ++pos;
86 return Token(TokenType::Whitespace, token_begin, pos);
87 }
88
89 case '0': [[fallthrough]];
90 case '1': [[fallthrough]];
91 case '2': [[fallthrough]];
92 case '3': [[fallthrough]];
93 case '4': [[fallthrough]];
94 case '5': [[fallthrough]];
95 case '6': [[fallthrough]];
96 case '7': [[fallthrough]];
97 case '8': [[fallthrough]];
98 case '9':
99 {
100 /// The task is not to parse a number or check correctness, but only to skip it.
101
102 /// Disambiguation: if previous token was dot, then we could parse only simple integer,
103 /// for chained tuple access operators (x.1.1) to work.
104 // Otherwise it will be tokenized as x . 1.1, not as x . 1 . 1
105 if (prev_significant_token_type == TokenType::Dot)
106 {
107 ++pos;
108 while (pos < end && isNumericASCII(*pos))
109 ++pos;
110 }
111 else
112 {
113 /// 0x, 0b
114 bool hex = false;
115 if (pos + 2 < end && *pos == '0' && (pos[1] == 'x' || pos[1] == 'b' || pos[1] == 'X' || pos[1] == 'B'))
116 {
117 if (pos[1] == 'x' || pos[1] == 'X')
118 hex = true;
119 pos += 2;
120 }
121 else
122 ++pos;
123
124 while (pos < end && (hex ? isHexDigit(*pos) : isNumericASCII(*pos)))
125 ++pos;
126
127 /// decimal point
128 if (pos < end && *pos == '.')
129 {
130 ++pos;
131 while (pos < end && (hex ? isHexDigit(*pos) : isNumericASCII(*pos)))
132 ++pos;
133 }
134
135 /// exponentiation (base 10 or base 2)
136 if (pos + 1 < end && (hex ? (*pos == 'p' || *pos == 'P') : (*pos == 'e' || *pos == 'E')))
137 {
138 ++pos;
139
140 /// sign of exponent. It is always decimal.
141 if (pos + 1 < end && (*pos == '-' || *pos == '+'))
142 ++pos;
143
144 while (pos < end && isNumericASCII(*pos))
145 ++pos;
146 }
147 }
148
149 /// word character cannot go just after number (SELECT 123FROM)
150 if (pos < end && isWordCharASCII(*pos))
151 {
152 ++pos;
153 while (pos < end && isWordCharASCII(*pos))
154 ++pos;
155 return Token(TokenType::ErrorWrongNumber, token_begin, pos);
156 }
157
158 return Token(TokenType::Number, token_begin, pos);
159 }
160
161 case '\'':
162 return quotedString<'\'', TokenType::StringLiteral, TokenType::ErrorSingleQuoteIsNotClosed>(pos, token_begin, end);
163 case '"':
164 return quotedString<'"', TokenType::QuotedIdentifier, TokenType::ErrorDoubleQuoteIsNotClosed>(pos, token_begin, end);
165 case '`':
166 return quotedString<'`', TokenType::QuotedIdentifier, TokenType::ErrorBackQuoteIsNotClosed>(pos, token_begin, end);
167
168 case '(':
169 return Token(TokenType::OpeningRoundBracket, token_begin, ++pos);
170 case ')':
171 return Token(TokenType::ClosingRoundBracket, token_begin, ++pos);
172 case '[':
173 return Token(TokenType::OpeningSquareBracket, token_begin, ++pos);
174 case ']':
175 return Token(TokenType::ClosingSquareBracket, token_begin, ++pos);
176 case '{':
177 return Token(TokenType::OpeningCurlyBrace, token_begin, ++pos);
178 case '}':
179 return Token(TokenType::ClosingCurlyBrace, token_begin, ++pos);
180 case ',':
181 return Token(TokenType::Comma, token_begin, ++pos);
182 case ';':
183 return Token(TokenType::Semicolon, token_begin, ++pos);
184
185 case '.': /// qualifier, tuple access operator or start of floating point number
186 {
187 /// Just after identifier or complex expression or number (for chained tuple access like x.1.1 to work properly).
188 if (pos > begin
189 && (!(pos + 1 < end && isNumericASCII(pos[1]))
190 || prev_significant_token_type == TokenType::ClosingRoundBracket
191 || prev_significant_token_type == TokenType::ClosingSquareBracket
192 || prev_significant_token_type == TokenType::BareWord
193 || prev_significant_token_type == TokenType::QuotedIdentifier
194 || prev_significant_token_type == TokenType::Number))
195 return Token(TokenType::Dot, token_begin, ++pos);
196
197 ++pos;
198 while (pos < end && isNumericASCII(*pos))
199 ++pos;
200
201 /// exponentiation
202 if (pos + 1 < end && (*pos == 'e' || *pos == 'E'))
203 {
204 ++pos;
205
206 /// sign of exponent
207 if (pos + 1 < end && (*pos == '-' || *pos == '+'))
208 ++pos;
209
210 while (pos < end && isNumericASCII(*pos))
211 ++pos;
212 }
213
214 return Token(TokenType::Number, token_begin, pos);
215 }
216
217 case '+':
218 return Token(TokenType::Plus, token_begin, ++pos);
219 case '-': /// minus (-), arrow (->) or start of comment (--)
220 {
221 ++pos;
222 if (pos < end && *pos == '>')
223 return Token(TokenType::Arrow, token_begin, ++pos);
224
225 if (pos < end && *pos == '-')
226 {
227 ++pos;
228 return commentUntilEndOfLine();
229 }
230
231 return Token(TokenType::Minus, token_begin, pos);
232 }
233 case '*':
234 ++pos;
235 return Token(TokenType::Asterisk, token_begin, pos);
236 case '/': /// division (/) or start of comment (//, /*)
237 {
238 ++pos;
239 if (pos < end && (*pos == '/' || *pos == '*'))
240 {
241 if (*pos == '/')
242 {
243 ++pos;
244 return commentUntilEndOfLine();
245 }
246 else
247 {
248 ++pos;
249 while (pos + 2 <= end)
250 {
251 /// This means that nested multiline comments are not supported.
252 if (pos[0] == '*' && pos[1] == '/')
253 {
254 pos += 2;
255 return Token(TokenType::Comment, token_begin, pos);
256 }
257 ++pos;
258 }
259 return Token(TokenType::ErrorMultilineCommentIsNotClosed, token_begin, end);
260 }
261 }
262 return Token(TokenType::Slash, token_begin, pos);
263 }
264 case '%':
265 return Token(TokenType::Percent, token_begin, ++pos);
266 case '=': /// =, ==
267 {
268 ++pos;
269 if (pos < end && *pos == '=')
270 ++pos;
271 return Token(TokenType::Equals, token_begin, pos);
272 }
273 case '!': /// !=
274 {
275 ++pos;
276 if (pos < end && *pos == '=')
277 return Token(TokenType::NotEquals, token_begin, ++pos);
278 return Token(TokenType::ErrorSingleExclamationMark, token_begin, pos);
279 }
280 case '<': /// <, <=, <>
281 {
282 ++pos;
283 if (pos < end && *pos == '=')
284 return Token(TokenType::LessOrEquals, token_begin, ++pos);
285 if (pos < end && *pos == '>')
286 return Token(TokenType::NotEquals, token_begin, ++pos);
287 return Token(TokenType::Less, token_begin, pos);
288 }
289 case '>': /// >, >=
290 {
291 ++pos;
292 if (pos < end && *pos == '=')
293 return Token(TokenType::GreaterOrEquals, token_begin, ++pos);
294 return Token(TokenType::Greater, token_begin, pos);
295 }
296 case '?':
297 return Token(TokenType::QuestionMark, token_begin, ++pos);
298 case ':':
299 return Token(TokenType::Colon, token_begin, ++pos);
300 case '|':
301 {
302 ++pos;
303 if (pos < end && *pos == '|')
304 return Token(TokenType::Concatenation, token_begin, ++pos);
305 return Token(TokenType::ErrorSinglePipeMark, token_begin, pos);
306 }
307
308 default:
309 if (isWordCharASCII(*pos))
310 {
311 ++pos;
312 while (pos < end && isWordCharASCII(*pos))
313 ++pos;
314 return Token(TokenType::BareWord, token_begin, pos);
315 }
316 else
317 return Token(TokenType::Error, token_begin, ++pos);
318 }
319}
320
321
322const char * getTokenName(TokenType type)
323{
324 switch (type)
325 {
326#define M(TOKEN) \
327 case TokenType::TOKEN: return #TOKEN;
328APPLY_FOR_TOKENS(M)
329#undef M
330 }
331
332 __builtin_unreachable();
333}
334
335
336const char * getErrorTokenDescription(TokenType type)
337{
338 switch (type)
339 {
340 case TokenType::Error:
341 return "Unrecognized token";
342 case TokenType::ErrorMultilineCommentIsNotClosed:
343 return "Multiline comment is not closed";
344 case TokenType::ErrorSingleQuoteIsNotClosed:
345 return "Single quoted string is not closed";
346 case TokenType::ErrorDoubleQuoteIsNotClosed:
347 return "Double quoted string is not closed";
348 case TokenType::ErrorBackQuoteIsNotClosed:
349 return "Back quoted string is not closed";
350 case TokenType::ErrorSingleExclamationMark:
351 return "Exclamation mark can only occur in != operator";
352 case TokenType::ErrorSinglePipeMark:
353 return "Pipe symbol could only occur in || operator";
354 case TokenType::ErrorWrongNumber:
355 return "Wrong number";
356 case TokenType::ErrorMaxQuerySizeExceeded:
357 return "Max query size exceeded";
358 default:
359 return "Not an error";
360 }
361}
362
363}
364