1 | #include "duckdb/parser/parser.hpp" |
2 | |
3 | #include "duckdb/parser/parsed_data/create_table_info.hpp" |
4 | #include "duckdb/parser/parser_extension.hpp" |
5 | #include "duckdb/parser/query_error_context.hpp" |
6 | #include "duckdb/parser/query_node/select_node.hpp" |
7 | #include "duckdb/parser/statement/create_statement.hpp" |
8 | #include "duckdb/parser/statement/extension_statement.hpp" |
9 | #include "duckdb/parser/statement/select_statement.hpp" |
10 | #include "duckdb/parser/statement/update_statement.hpp" |
11 | #include "duckdb/parser/tableref/expressionlistref.hpp" |
12 | #include "duckdb/parser/transformer.hpp" |
13 | #include "parser/parser.hpp" |
14 | #include "postgres_parser.hpp" |
15 | |
16 | namespace duckdb { |
17 | |
18 | Parser::Parser(ParserOptions options_p) : options(options_p) { |
19 | } |
20 | |
21 | struct UnicodeSpace { |
22 | UnicodeSpace(idx_t pos, idx_t bytes) : pos(pos), bytes(bytes) { |
23 | } |
24 | |
25 | idx_t pos; |
26 | idx_t bytes; |
27 | }; |
28 | |
29 | static bool ReplaceUnicodeSpaces(const string &query, string &new_query, vector<UnicodeSpace> &unicode_spaces) { |
30 | if (unicode_spaces.empty()) { |
31 | // no unicode spaces found |
32 | return false; |
33 | } |
34 | idx_t prev = 0; |
35 | for (auto &usp : unicode_spaces) { |
36 | new_query += query.substr(pos: prev, n: usp.pos - prev); |
37 | new_query += " " ; |
38 | prev = usp.pos + usp.bytes; |
39 | } |
40 | new_query += query.substr(pos: prev, n: query.size() - prev); |
41 | return true; |
42 | } |
43 | |
44 | // This function strips unicode space characters from the query and replaces them with regular spaces |
45 | // It returns true if any unicode space characters were found and stripped |
46 | // See here for a list of unicode space characters - https://jkorpela.fi/chars/spaces.html |
47 | bool Parser::StripUnicodeSpaces(const string &query_str, string &new_query) { |
48 | const idx_t NBSP_LEN = 2; |
49 | const idx_t USP_LEN = 3; |
50 | idx_t pos = 0; |
51 | unsigned char quote; |
52 | vector<UnicodeSpace> unicode_spaces; |
53 | auto query = const_uchar_ptr_cast(src: query_str.c_str()); |
54 | auto qsize = query_str.size(); |
55 | |
56 | regular: |
57 | for (; pos + 2 < qsize; pos++) { |
58 | if (query[pos] == 0xC2) { |
59 | if (query[pos + 1] == 0xA0) { |
60 | // U+00A0 - C2A0 |
61 | unicode_spaces.emplace_back(args&: pos, args: NBSP_LEN); |
62 | } |
63 | } |
64 | if (query[pos] == 0xE2) { |
65 | if (query[pos + 1] == 0x80) { |
66 | if (query[pos + 2] >= 0x80 && query[pos + 2] <= 0x8B) { |
67 | // U+2000 to U+200B |
68 | // E28080 - E2808B |
69 | unicode_spaces.emplace_back(args&: pos, args: USP_LEN); |
70 | } else if (query[pos + 2] == 0xAF) { |
71 | // U+202F - E280AF |
72 | unicode_spaces.emplace_back(args&: pos, args: USP_LEN); |
73 | } |
74 | } else if (query[pos + 1] == 0x81) { |
75 | if (query[pos + 2] == 0x9F) { |
76 | // U+205F - E2819f |
77 | unicode_spaces.emplace_back(args&: pos, args: USP_LEN); |
78 | } else if (query[pos + 2] == 0xA0) { |
79 | // U+2060 - E281A0 |
80 | unicode_spaces.emplace_back(args&: pos, args: USP_LEN); |
81 | } |
82 | } |
83 | } else if (query[pos] == 0xE3) { |
84 | if (query[pos + 1] == 0x80 && query[pos + 2] == 0x80) { |
85 | // U+3000 - E38080 |
86 | unicode_spaces.emplace_back(args&: pos, args: USP_LEN); |
87 | } |
88 | } else if (query[pos] == 0xEF) { |
89 | if (query[pos + 1] == 0xBB && query[pos + 2] == 0xBF) { |
90 | // U+FEFF - EFBBBF |
91 | unicode_spaces.emplace_back(args&: pos, args: USP_LEN); |
92 | } |
93 | } else if (query[pos] == '"' || query[pos] == '\'') { |
94 | quote = query[pos]; |
95 | pos++; |
96 | goto in_quotes; |
97 | } else if (query[pos] == '-' && query[pos + 1] == '-') { |
98 | goto in_comment; |
99 | } |
100 | } |
101 | goto end; |
102 | in_quotes: |
103 | for (; pos + 1 < qsize; pos++) { |
104 | if (query[pos] == quote) { |
105 | if (query[pos + 1] == quote) { |
106 | // escaped quote |
107 | pos++; |
108 | continue; |
109 | } |
110 | pos++; |
111 | goto regular; |
112 | } |
113 | } |
114 | goto end; |
115 | : |
116 | for (; pos < qsize; pos++) { |
117 | if (query[pos] == '\n' || query[pos] == '\r') { |
118 | goto regular; |
119 | } |
120 | } |
121 | goto end; |
122 | end: |
123 | return ReplaceUnicodeSpaces(query: query_str, new_query, unicode_spaces); |
124 | } |
125 | |
126 | vector<string> SplitQueryStringIntoStatements(const string &query) { |
127 | // Break sql string down into sql statements using the tokenizer |
128 | vector<string> query_statements; |
129 | auto tokens = Parser::Tokenize(query); |
130 | auto next_statement_start = 0; |
131 | for (idx_t i = 1; i < tokens.size(); ++i) { |
132 | auto &t_prev = tokens[i - 1]; |
133 | auto &t = tokens[i]; |
134 | if (t_prev.type == SimplifiedTokenType::SIMPLIFIED_TOKEN_OPERATOR) { |
135 | // LCOV_EXCL_START |
136 | for (idx_t c = t_prev.start; c <= t.start; ++c) { |
137 | if (query.c_str()[c] == ';') { |
138 | query_statements.emplace_back(args: query.substr(pos: next_statement_start, n: t.start - next_statement_start)); |
139 | next_statement_start = tokens[i].start; |
140 | } |
141 | } |
142 | // LCOV_EXCL_STOP |
143 | } |
144 | } |
145 | query_statements.emplace_back(args: query.substr(pos: next_statement_start, n: query.size() - next_statement_start)); |
146 | return query_statements; |
147 | } |
148 | |
149 | void Parser::ParseQuery(const string &query) { |
150 | Transformer transformer(options); |
151 | string parser_error; |
152 | { |
153 | // check if there are any unicode spaces in the string |
154 | string new_query; |
155 | if (StripUnicodeSpaces(query_str: query, new_query)) { |
156 | // there are - strip the unicode spaces and re-run the query |
157 | ParseQuery(query: new_query); |
158 | return; |
159 | } |
160 | } |
161 | { |
162 | PostgresParser::SetPreserveIdentifierCase(options.preserve_identifier_case); |
163 | bool parsing_succeed = false; |
164 | // Creating a new scope to prevent multiple PostgresParser destructors being called |
165 | // which led to some memory issues |
166 | { |
167 | PostgresParser parser; |
168 | parser.Parse(query); |
169 | if (parser.success) { |
170 | if (!parser.parse_tree) { |
171 | // empty statement |
172 | return; |
173 | } |
174 | |
175 | // if it succeeded, we transform the Postgres parse tree into a list of |
176 | // SQLStatements |
177 | transformer.TransformParseTree(tree: parser.parse_tree, statements); |
178 | parsing_succeed = true; |
179 | } else { |
180 | parser_error = QueryErrorContext::Format(query, error_message: parser.error_message, error_location: parser.error_location - 1); |
181 | } |
182 | } |
183 | // If DuckDB fails to parse the entire sql string, break the string down into individual statements |
184 | // using ';' as the delimiter so that parser extensions can parse the statement |
185 | if (parsing_succeed) { |
186 | // no-op |
187 | // return here would require refactoring into another function. o.w. will just no-op in order to run wrap up |
188 | // code at the end of this function |
189 | } else if (!options.extensions || options.extensions->empty()) { |
190 | throw ParserException(parser_error); |
191 | } else { |
192 | // split sql string into statements and re-parse using extension |
193 | auto query_statements = SplitQueryStringIntoStatements(query); |
194 | for (auto const &query_statement : query_statements) { |
195 | PostgresParser another_parser; |
196 | another_parser.Parse(query: query_statement); |
197 | // LCOV_EXCL_START |
198 | // first see if DuckDB can parse this individual query statement |
199 | if (another_parser.success) { |
200 | if (!another_parser.parse_tree) { |
201 | // empty statement |
202 | continue; |
203 | } |
204 | transformer.TransformParseTree(tree: another_parser.parse_tree, statements); |
205 | } else { |
206 | // let extensions parse the statement which DuckDB failed to parse |
207 | bool parsed_single_statement = false; |
208 | for (auto &ext : *options.extensions) { |
209 | D_ASSERT(!parsed_single_statement); |
210 | D_ASSERT(ext.parse_function); |
211 | auto result = ext.parse_function(ext.parser_info.get(), query_statement); |
212 | if (result.type == ParserExtensionResultType::PARSE_SUCCESSFUL) { |
213 | auto statement = make_uniq<ExtensionStatement>(args: ext, args: std::move(result.parse_data)); |
214 | statement->stmt_length = query_statement.size(); |
215 | statement->stmt_location = 0; |
216 | statements.push_back(x: std::move(statement)); |
217 | parsed_single_statement = true; |
218 | break; |
219 | } else if (result.type == ParserExtensionResultType::DISPLAY_EXTENSION_ERROR) { |
220 | throw ParserException(result.error); |
221 | } else { |
222 | // We move to the next one! |
223 | } |
224 | } |
225 | if (!parsed_single_statement) { |
226 | parser_error = QueryErrorContext::Format(query, error_message: another_parser.error_message, |
227 | error_location: another_parser.error_location - 1); |
228 | throw ParserException(parser_error); |
229 | } |
230 | } |
231 | // LCOV_EXCL_STOP |
232 | } |
233 | } |
234 | } |
235 | if (!statements.empty()) { |
236 | auto &last_statement = statements.back(); |
237 | last_statement->stmt_length = query.size() - last_statement->stmt_location; |
238 | for (auto &statement : statements) { |
239 | statement->query = query; |
240 | if (statement->type == StatementType::CREATE_STATEMENT) { |
241 | auto &create = statement->Cast<CreateStatement>(); |
242 | create.info->sql = query.substr(pos: statement->stmt_location, n: statement->stmt_length); |
243 | } |
244 | } |
245 | } |
246 | } |
247 | |
248 | vector<SimplifiedToken> Parser::Tokenize(const string &query) { |
249 | auto pg_tokens = PostgresParser::Tokenize(query); |
250 | vector<SimplifiedToken> result; |
251 | result.reserve(n: pg_tokens.size()); |
252 | for (auto &pg_token : pg_tokens) { |
253 | SimplifiedToken token; |
254 | switch (pg_token.type) { |
255 | case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_IDENTIFIER: |
256 | token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_IDENTIFIER; |
257 | break; |
258 | case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_NUMERIC_CONSTANT: |
259 | token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_NUMERIC_CONSTANT; |
260 | break; |
261 | case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_STRING_CONSTANT: |
262 | token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_STRING_CONSTANT; |
263 | break; |
264 | case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_OPERATOR: |
265 | token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_OPERATOR; |
266 | break; |
267 | case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_KEYWORD: |
268 | token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_KEYWORD; |
269 | break; |
270 | // comments are not supported by our tokenizer right now |
271 | case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_COMMENT: // LCOV_EXCL_START |
272 | token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_COMMENT; |
273 | break; |
274 | default: |
275 | throw InternalException("Unrecognized token category" ); |
276 | } // LCOV_EXCL_STOP |
277 | token.start = pg_token.start; |
278 | result.push_back(x: token); |
279 | } |
280 | return result; |
281 | } |
282 | |
283 | bool Parser::IsKeyword(const string &text) { |
284 | return PostgresParser::IsKeyword(text); |
285 | } |
286 | |
287 | vector<ParserKeyword> Parser::KeywordList() { |
288 | auto keywords = PostgresParser::KeywordList(); |
289 | vector<ParserKeyword> result; |
290 | for (auto &kw : keywords) { |
291 | ParserKeyword res; |
292 | res.name = kw.text; |
293 | switch (kw.category) { |
294 | case duckdb_libpgquery::PGKeywordCategory::PG_KEYWORD_RESERVED: |
295 | res.category = KeywordCategory::KEYWORD_RESERVED; |
296 | break; |
297 | case duckdb_libpgquery::PGKeywordCategory::PG_KEYWORD_UNRESERVED: |
298 | res.category = KeywordCategory::KEYWORD_UNRESERVED; |
299 | break; |
300 | case duckdb_libpgquery::PGKeywordCategory::PG_KEYWORD_TYPE_FUNC: |
301 | res.category = KeywordCategory::KEYWORD_TYPE_FUNC; |
302 | break; |
303 | case duckdb_libpgquery::PGKeywordCategory::PG_KEYWORD_COL_NAME: |
304 | res.category = KeywordCategory::KEYWORD_COL_NAME; |
305 | break; |
306 | default: |
307 | throw InternalException("Unrecognized keyword category" ); |
308 | } |
309 | result.push_back(x: res); |
310 | } |
311 | return result; |
312 | } |
313 | |
314 | vector<unique_ptr<ParsedExpression>> Parser::ParseExpressionList(const string &select_list, ParserOptions options) { |
315 | // construct a mock query prefixed with SELECT |
316 | string mock_query = "SELECT " + select_list; |
317 | // parse the query |
318 | Parser parser(options); |
319 | parser.ParseQuery(query: mock_query); |
320 | // check the statements |
321 | if (parser.statements.size() != 1 || parser.statements[0]->type != StatementType::SELECT_STATEMENT) { |
322 | throw ParserException("Expected a single SELECT statement" ); |
323 | } |
324 | auto &select = parser.statements[0]->Cast<SelectStatement>(); |
325 | if (select.node->type != QueryNodeType::SELECT_NODE) { |
326 | throw ParserException("Expected a single SELECT node" ); |
327 | } |
328 | auto &select_node = select.node->Cast<SelectNode>(); |
329 | return std::move(select_node.select_list); |
330 | } |
331 | |
332 | vector<OrderByNode> Parser::ParseOrderList(const string &select_list, ParserOptions options) { |
333 | // construct a mock query |
334 | string mock_query = "SELECT * FROM tbl ORDER BY " + select_list; |
335 | // parse the query |
336 | Parser parser(options); |
337 | parser.ParseQuery(query: mock_query); |
338 | // check the statements |
339 | if (parser.statements.size() != 1 || parser.statements[0]->type != StatementType::SELECT_STATEMENT) { |
340 | throw ParserException("Expected a single SELECT statement" ); |
341 | } |
342 | auto &select = parser.statements[0]->Cast<SelectStatement>(); |
343 | if (select.node->type != QueryNodeType::SELECT_NODE) { |
344 | throw ParserException("Expected a single SELECT node" ); |
345 | } |
346 | auto &select_node = select.node->Cast<SelectNode>(); |
347 | if (select_node.modifiers.empty() || select_node.modifiers[0]->type != ResultModifierType::ORDER_MODIFIER || |
348 | select_node.modifiers.size() != 1) { |
349 | throw ParserException("Expected a single ORDER clause" ); |
350 | } |
351 | auto &order = select_node.modifiers[0]->Cast<OrderModifier>(); |
352 | return std::move(order.orders); |
353 | } |
354 | |
355 | void Parser::ParseUpdateList(const string &update_list, vector<string> &update_columns, |
356 | vector<unique_ptr<ParsedExpression>> &expressions, ParserOptions options) { |
357 | // construct a mock query |
358 | string mock_query = "UPDATE tbl SET " + update_list; |
359 | // parse the query |
360 | Parser parser(options); |
361 | parser.ParseQuery(query: mock_query); |
362 | // check the statements |
363 | if (parser.statements.size() != 1 || parser.statements[0]->type != StatementType::UPDATE_STATEMENT) { |
364 | throw ParserException("Expected a single UPDATE statement" ); |
365 | } |
366 | auto &update = parser.statements[0]->Cast<UpdateStatement>(); |
367 | update_columns = std::move(update.set_info->columns); |
368 | expressions = std::move(update.set_info->expressions); |
369 | } |
370 | |
371 | vector<vector<unique_ptr<ParsedExpression>>> Parser::ParseValuesList(const string &value_list, ParserOptions options) { |
372 | // construct a mock query |
373 | string mock_query = "VALUES " + value_list; |
374 | // parse the query |
375 | Parser parser(options); |
376 | parser.ParseQuery(query: mock_query); |
377 | // check the statements |
378 | if (parser.statements.size() != 1 || parser.statements[0]->type != StatementType::SELECT_STATEMENT) { |
379 | throw ParserException("Expected a single SELECT statement" ); |
380 | } |
381 | auto &select = parser.statements[0]->Cast<SelectStatement>(); |
382 | if (select.node->type != QueryNodeType::SELECT_NODE) { |
383 | throw ParserException("Expected a single SELECT node" ); |
384 | } |
385 | auto &select_node = select.node->Cast<SelectNode>(); |
386 | if (!select_node.from_table || select_node.from_table->type != TableReferenceType::EXPRESSION_LIST) { |
387 | throw ParserException("Expected a single VALUES statement" ); |
388 | } |
389 | auto &values_list = select_node.from_table->Cast<ExpressionListRef>(); |
390 | return std::move(values_list.values); |
391 | } |
392 | |
393 | ColumnList Parser::ParseColumnList(const string &column_list, ParserOptions options) { |
394 | string mock_query = "CREATE TABLE blabla (" + column_list + ")" ; |
395 | Parser parser(options); |
396 | parser.ParseQuery(query: mock_query); |
397 | if (parser.statements.size() != 1 || parser.statements[0]->type != StatementType::CREATE_STATEMENT) { |
398 | throw ParserException("Expected a single CREATE statement" ); |
399 | } |
400 | auto &create = parser.statements[0]->Cast<CreateStatement>(); |
401 | if (create.info->type != CatalogType::TABLE_ENTRY) { |
402 | throw InternalException("Expected a single CREATE TABLE statement" ); |
403 | } |
404 | auto &info = create.info->Cast<CreateTableInfo>(); |
405 | return std::move(info.columns); |
406 | } |
407 | |
408 | } // namespace duckdb |
409 | |