1#include "duckdb/parser/parser.hpp"
2
3#include "duckdb/parser/parsed_data/create_table_info.hpp"
4#include "duckdb/parser/parser_extension.hpp"
5#include "duckdb/parser/query_error_context.hpp"
6#include "duckdb/parser/query_node/select_node.hpp"
7#include "duckdb/parser/statement/create_statement.hpp"
8#include "duckdb/parser/statement/extension_statement.hpp"
9#include "duckdb/parser/statement/select_statement.hpp"
10#include "duckdb/parser/statement/update_statement.hpp"
11#include "duckdb/parser/tableref/expressionlistref.hpp"
12#include "duckdb/parser/transformer.hpp"
13#include "parser/parser.hpp"
14#include "postgres_parser.hpp"
15
16namespace duckdb {
17
18Parser::Parser(ParserOptions options_p) : options(options_p) {
19}
20
21struct UnicodeSpace {
22 UnicodeSpace(idx_t pos, idx_t bytes) : pos(pos), bytes(bytes) {
23 }
24
25 idx_t pos;
26 idx_t bytes;
27};
28
29static bool ReplaceUnicodeSpaces(const string &query, string &new_query, vector<UnicodeSpace> &unicode_spaces) {
30 if (unicode_spaces.empty()) {
31 // no unicode spaces found
32 return false;
33 }
34 idx_t prev = 0;
35 for (auto &usp : unicode_spaces) {
36 new_query += query.substr(pos: prev, n: usp.pos - prev);
37 new_query += " ";
38 prev = usp.pos + usp.bytes;
39 }
40 new_query += query.substr(pos: prev, n: query.size() - prev);
41 return true;
42}
43
44// This function strips unicode space characters from the query and replaces them with regular spaces
45// It returns true if any unicode space characters were found and stripped
46// See here for a list of unicode space characters - https://jkorpela.fi/chars/spaces.html
47bool Parser::StripUnicodeSpaces(const string &query_str, string &new_query) {
48 const idx_t NBSP_LEN = 2;
49 const idx_t USP_LEN = 3;
50 idx_t pos = 0;
51 unsigned char quote;
52 vector<UnicodeSpace> unicode_spaces;
53 auto query = const_uchar_ptr_cast(src: query_str.c_str());
54 auto qsize = query_str.size();
55
56regular:
57 for (; pos + 2 < qsize; pos++) {
58 if (query[pos] == 0xC2) {
59 if (query[pos + 1] == 0xA0) {
60 // U+00A0 - C2A0
61 unicode_spaces.emplace_back(args&: pos, args: NBSP_LEN);
62 }
63 }
64 if (query[pos] == 0xE2) {
65 if (query[pos + 1] == 0x80) {
66 if (query[pos + 2] >= 0x80 && query[pos + 2] <= 0x8B) {
67 // U+2000 to U+200B
68 // E28080 - E2808B
69 unicode_spaces.emplace_back(args&: pos, args: USP_LEN);
70 } else if (query[pos + 2] == 0xAF) {
71 // U+202F - E280AF
72 unicode_spaces.emplace_back(args&: pos, args: USP_LEN);
73 }
74 } else if (query[pos + 1] == 0x81) {
75 if (query[pos + 2] == 0x9F) {
76 // U+205F - E2819f
77 unicode_spaces.emplace_back(args&: pos, args: USP_LEN);
78 } else if (query[pos + 2] == 0xA0) {
79 // U+2060 - E281A0
80 unicode_spaces.emplace_back(args&: pos, args: USP_LEN);
81 }
82 }
83 } else if (query[pos] == 0xE3) {
84 if (query[pos + 1] == 0x80 && query[pos + 2] == 0x80) {
85 // U+3000 - E38080
86 unicode_spaces.emplace_back(args&: pos, args: USP_LEN);
87 }
88 } else if (query[pos] == 0xEF) {
89 if (query[pos + 1] == 0xBB && query[pos + 2] == 0xBF) {
90 // U+FEFF - EFBBBF
91 unicode_spaces.emplace_back(args&: pos, args: USP_LEN);
92 }
93 } else if (query[pos] == '"' || query[pos] == '\'') {
94 quote = query[pos];
95 pos++;
96 goto in_quotes;
97 } else if (query[pos] == '-' && query[pos + 1] == '-') {
98 goto in_comment;
99 }
100 }
101 goto end;
102in_quotes:
103 for (; pos + 1 < qsize; pos++) {
104 if (query[pos] == quote) {
105 if (query[pos + 1] == quote) {
106 // escaped quote
107 pos++;
108 continue;
109 }
110 pos++;
111 goto regular;
112 }
113 }
114 goto end;
115in_comment:
116 for (; pos < qsize; pos++) {
117 if (query[pos] == '\n' || query[pos] == '\r') {
118 goto regular;
119 }
120 }
121 goto end;
122end:
123 return ReplaceUnicodeSpaces(query: query_str, new_query, unicode_spaces);
124}
125
126vector<string> SplitQueryStringIntoStatements(const string &query) {
127 // Break sql string down into sql statements using the tokenizer
128 vector<string> query_statements;
129 auto tokens = Parser::Tokenize(query);
130 auto next_statement_start = 0;
131 for (idx_t i = 1; i < tokens.size(); ++i) {
132 auto &t_prev = tokens[i - 1];
133 auto &t = tokens[i];
134 if (t_prev.type == SimplifiedTokenType::SIMPLIFIED_TOKEN_OPERATOR) {
135 // LCOV_EXCL_START
136 for (idx_t c = t_prev.start; c <= t.start; ++c) {
137 if (query.c_str()[c] == ';') {
138 query_statements.emplace_back(args: query.substr(pos: next_statement_start, n: t.start - next_statement_start));
139 next_statement_start = tokens[i].start;
140 }
141 }
142 // LCOV_EXCL_STOP
143 }
144 }
145 query_statements.emplace_back(args: query.substr(pos: next_statement_start, n: query.size() - next_statement_start));
146 return query_statements;
147}
148
149void Parser::ParseQuery(const string &query) {
150 Transformer transformer(options);
151 string parser_error;
152 {
153 // check if there are any unicode spaces in the string
154 string new_query;
155 if (StripUnicodeSpaces(query_str: query, new_query)) {
156 // there are - strip the unicode spaces and re-run the query
157 ParseQuery(query: new_query);
158 return;
159 }
160 }
161 {
162 PostgresParser::SetPreserveIdentifierCase(options.preserve_identifier_case);
163 bool parsing_succeed = false;
164 // Creating a new scope to prevent multiple PostgresParser destructors being called
165 // which led to some memory issues
166 {
167 PostgresParser parser;
168 parser.Parse(query);
169 if (parser.success) {
170 if (!parser.parse_tree) {
171 // empty statement
172 return;
173 }
174
175 // if it succeeded, we transform the Postgres parse tree into a list of
176 // SQLStatements
177 transformer.TransformParseTree(tree: parser.parse_tree, statements);
178 parsing_succeed = true;
179 } else {
180 parser_error = QueryErrorContext::Format(query, error_message: parser.error_message, error_location: parser.error_location - 1);
181 }
182 }
183 // If DuckDB fails to parse the entire sql string, break the string down into individual statements
184 // using ';' as the delimiter so that parser extensions can parse the statement
185 if (parsing_succeed) {
186 // no-op
187 // return here would require refactoring into another function. o.w. will just no-op in order to run wrap up
188 // code at the end of this function
189 } else if (!options.extensions || options.extensions->empty()) {
190 throw ParserException(parser_error);
191 } else {
192 // split sql string into statements and re-parse using extension
193 auto query_statements = SplitQueryStringIntoStatements(query);
194 for (auto const &query_statement : query_statements) {
195 PostgresParser another_parser;
196 another_parser.Parse(query: query_statement);
197 // LCOV_EXCL_START
198 // first see if DuckDB can parse this individual query statement
199 if (another_parser.success) {
200 if (!another_parser.parse_tree) {
201 // empty statement
202 continue;
203 }
204 transformer.TransformParseTree(tree: another_parser.parse_tree, statements);
205 } else {
206 // let extensions parse the statement which DuckDB failed to parse
207 bool parsed_single_statement = false;
208 for (auto &ext : *options.extensions) {
209 D_ASSERT(!parsed_single_statement);
210 D_ASSERT(ext.parse_function);
211 auto result = ext.parse_function(ext.parser_info.get(), query_statement);
212 if (result.type == ParserExtensionResultType::PARSE_SUCCESSFUL) {
213 auto statement = make_uniq<ExtensionStatement>(args: ext, args: std::move(result.parse_data));
214 statement->stmt_length = query_statement.size();
215 statement->stmt_location = 0;
216 statements.push_back(x: std::move(statement));
217 parsed_single_statement = true;
218 break;
219 } else if (result.type == ParserExtensionResultType::DISPLAY_EXTENSION_ERROR) {
220 throw ParserException(result.error);
221 } else {
222 // We move to the next one!
223 }
224 }
225 if (!parsed_single_statement) {
226 parser_error = QueryErrorContext::Format(query, error_message: another_parser.error_message,
227 error_location: another_parser.error_location - 1);
228 throw ParserException(parser_error);
229 }
230 }
231 // LCOV_EXCL_STOP
232 }
233 }
234 }
235 if (!statements.empty()) {
236 auto &last_statement = statements.back();
237 last_statement->stmt_length = query.size() - last_statement->stmt_location;
238 for (auto &statement : statements) {
239 statement->query = query;
240 if (statement->type == StatementType::CREATE_STATEMENT) {
241 auto &create = statement->Cast<CreateStatement>();
242 create.info->sql = query.substr(pos: statement->stmt_location, n: statement->stmt_length);
243 }
244 }
245 }
246}
247
248vector<SimplifiedToken> Parser::Tokenize(const string &query) {
249 auto pg_tokens = PostgresParser::Tokenize(query);
250 vector<SimplifiedToken> result;
251 result.reserve(n: pg_tokens.size());
252 for (auto &pg_token : pg_tokens) {
253 SimplifiedToken token;
254 switch (pg_token.type) {
255 case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_IDENTIFIER:
256 token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_IDENTIFIER;
257 break;
258 case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_NUMERIC_CONSTANT:
259 token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_NUMERIC_CONSTANT;
260 break;
261 case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_STRING_CONSTANT:
262 token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_STRING_CONSTANT;
263 break;
264 case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_OPERATOR:
265 token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_OPERATOR;
266 break;
267 case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_KEYWORD:
268 token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_KEYWORD;
269 break;
270 // comments are not supported by our tokenizer right now
271 case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_COMMENT: // LCOV_EXCL_START
272 token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_COMMENT;
273 break;
274 default:
275 throw InternalException("Unrecognized token category");
276 } // LCOV_EXCL_STOP
277 token.start = pg_token.start;
278 result.push_back(x: token);
279 }
280 return result;
281}
282
283bool Parser::IsKeyword(const string &text) {
284 return PostgresParser::IsKeyword(text);
285}
286
287vector<ParserKeyword> Parser::KeywordList() {
288 auto keywords = PostgresParser::KeywordList();
289 vector<ParserKeyword> result;
290 for (auto &kw : keywords) {
291 ParserKeyword res;
292 res.name = kw.text;
293 switch (kw.category) {
294 case duckdb_libpgquery::PGKeywordCategory::PG_KEYWORD_RESERVED:
295 res.category = KeywordCategory::KEYWORD_RESERVED;
296 break;
297 case duckdb_libpgquery::PGKeywordCategory::PG_KEYWORD_UNRESERVED:
298 res.category = KeywordCategory::KEYWORD_UNRESERVED;
299 break;
300 case duckdb_libpgquery::PGKeywordCategory::PG_KEYWORD_TYPE_FUNC:
301 res.category = KeywordCategory::KEYWORD_TYPE_FUNC;
302 break;
303 case duckdb_libpgquery::PGKeywordCategory::PG_KEYWORD_COL_NAME:
304 res.category = KeywordCategory::KEYWORD_COL_NAME;
305 break;
306 default:
307 throw InternalException("Unrecognized keyword category");
308 }
309 result.push_back(x: res);
310 }
311 return result;
312}
313
314vector<unique_ptr<ParsedExpression>> Parser::ParseExpressionList(const string &select_list, ParserOptions options) {
315 // construct a mock query prefixed with SELECT
316 string mock_query = "SELECT " + select_list;
317 // parse the query
318 Parser parser(options);
319 parser.ParseQuery(query: mock_query);
320 // check the statements
321 if (parser.statements.size() != 1 || parser.statements[0]->type != StatementType::SELECT_STATEMENT) {
322 throw ParserException("Expected a single SELECT statement");
323 }
324 auto &select = parser.statements[0]->Cast<SelectStatement>();
325 if (select.node->type != QueryNodeType::SELECT_NODE) {
326 throw ParserException("Expected a single SELECT node");
327 }
328 auto &select_node = select.node->Cast<SelectNode>();
329 return std::move(select_node.select_list);
330}
331
332vector<OrderByNode> Parser::ParseOrderList(const string &select_list, ParserOptions options) {
333 // construct a mock query
334 string mock_query = "SELECT * FROM tbl ORDER BY " + select_list;
335 // parse the query
336 Parser parser(options);
337 parser.ParseQuery(query: mock_query);
338 // check the statements
339 if (parser.statements.size() != 1 || parser.statements[0]->type != StatementType::SELECT_STATEMENT) {
340 throw ParserException("Expected a single SELECT statement");
341 }
342 auto &select = parser.statements[0]->Cast<SelectStatement>();
343 if (select.node->type != QueryNodeType::SELECT_NODE) {
344 throw ParserException("Expected a single SELECT node");
345 }
346 auto &select_node = select.node->Cast<SelectNode>();
347 if (select_node.modifiers.empty() || select_node.modifiers[0]->type != ResultModifierType::ORDER_MODIFIER ||
348 select_node.modifiers.size() != 1) {
349 throw ParserException("Expected a single ORDER clause");
350 }
351 auto &order = select_node.modifiers[0]->Cast<OrderModifier>();
352 return std::move(order.orders);
353}
354
355void Parser::ParseUpdateList(const string &update_list, vector<string> &update_columns,
356 vector<unique_ptr<ParsedExpression>> &expressions, ParserOptions options) {
357 // construct a mock query
358 string mock_query = "UPDATE tbl SET " + update_list;
359 // parse the query
360 Parser parser(options);
361 parser.ParseQuery(query: mock_query);
362 // check the statements
363 if (parser.statements.size() != 1 || parser.statements[0]->type != StatementType::UPDATE_STATEMENT) {
364 throw ParserException("Expected a single UPDATE statement");
365 }
366 auto &update = parser.statements[0]->Cast<UpdateStatement>();
367 update_columns = std::move(update.set_info->columns);
368 expressions = std::move(update.set_info->expressions);
369}
370
371vector<vector<unique_ptr<ParsedExpression>>> Parser::ParseValuesList(const string &value_list, ParserOptions options) {
372 // construct a mock query
373 string mock_query = "VALUES " + value_list;
374 // parse the query
375 Parser parser(options);
376 parser.ParseQuery(query: mock_query);
377 // check the statements
378 if (parser.statements.size() != 1 || parser.statements[0]->type != StatementType::SELECT_STATEMENT) {
379 throw ParserException("Expected a single SELECT statement");
380 }
381 auto &select = parser.statements[0]->Cast<SelectStatement>();
382 if (select.node->type != QueryNodeType::SELECT_NODE) {
383 throw ParserException("Expected a single SELECT node");
384 }
385 auto &select_node = select.node->Cast<SelectNode>();
386 if (!select_node.from_table || select_node.from_table->type != TableReferenceType::EXPRESSION_LIST) {
387 throw ParserException("Expected a single VALUES statement");
388 }
389 auto &values_list = select_node.from_table->Cast<ExpressionListRef>();
390 return std::move(values_list.values);
391}
392
393ColumnList Parser::ParseColumnList(const string &column_list, ParserOptions options) {
394 string mock_query = "CREATE TABLE blabla (" + column_list + ")";
395 Parser parser(options);
396 parser.ParseQuery(query: mock_query);
397 if (parser.statements.size() != 1 || parser.statements[0]->type != StatementType::CREATE_STATEMENT) {
398 throw ParserException("Expected a single CREATE statement");
399 }
400 auto &create = parser.statements[0]->Cast<CreateStatement>();
401 if (create.info->type != CatalogType::TABLE_ENTRY) {
402 throw InternalException("Expected a single CREATE TABLE statement");
403 }
404 auto &info = create.info->Cast<CreateTableInfo>();
405 return std::move(info.columns);
406}
407
408} // namespace duckdb
409