| 1 | #include "duckdb/parser/parser.hpp" |
| 2 | |
| 3 | #include "duckdb/parser/parsed_data/create_table_info.hpp" |
| 4 | #include "duckdb/parser/parser_extension.hpp" |
| 5 | #include "duckdb/parser/query_error_context.hpp" |
| 6 | #include "duckdb/parser/query_node/select_node.hpp" |
| 7 | #include "duckdb/parser/statement/create_statement.hpp" |
| 8 | #include "duckdb/parser/statement/extension_statement.hpp" |
| 9 | #include "duckdb/parser/statement/select_statement.hpp" |
| 10 | #include "duckdb/parser/statement/update_statement.hpp" |
| 11 | #include "duckdb/parser/tableref/expressionlistref.hpp" |
| 12 | #include "duckdb/parser/transformer.hpp" |
| 13 | #include "parser/parser.hpp" |
| 14 | #include "postgres_parser.hpp" |
| 15 | |
| 16 | namespace duckdb { |
| 17 | |
| 18 | Parser::Parser(ParserOptions options_p) : options(options_p) { |
| 19 | } |
| 20 | |
| 21 | struct UnicodeSpace { |
| 22 | UnicodeSpace(idx_t pos, idx_t bytes) : pos(pos), bytes(bytes) { |
| 23 | } |
| 24 | |
| 25 | idx_t pos; |
| 26 | idx_t bytes; |
| 27 | }; |
| 28 | |
| 29 | static bool ReplaceUnicodeSpaces(const string &query, string &new_query, vector<UnicodeSpace> &unicode_spaces) { |
| 30 | if (unicode_spaces.empty()) { |
| 31 | // no unicode spaces found |
| 32 | return false; |
| 33 | } |
| 34 | idx_t prev = 0; |
| 35 | for (auto &usp : unicode_spaces) { |
| 36 | new_query += query.substr(pos: prev, n: usp.pos - prev); |
| 37 | new_query += " " ; |
| 38 | prev = usp.pos + usp.bytes; |
| 39 | } |
| 40 | new_query += query.substr(pos: prev, n: query.size() - prev); |
| 41 | return true; |
| 42 | } |
| 43 | |
| 44 | // This function strips unicode space characters from the query and replaces them with regular spaces |
| 45 | // It returns true if any unicode space characters were found and stripped |
| 46 | // See here for a list of unicode space characters - https://jkorpela.fi/chars/spaces.html |
| 47 | bool Parser::StripUnicodeSpaces(const string &query_str, string &new_query) { |
| 48 | const idx_t NBSP_LEN = 2; |
| 49 | const idx_t USP_LEN = 3; |
| 50 | idx_t pos = 0; |
| 51 | unsigned char quote; |
| 52 | vector<UnicodeSpace> unicode_spaces; |
| 53 | auto query = const_uchar_ptr_cast(src: query_str.c_str()); |
| 54 | auto qsize = query_str.size(); |
| 55 | |
| 56 | regular: |
| 57 | for (; pos + 2 < qsize; pos++) { |
| 58 | if (query[pos] == 0xC2) { |
| 59 | if (query[pos + 1] == 0xA0) { |
| 60 | // U+00A0 - C2A0 |
| 61 | unicode_spaces.emplace_back(args&: pos, args: NBSP_LEN); |
| 62 | } |
| 63 | } |
| 64 | if (query[pos] == 0xE2) { |
| 65 | if (query[pos + 1] == 0x80) { |
| 66 | if (query[pos + 2] >= 0x80 && query[pos + 2] <= 0x8B) { |
| 67 | // U+2000 to U+200B |
| 68 | // E28080 - E2808B |
| 69 | unicode_spaces.emplace_back(args&: pos, args: USP_LEN); |
| 70 | } else if (query[pos + 2] == 0xAF) { |
| 71 | // U+202F - E280AF |
| 72 | unicode_spaces.emplace_back(args&: pos, args: USP_LEN); |
| 73 | } |
| 74 | } else if (query[pos + 1] == 0x81) { |
| 75 | if (query[pos + 2] == 0x9F) { |
| 76 | // U+205F - E2819f |
| 77 | unicode_spaces.emplace_back(args&: pos, args: USP_LEN); |
| 78 | } else if (query[pos + 2] == 0xA0) { |
| 79 | // U+2060 - E281A0 |
| 80 | unicode_spaces.emplace_back(args&: pos, args: USP_LEN); |
| 81 | } |
| 82 | } |
| 83 | } else if (query[pos] == 0xE3) { |
| 84 | if (query[pos + 1] == 0x80 && query[pos + 2] == 0x80) { |
| 85 | // U+3000 - E38080 |
| 86 | unicode_spaces.emplace_back(args&: pos, args: USP_LEN); |
| 87 | } |
| 88 | } else if (query[pos] == 0xEF) { |
| 89 | if (query[pos + 1] == 0xBB && query[pos + 2] == 0xBF) { |
| 90 | // U+FEFF - EFBBBF |
| 91 | unicode_spaces.emplace_back(args&: pos, args: USP_LEN); |
| 92 | } |
| 93 | } else if (query[pos] == '"' || query[pos] == '\'') { |
| 94 | quote = query[pos]; |
| 95 | pos++; |
| 96 | goto in_quotes; |
| 97 | } else if (query[pos] == '-' && query[pos + 1] == '-') { |
| 98 | goto in_comment; |
| 99 | } |
| 100 | } |
| 101 | goto end; |
| 102 | in_quotes: |
| 103 | for (; pos + 1 < qsize; pos++) { |
| 104 | if (query[pos] == quote) { |
| 105 | if (query[pos + 1] == quote) { |
| 106 | // escaped quote |
| 107 | pos++; |
| 108 | continue; |
| 109 | } |
| 110 | pos++; |
| 111 | goto regular; |
| 112 | } |
| 113 | } |
| 114 | goto end; |
| 115 | : |
| 116 | for (; pos < qsize; pos++) { |
| 117 | if (query[pos] == '\n' || query[pos] == '\r') { |
| 118 | goto regular; |
| 119 | } |
| 120 | } |
| 121 | goto end; |
| 122 | end: |
| 123 | return ReplaceUnicodeSpaces(query: query_str, new_query, unicode_spaces); |
| 124 | } |
| 125 | |
| 126 | vector<string> SplitQueryStringIntoStatements(const string &query) { |
| 127 | // Break sql string down into sql statements using the tokenizer |
| 128 | vector<string> query_statements; |
| 129 | auto tokens = Parser::Tokenize(query); |
| 130 | auto next_statement_start = 0; |
| 131 | for (idx_t i = 1; i < tokens.size(); ++i) { |
| 132 | auto &t_prev = tokens[i - 1]; |
| 133 | auto &t = tokens[i]; |
| 134 | if (t_prev.type == SimplifiedTokenType::SIMPLIFIED_TOKEN_OPERATOR) { |
| 135 | // LCOV_EXCL_START |
| 136 | for (idx_t c = t_prev.start; c <= t.start; ++c) { |
| 137 | if (query.c_str()[c] == ';') { |
| 138 | query_statements.emplace_back(args: query.substr(pos: next_statement_start, n: t.start - next_statement_start)); |
| 139 | next_statement_start = tokens[i].start; |
| 140 | } |
| 141 | } |
| 142 | // LCOV_EXCL_STOP |
| 143 | } |
| 144 | } |
| 145 | query_statements.emplace_back(args: query.substr(pos: next_statement_start, n: query.size() - next_statement_start)); |
| 146 | return query_statements; |
| 147 | } |
| 148 | |
| 149 | void Parser::ParseQuery(const string &query) { |
| 150 | Transformer transformer(options); |
| 151 | string parser_error; |
| 152 | { |
| 153 | // check if there are any unicode spaces in the string |
| 154 | string new_query; |
| 155 | if (StripUnicodeSpaces(query_str: query, new_query)) { |
| 156 | // there are - strip the unicode spaces and re-run the query |
| 157 | ParseQuery(query: new_query); |
| 158 | return; |
| 159 | } |
| 160 | } |
| 161 | { |
| 162 | PostgresParser::SetPreserveIdentifierCase(options.preserve_identifier_case); |
| 163 | bool parsing_succeed = false; |
| 164 | // Creating a new scope to prevent multiple PostgresParser destructors being called |
| 165 | // which led to some memory issues |
| 166 | { |
| 167 | PostgresParser parser; |
| 168 | parser.Parse(query); |
| 169 | if (parser.success) { |
| 170 | if (!parser.parse_tree) { |
| 171 | // empty statement |
| 172 | return; |
| 173 | } |
| 174 | |
| 175 | // if it succeeded, we transform the Postgres parse tree into a list of |
| 176 | // SQLStatements |
| 177 | transformer.TransformParseTree(tree: parser.parse_tree, statements); |
| 178 | parsing_succeed = true; |
| 179 | } else { |
| 180 | parser_error = QueryErrorContext::Format(query, error_message: parser.error_message, error_location: parser.error_location - 1); |
| 181 | } |
| 182 | } |
| 183 | // If DuckDB fails to parse the entire sql string, break the string down into individual statements |
| 184 | // using ';' as the delimiter so that parser extensions can parse the statement |
| 185 | if (parsing_succeed) { |
| 186 | // no-op |
| 187 | // return here would require refactoring into another function. o.w. will just no-op in order to run wrap up |
| 188 | // code at the end of this function |
| 189 | } else if (!options.extensions || options.extensions->empty()) { |
| 190 | throw ParserException(parser_error); |
| 191 | } else { |
| 192 | // split sql string into statements and re-parse using extension |
| 193 | auto query_statements = SplitQueryStringIntoStatements(query); |
| 194 | for (auto const &query_statement : query_statements) { |
| 195 | PostgresParser another_parser; |
| 196 | another_parser.Parse(query: query_statement); |
| 197 | // LCOV_EXCL_START |
| 198 | // first see if DuckDB can parse this individual query statement |
| 199 | if (another_parser.success) { |
| 200 | if (!another_parser.parse_tree) { |
| 201 | // empty statement |
| 202 | continue; |
| 203 | } |
| 204 | transformer.TransformParseTree(tree: another_parser.parse_tree, statements); |
| 205 | } else { |
| 206 | // let extensions parse the statement which DuckDB failed to parse |
| 207 | bool parsed_single_statement = false; |
| 208 | for (auto &ext : *options.extensions) { |
| 209 | D_ASSERT(!parsed_single_statement); |
| 210 | D_ASSERT(ext.parse_function); |
| 211 | auto result = ext.parse_function(ext.parser_info.get(), query_statement); |
| 212 | if (result.type == ParserExtensionResultType::PARSE_SUCCESSFUL) { |
| 213 | auto statement = make_uniq<ExtensionStatement>(args: ext, args: std::move(result.parse_data)); |
| 214 | statement->stmt_length = query_statement.size(); |
| 215 | statement->stmt_location = 0; |
| 216 | statements.push_back(x: std::move(statement)); |
| 217 | parsed_single_statement = true; |
| 218 | break; |
| 219 | } else if (result.type == ParserExtensionResultType::DISPLAY_EXTENSION_ERROR) { |
| 220 | throw ParserException(result.error); |
| 221 | } else { |
| 222 | // We move to the next one! |
| 223 | } |
| 224 | } |
| 225 | if (!parsed_single_statement) { |
| 226 | parser_error = QueryErrorContext::Format(query, error_message: another_parser.error_message, |
| 227 | error_location: another_parser.error_location - 1); |
| 228 | throw ParserException(parser_error); |
| 229 | } |
| 230 | } |
| 231 | // LCOV_EXCL_STOP |
| 232 | } |
| 233 | } |
| 234 | } |
| 235 | if (!statements.empty()) { |
| 236 | auto &last_statement = statements.back(); |
| 237 | last_statement->stmt_length = query.size() - last_statement->stmt_location; |
| 238 | for (auto &statement : statements) { |
| 239 | statement->query = query; |
| 240 | if (statement->type == StatementType::CREATE_STATEMENT) { |
| 241 | auto &create = statement->Cast<CreateStatement>(); |
| 242 | create.info->sql = query.substr(pos: statement->stmt_location, n: statement->stmt_length); |
| 243 | } |
| 244 | } |
| 245 | } |
| 246 | } |
| 247 | |
| 248 | vector<SimplifiedToken> Parser::Tokenize(const string &query) { |
| 249 | auto pg_tokens = PostgresParser::Tokenize(query); |
| 250 | vector<SimplifiedToken> result; |
| 251 | result.reserve(n: pg_tokens.size()); |
| 252 | for (auto &pg_token : pg_tokens) { |
| 253 | SimplifiedToken token; |
| 254 | switch (pg_token.type) { |
| 255 | case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_IDENTIFIER: |
| 256 | token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_IDENTIFIER; |
| 257 | break; |
| 258 | case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_NUMERIC_CONSTANT: |
| 259 | token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_NUMERIC_CONSTANT; |
| 260 | break; |
| 261 | case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_STRING_CONSTANT: |
| 262 | token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_STRING_CONSTANT; |
| 263 | break; |
| 264 | case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_OPERATOR: |
| 265 | token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_OPERATOR; |
| 266 | break; |
| 267 | case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_KEYWORD: |
| 268 | token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_KEYWORD; |
| 269 | break; |
| 270 | // comments are not supported by our tokenizer right now |
| 271 | case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_COMMENT: // LCOV_EXCL_START |
| 272 | token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_COMMENT; |
| 273 | break; |
| 274 | default: |
| 275 | throw InternalException("Unrecognized token category" ); |
| 276 | } // LCOV_EXCL_STOP |
| 277 | token.start = pg_token.start; |
| 278 | result.push_back(x: token); |
| 279 | } |
| 280 | return result; |
| 281 | } |
| 282 | |
| 283 | bool Parser::IsKeyword(const string &text) { |
| 284 | return PostgresParser::IsKeyword(text); |
| 285 | } |
| 286 | |
| 287 | vector<ParserKeyword> Parser::KeywordList() { |
| 288 | auto keywords = PostgresParser::KeywordList(); |
| 289 | vector<ParserKeyword> result; |
| 290 | for (auto &kw : keywords) { |
| 291 | ParserKeyword res; |
| 292 | res.name = kw.text; |
| 293 | switch (kw.category) { |
| 294 | case duckdb_libpgquery::PGKeywordCategory::PG_KEYWORD_RESERVED: |
| 295 | res.category = KeywordCategory::KEYWORD_RESERVED; |
| 296 | break; |
| 297 | case duckdb_libpgquery::PGKeywordCategory::PG_KEYWORD_UNRESERVED: |
| 298 | res.category = KeywordCategory::KEYWORD_UNRESERVED; |
| 299 | break; |
| 300 | case duckdb_libpgquery::PGKeywordCategory::PG_KEYWORD_TYPE_FUNC: |
| 301 | res.category = KeywordCategory::KEYWORD_TYPE_FUNC; |
| 302 | break; |
| 303 | case duckdb_libpgquery::PGKeywordCategory::PG_KEYWORD_COL_NAME: |
| 304 | res.category = KeywordCategory::KEYWORD_COL_NAME; |
| 305 | break; |
| 306 | default: |
| 307 | throw InternalException("Unrecognized keyword category" ); |
| 308 | } |
| 309 | result.push_back(x: res); |
| 310 | } |
| 311 | return result; |
| 312 | } |
| 313 | |
| 314 | vector<unique_ptr<ParsedExpression>> Parser::ParseExpressionList(const string &select_list, ParserOptions options) { |
| 315 | // construct a mock query prefixed with SELECT |
| 316 | string mock_query = "SELECT " + select_list; |
| 317 | // parse the query |
| 318 | Parser parser(options); |
| 319 | parser.ParseQuery(query: mock_query); |
| 320 | // check the statements |
| 321 | if (parser.statements.size() != 1 || parser.statements[0]->type != StatementType::SELECT_STATEMENT) { |
| 322 | throw ParserException("Expected a single SELECT statement" ); |
| 323 | } |
| 324 | auto &select = parser.statements[0]->Cast<SelectStatement>(); |
| 325 | if (select.node->type != QueryNodeType::SELECT_NODE) { |
| 326 | throw ParserException("Expected a single SELECT node" ); |
| 327 | } |
| 328 | auto &select_node = select.node->Cast<SelectNode>(); |
| 329 | return std::move(select_node.select_list); |
| 330 | } |
| 331 | |
| 332 | vector<OrderByNode> Parser::ParseOrderList(const string &select_list, ParserOptions options) { |
| 333 | // construct a mock query |
| 334 | string mock_query = "SELECT * FROM tbl ORDER BY " + select_list; |
| 335 | // parse the query |
| 336 | Parser parser(options); |
| 337 | parser.ParseQuery(query: mock_query); |
| 338 | // check the statements |
| 339 | if (parser.statements.size() != 1 || parser.statements[0]->type != StatementType::SELECT_STATEMENT) { |
| 340 | throw ParserException("Expected a single SELECT statement" ); |
| 341 | } |
| 342 | auto &select = parser.statements[0]->Cast<SelectStatement>(); |
| 343 | if (select.node->type != QueryNodeType::SELECT_NODE) { |
| 344 | throw ParserException("Expected a single SELECT node" ); |
| 345 | } |
| 346 | auto &select_node = select.node->Cast<SelectNode>(); |
| 347 | if (select_node.modifiers.empty() || select_node.modifiers[0]->type != ResultModifierType::ORDER_MODIFIER || |
| 348 | select_node.modifiers.size() != 1) { |
| 349 | throw ParserException("Expected a single ORDER clause" ); |
| 350 | } |
| 351 | auto &order = select_node.modifiers[0]->Cast<OrderModifier>(); |
| 352 | return std::move(order.orders); |
| 353 | } |
| 354 | |
| 355 | void Parser::ParseUpdateList(const string &update_list, vector<string> &update_columns, |
| 356 | vector<unique_ptr<ParsedExpression>> &expressions, ParserOptions options) { |
| 357 | // construct a mock query |
| 358 | string mock_query = "UPDATE tbl SET " + update_list; |
| 359 | // parse the query |
| 360 | Parser parser(options); |
| 361 | parser.ParseQuery(query: mock_query); |
| 362 | // check the statements |
| 363 | if (parser.statements.size() != 1 || parser.statements[0]->type != StatementType::UPDATE_STATEMENT) { |
| 364 | throw ParserException("Expected a single UPDATE statement" ); |
| 365 | } |
| 366 | auto &update = parser.statements[0]->Cast<UpdateStatement>(); |
| 367 | update_columns = std::move(update.set_info->columns); |
| 368 | expressions = std::move(update.set_info->expressions); |
| 369 | } |
| 370 | |
| 371 | vector<vector<unique_ptr<ParsedExpression>>> Parser::ParseValuesList(const string &value_list, ParserOptions options) { |
| 372 | // construct a mock query |
| 373 | string mock_query = "VALUES " + value_list; |
| 374 | // parse the query |
| 375 | Parser parser(options); |
| 376 | parser.ParseQuery(query: mock_query); |
| 377 | // check the statements |
| 378 | if (parser.statements.size() != 1 || parser.statements[0]->type != StatementType::SELECT_STATEMENT) { |
| 379 | throw ParserException("Expected a single SELECT statement" ); |
| 380 | } |
| 381 | auto &select = parser.statements[0]->Cast<SelectStatement>(); |
| 382 | if (select.node->type != QueryNodeType::SELECT_NODE) { |
| 383 | throw ParserException("Expected a single SELECT node" ); |
| 384 | } |
| 385 | auto &select_node = select.node->Cast<SelectNode>(); |
| 386 | if (!select_node.from_table || select_node.from_table->type != TableReferenceType::EXPRESSION_LIST) { |
| 387 | throw ParserException("Expected a single VALUES statement" ); |
| 388 | } |
| 389 | auto &values_list = select_node.from_table->Cast<ExpressionListRef>(); |
| 390 | return std::move(values_list.values); |
| 391 | } |
| 392 | |
| 393 | ColumnList Parser::ParseColumnList(const string &column_list, ParserOptions options) { |
| 394 | string mock_query = "CREATE TABLE blabla (" + column_list + ")" ; |
| 395 | Parser parser(options); |
| 396 | parser.ParseQuery(query: mock_query); |
| 397 | if (parser.statements.size() != 1 || parser.statements[0]->type != StatementType::CREATE_STATEMENT) { |
| 398 | throw ParserException("Expected a single CREATE statement" ); |
| 399 | } |
| 400 | auto &create = parser.statements[0]->Cast<CreateStatement>(); |
| 401 | if (create.info->type != CatalogType::TABLE_ENTRY) { |
| 402 | throw InternalException("Expected a single CREATE TABLE statement" ); |
| 403 | } |
| 404 | auto &info = create.info->Cast<CreateTableInfo>(); |
| 405 | return std::move(info.columns); |
| 406 | } |
| 407 | |
| 408 | } // namespace duckdb |
| 409 | |