parser.cpp source code [Velox/build/_deps/duckdb-src/src/parser/parser.cpp]

1	#include "duckdb/parser/parser.hpp"
2
3	#include "duckdb/parser/parsed_data/create_table_info.hpp"
4	#include "duckdb/parser/parser_extension.hpp"
5	#include "duckdb/parser/query_error_context.hpp"
6	#include "duckdb/parser/query_node/select_node.hpp"
7	#include "duckdb/parser/statement/create_statement.hpp"
8	#include "duckdb/parser/statement/extension_statement.hpp"
9	#include "duckdb/parser/statement/select_statement.hpp"
10	#include "duckdb/parser/statement/update_statement.hpp"
11	#include "duckdb/parser/tableref/expressionlistref.hpp"
12	#include "duckdb/parser/transformer.hpp"
13	#include "parser/parser.hpp"
14	#include "postgres_parser.hpp"
15
16	namespace duckdb {
17
18	Parser::Parser(ParserOptions options_p) : options (options_p) {
19	}
20
21	struct UnicodeSpace {
22	UnicodeSpace(idx_t pos, idx_t bytes) : pos(pos), bytes(bytes) {
23	}
24
25	idx_t pos;
26	idx_t bytes;
27	};
28
29	static bool ReplaceUnicodeSpaces(const string &query, string &new_query, vector<UnicodeSpace> &unicode_spaces) {
30	if (unicode_spaces.empty()) {
31	// no unicode spaces found
32	return false;
33	}
34	idx_t prev = `0`;
35	for (auto &usp : unicode_spaces) {
36	new_query += query.substr(pos: prev, n: usp.pos - prev);
37	new_query += " ";
38	prev = usp.pos + usp.bytes;
39	}
40	new_query += query.substr(pos: prev, n: query.size() - prev);
41	return true;
42	}
43
44	// This function strips unicode space characters from the query and replaces them with regular spaces
45	// It returns true if any unicode space characters were found and stripped
46	// See here for a list of unicode space characters - https://jkorpela.fi/chars/spaces.html
47	bool Parser::StripUnicodeSpaces(const string &query_str, string &new_query) {
48	const idx_t NBSP_LEN = `2`;
49	const idx_t USP_LEN = `3`;
50	idx_t pos = `0`;
51	unsigned char quote;
52	vector<UnicodeSpace> unicode_spaces;
53	auto query = const_uchar_ptr_cast(src: query_str.c_str());
54	auto qsize = query_str.size();
55
56	regular:
57	for (; pos + `2` < qsize; pos++) {
58	if (query[pos] == `0xC2`) {
59	if (query[pos + `1`] == `0xA0`) {
60	// U+00A0 - C2A0
61	unicode_spaces.emplace_back(args&: pos, args: NBSP_LEN);
62	}
63	}
64	if (query[pos] == `0xE2`) {
65	if (query[pos + `1`] == `0x80`) {
66	if (query[pos + `2`] >= `0x80` && query[pos + `2`] <= `0x8B`) {
67	// U+2000 to U+200B
68	// E28080 - E2808B
69	unicode_spaces.emplace_back(args&: pos, args: USP_LEN);
70	} else if (query[pos + `2`] == `0xAF`) {
71	// U+202F - E280AF
72	unicode_spaces.emplace_back(args&: pos, args: USP_LEN);
73	}
74	} else if (query[pos + `1`] == `0x81`) {
75	if (query[pos + `2`] == `0x9F`) {
76	// U+205F - E2819f
77	unicode_spaces.emplace_back(args&: pos, args: USP_LEN);
78	} else if (query[pos + `2`] == `0xA0`) {
79	// U+2060 - E281A0
80	unicode_spaces.emplace_back(args&: pos, args: USP_LEN);
81	}
82	}
83	} else if (query[pos] == `0xE3`) {
84	if (query[pos + `1`] == `0x80` && query[pos + `2`] == `0x80`) {
85	// U+3000 - E38080
86	unicode_spaces.emplace_back(args&: pos, args: USP_LEN);
87	}
88	} else if (query[pos] == `0xEF`) {
89	if (query[pos + `1`] == `0xBB` && query[pos + `2`] == `0xBF`) {
90	// U+FEFF - EFBBBF
91	unicode_spaces.emplace_back(args&: pos, args: USP_LEN);
92	}
93	} else if (query[pos] == `'"'` \|\| query[pos] == `'\''`) {
94	quote = query[pos];
95	pos++;
96	goto in_quotes;
97	} else if (query[pos] == `'-'` && query[pos + `1`] == `'-'`) {
98	goto in_comment;
99	}
100	}
101	goto end;
102	in_quotes:
103	for (; pos + `1` < qsize; pos++) {
104	if (query[pos] == quote) {
105	if (query[pos + `1`] == quote) {
106	// escaped quote
107	pos++;
108	continue;
109	}
110	pos++;
111	goto regular;
112	}
113	}
114	goto end;
115	in_comment:
116	for (; pos < qsize; pos++) {
117	if (query[pos] == `'\n'` \|\| query[pos] == `'\r'`) {
118	goto regular;
119	}
120	}
121	goto end;
122	end:
123	return ReplaceUnicodeSpaces(query: query_str, new_query, unicode_spaces);
124	}
125
126	vector<string> SplitQueryStringIntoStatements(const string &query) {
127	// Break sql string down into sql statements using the tokenizer
128	vector<string> query_statements;
129	auto tokens = Parser::Tokenize(query);
130	auto next_statement_start = `0`;
131	for (idx_t i = `1`; i < tokens.size(); ++i) {
132	auto &t_prev = tokens [i - `1`];
133	auto &t = tokens [i];
134	if (t_prev.type == SimplifiedTokenType::SIMPLIFIED_TOKEN_OPERATOR) {
135	// LCOV_EXCL_START
136	for (idx_t c = t_prev.start; c <= t.start; ++c) {
137	if (query.c_str()[c] == `';'`) {
138	query_statements.emplace_back(args: query.substr(pos: next_statement_start, n: t.start - next_statement_start));
139	next_statement_start = tokens [i].start;
140	}
141	}
142	// LCOV_EXCL_STOP
143	}
144	}
145	query_statements.emplace_back(args: query.substr(pos: next_statement_start, n: query.size() - next_statement_start));
146	return query_statements;
147	}
148
149	void Parser::ParseQuery(const string &query) {
150	Transformer transformer(options);
151	string parser_error;
152	{
153	// check if there are any unicode spaces in the string
154	string new_query;
155	if (StripUnicodeSpaces(query_str: query, new_query)) {
156	// there are - strip the unicode spaces and re-run the query
157	ParseQuery(query: new_query);
158	return;
159	}
160	}
161	{
162	PostgresParser::SetPreserveIdentifierCase(options.preserve_identifier_case);
163	bool parsing_succeed = false;
164	// Creating a new scope to prevent multiple PostgresParser destructors being called
165	// which led to some memory issues
166	{
167	PostgresParser parser;
168	parser.Parse(query);
169	if (parser.success) {
170	if (!parser.parse_tree) {
171	// empty statement
172	return;
173	}
174
175	// if it succeeded, we transform the Postgres parse tree into a list of
176	// SQLStatements
177	transformer.TransformParseTree(tree: parser.parse_tree, statements);
178	parsing_succeed = true;
179	} else {
180	parser_error = QueryErrorContext::Format(query, error_message: parser.error_message, error_location: parser.error_location - `1`);
181	}
182	}
183	// If DuckDB fails to parse the entire sql string, break the string down into individual statements
184	// using ';' as the delimiter so that parser extensions can parse the statement
185	if (parsing_succeed) {
186	// no-op
187	// return here would require refactoring into another function. o.w. will just no-op in order to run wrap up
188	// code at the end of this function
189	} else if (!options.extensions \|\| options.extensions->empty()) {
190	throw ParserException (parser_error);
191	} else {
192	// split sql string into statements and re-parse using extension
193	auto query_statements = SplitQueryStringIntoStatements(query);
194	for (auto const &query_statement : query_statements) {
195	PostgresParser another_parser;
196	another_parser.Parse(query: query_statement);
197	// LCOV_EXCL_START
198	// first see if DuckDB can parse this individual query statement
199	if (another_parser.success) {
200	if (!another_parser.parse_tree) {
201	// empty statement
202	continue;
203	}
204	transformer.TransformParseTree(tree: another_parser.parse_tree, statements);
205	} else {
206	// let extensions parse the statement which DuckDB failed to parse
207	bool parsed_single_statement = false;
208	for (auto &ext : *options.extensions) {
209	D_ASSERT(!parsed_single_statement);
210	D_ASSERT(ext.parse_function);
211	auto result = ext.parse_function(ext.parser_info.get(), query_statement);
212	if (result.type == ParserExtensionResultType::PARSE_SUCCESSFUL) {
213	auto statement = make_uniq<ExtensionStatement>(args: ext, args: std::move(result.parse_data));
214	statement ->stmt_length = query_statement.size();
215	statement ->stmt_location = `0`;
216	statements.push_back(x: std::move(statement));
217	parsed_single_statement = true;
218	break;
219	} else if (result.type == ParserExtensionResultType::DISPLAY_EXTENSION_ERROR) {
220	throw ParserException (result.error);
221	} else {
222	// We move to the next one!
223	}
224	}
225	if (!parsed_single_statement) {
226	parser_error = QueryErrorContext::Format(query, error_message: another_parser.error_message,
227	error_location: another_parser.error_location - `1`);
228	throw ParserException (parser_error);
229	}
230	}
231	// LCOV_EXCL_STOP
232	}
233	}
234	}
235	if (!statements.empty()) {
236	auto &last_statement = statements.back();
237	last_statement ->stmt_length = query.size() - last_statement ->stmt_location;
238	for (auto &statement : statements) {
239	statement ->query = query;
240	if (statement ->type == StatementType::CREATE_STATEMENT) {
241	auto &create = statement ->Cast<CreateStatement>();
242	create.info ->sql = query.substr(pos: statement ->stmt_location, n: statement ->stmt_length);
243	}
244	}
245	}
246	}
247
248	vector<SimplifiedToken> Parser::Tokenize(const string &query) {
249	auto pg_tokens = PostgresParser::Tokenize(query);
250	vector<SimplifiedToken> result;
251	result.reserve(n: pg_tokens.size());
252	for (auto &pg_token : pg_tokens) {
253	SimplifiedToken token;
254	switch (pg_token.type) {
255	case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_IDENTIFIER:
256	token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_IDENTIFIER;
257	break;
258	case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_NUMERIC_CONSTANT:
259	token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_NUMERIC_CONSTANT;
260	break;
261	case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_STRING_CONSTANT:
262	token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_STRING_CONSTANT;
263	break;
264	case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_OPERATOR:
265	token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_OPERATOR;
266	break;
267	case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_KEYWORD:
268	token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_KEYWORD;
269	break;
270	// comments are not supported by our tokenizer right now
271	case duckdb_libpgquery::PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_COMMENT: // LCOV_EXCL_START
272	token.type = SimplifiedTokenType::SIMPLIFIED_TOKEN_COMMENT;
273	break;
274	default:
275	throw InternalException ("Unrecognized token category");
276	} // LCOV_EXCL_STOP
277	token.start = pg_token.start;
278	result.push_back(x: token);
279	}
280	return result;
281	}
282
283	bool Parser::IsKeyword(const string &text) {
284	return PostgresParser::IsKeyword(text);
285	}
286
287	vector<ParserKeyword> Parser::KeywordList() {
288	auto keywords = PostgresParser::KeywordList();
289	vector<ParserKeyword> result;
290	for (auto &kw : keywords) {
291	ParserKeyword res;
292	res.name = kw.text;
293	switch (kw.category) {
294	case duckdb_libpgquery::PGKeywordCategory::PG_KEYWORD_RESERVED:
295	res.category = KeywordCategory::KEYWORD_RESERVED;
296	break;
297	case duckdb_libpgquery::PGKeywordCategory::PG_KEYWORD_UNRESERVED:
298	res.category = KeywordCategory::KEYWORD_UNRESERVED;
299	break;
300	case duckdb_libpgquery::PGKeywordCategory::PG_KEYWORD_TYPE_FUNC:
301	res.category = KeywordCategory::KEYWORD_TYPE_FUNC;
302	break;
303	case duckdb_libpgquery::PGKeywordCategory::PG_KEYWORD_COL_NAME:
304	res.category = KeywordCategory::KEYWORD_COL_NAME;
305	break;
306	default:
307	throw InternalException ("Unrecognized keyword category");
308	}
309	result.push_back(x: res);
310	}
311	return result;
312	}
313
314	vector<unique_ptr<ParsedExpression>> Parser::ParseExpressionList(const string &select_list, ParserOptions options) {
315	// construct a mock query prefixed with SELECT
316	string mock_query = "SELECT " + select_list;
317	// parse the query
318	Parser parser(options);
319	parser.ParseQuery(query: mock_query);
320	// check the statements
321	if (parser.statements.size() != `1` \|\| parser.statements [`0`]->type != StatementType::SELECT_STATEMENT) {
322	throw ParserException ("Expected a single SELECT statement");
323	}
324	auto &select = parser.statements [`0`]->Cast<SelectStatement>();
325	if (select.node ->type != QueryNodeType::SELECT_NODE) {
326	throw ParserException ("Expected a single SELECT node");
327	}
328	auto &select_node = select.node ->Cast<SelectNode>();
329	return std::move(select_node.select_list);
330	}
331
332	vector<OrderByNode> Parser::ParseOrderList(const string &select_list, ParserOptions options) {
333	// construct a mock query
334	string mock_query = "SELECT * FROM tbl ORDER BY " + select_list;
335	// parse the query
336	Parser parser(options);
337	parser.ParseQuery(query: mock_query);
338	// check the statements
339	if (parser.statements.size() != `1` \|\| parser.statements [`0`]->type != StatementType::SELECT_STATEMENT) {
340	throw ParserException ("Expected a single SELECT statement");
341	}
342	auto &select = parser.statements [`0`]->Cast<SelectStatement>();
343	if (select.node ->type != QueryNodeType::SELECT_NODE) {
344	throw ParserException ("Expected a single SELECT node");
345	}
346	auto &select_node = select.node ->Cast<SelectNode>();
347	if (select_node.modifiers.empty() \|\| select_node.modifiers [`0`]->type != ResultModifierType::ORDER_MODIFIER \|\|
348	select_node.modifiers.size() != `1`) {
349	throw ParserException ("Expected a single ORDER clause");
350	}
351	auto &order = select_node.modifiers [`0`]->Cast<OrderModifier>();
352	return std::move(order.orders);
353	}
354
355	void Parser::ParseUpdateList(const string &update_list, vector<string> &update_columns,
356	vector<unique_ptr<ParsedExpression>> &expressions, ParserOptions options) {
357	// construct a mock query
358	string mock_query = "UPDATE tbl SET " + update_list;
359	// parse the query
360	Parser parser(options);
361	parser.ParseQuery(query: mock_query);
362	// check the statements
363	if (parser.statements.size() != `1` \|\| parser.statements [`0`]->type != StatementType::UPDATE_STATEMENT) {
364	throw ParserException ("Expected a single UPDATE statement");
365	}
366	auto &update = parser.statements [`0`]->Cast<UpdateStatement>();
367	update_columns = std::move(update.set_info ->columns);
368	expressions = std::move(update.set_info ->expressions);
369	}
370
371	vector<vector<unique_ptr<ParsedExpression>>> Parser::ParseValuesList(const string &value_list, ParserOptions options) {
372	// construct a mock query
373	string mock_query = "VALUES " + value_list;
374	// parse the query
375	Parser parser(options);
376	parser.ParseQuery(query: mock_query);
377	// check the statements
378	if (parser.statements.size() != `1` \|\| parser.statements [`0`]->type != StatementType::SELECT_STATEMENT) {
379	throw ParserException ("Expected a single SELECT statement");
380	}
381	auto &select = parser.statements [`0`]->Cast<SelectStatement>();
382	if (select.node ->type != QueryNodeType::SELECT_NODE) {
383	throw ParserException ("Expected a single SELECT node");
384	}
385	auto &select_node = select.node ->Cast<SelectNode>();
386	if (!select_node.from_table \|\| select_node.from_table ->type != TableReferenceType::EXPRESSION_LIST) {
387	throw ParserException ("Expected a single VALUES statement");
388	}
389	auto &values_list = select_node.from_table ->Cast<ExpressionListRef>();
390	return std::move(values_list.values);
391	}
392
393	ColumnList Parser::ParseColumnList(const string &column_list, ParserOptions options) {
394	string mock_query = "CREATE TABLE blabla (" + column_list + ")";
395	Parser parser(options);
396	parser.ParseQuery(query: mock_query);
397	if (parser.statements.size() != `1` \|\| parser.statements [`0`]->type != StatementType::CREATE_STATEMENT) {
398	throw ParserException ("Expected a single CREATE statement");
399	}
400	auto &create = parser.statements [`0`]->Cast<CreateStatement>();
401	if (create.info ->type != CatalogType::TABLE_ENTRY) {
402	throw InternalException ("Expected a single CREATE TABLE statement");
403	}
404	auto &info = create.info ->Cast<CreateTableInfo>();
405	return std::move(info.columns);
406	}
407
408	} // namespace duckdb
409

Browse the source code of Velox/build/_deps/duckdb-src/src/parser/parser.cpp