| 1 | /*-------------------------------------------------------------------- |
| 2 | * Symbols referenced in this file: |
| 3 | * - raw_parser |
| 4 | * - base_yylex |
| 5 | * - raw_parser |
| 6 | *-------------------------------------------------------------------- |
| 7 | */ |
| 8 | |
| 9 | /*------------------------------------------------------------------------- |
| 10 | * |
| 11 | * parser.c |
| 12 | * Main entry point/driver for PostgreSQL grammar |
| 13 | * |
| 14 | * Note that the grammar is not allowed to perform any table access |
| 15 | * (since we need to be able to do basic parsing even while inside an |
| 16 | * aborted transaction). Therefore, the data structures returned by |
| 17 | * the grammar are "raw" parsetrees that still need to be analyzed by |
| 18 | * analyze.c and related files. |
| 19 | * |
| 20 | * |
| 21 |  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group |
| 22 | * Portions Copyright (c) 1994, Regents of the University of California |
| 23 | * |
| 24 | * IDENTIFICATION |
| 25 | * src/backend/parser/parser.c |
| 26 | * |
| 27 | *------------------------------------------------------------------------- |
| 28 | */ |
| 29 | |
| 30 | #include "pg_functions.hpp" |
| 31 | |
| 32 | #include "parser/gramparse.hpp" |
| 33 | #include "parser/parser.hpp" |
| 34 | #include "parser/kwlist.hpp" |
| 35 | |
| 36 | namespace duckdb_libpgquery { |
| 37 | |
| 38 | /* |
| 39 | * raw_parser |
| 40 | * Given a query in string form, do lexical and grammatical analysis. |
| 41 | * |
| 42 | * Returns a list of raw (un-analyzed) parse trees. The immediate elements |
| 43 | * of the list are always PGRawStmt nodes. |
| 44 | */ |
| 45 | PGList *raw_parser(const char *str) { |
| 46 | core_yyscan_t yyscanner; |
| 47 | base_yy_extra_type ; |
| 48 | int yyresult; |
| 49 | |
| 50 | /* initialize the flex scanner */ |
| 51 | yyscanner = scanner_init(str, yyext: &yyextra.core_yy_extra, keywords: ScanKeywords, num_keywords: NumScanKeywords); |
| 52 | |
| 53 | /* base_yylex() only needs this much initialization */ |
| 54 | yyextra.have_lookahead = false; |
| 55 | |
| 56 | /* initialize the bison parser */ |
| 57 | parser_init(yyext: &yyextra); |
| 58 | |
| 59 | /* Parse! */ |
| 60 | yyresult = base_yyparse(yyscanner); |
| 61 | |
| 62 | /* Clean up (release memory) */ |
| 63 | scanner_finish(yyscanner); |
| 64 | |
| 65 | if (yyresult) /* error */ |
| 66 | return NIL; |
| 67 | |
| 68 | return yyextra.parsetree; |
| 69 | } |
| 70 | |
| 71 | bool is_keyword(const char *text) { |
| 72 | return ScanKeywordLookup(text, keywords: ScanKeywords, num_keywords: NumScanKeywords) != NULL; |
| 73 | } |
| 74 | |
| 75 | std::vector<PGKeyword> keyword_list() { |
| 76 | std::vector<PGKeyword> result; |
| 77 | for(size_t i = 0; i < NumScanKeywords; i++) { |
| 78 | PGKeyword keyword; |
| 79 | keyword.text = ScanKeywords[i].name; |
| 80 | switch(ScanKeywords[i].category) { |
| 81 | case UNRESERVED_KEYWORD: |
| 82 | keyword.category = PGKeywordCategory::PG_KEYWORD_UNRESERVED; |
| 83 | break; |
| 84 | case RESERVED_KEYWORD: |
| 85 | keyword.category = PGKeywordCategory::PG_KEYWORD_RESERVED; |
| 86 | break; |
| 87 | case TYPE_FUNC_NAME_KEYWORD: |
| 88 | keyword.category = PGKeywordCategory::PG_KEYWORD_TYPE_FUNC; |
| 89 | break; |
| 90 | case COL_NAME_KEYWORD: |
| 91 | keyword.category = PGKeywordCategory::PG_KEYWORD_COL_NAME; |
| 92 | break; |
| 93 | } |
| 94 | result.push_back(x: keyword); |
| 95 | } |
| 96 | return result; |
| 97 | } |
| 98 | |
| 99 | std::vector<PGSimplifiedToken> tokenize(const char *str) { |
| 100 | core_yyscan_t yyscanner; |
| 101 | base_yy_extra_type ; |
| 102 | |
| 103 | std::vector<PGSimplifiedToken> result; |
| 104 | yyscanner = scanner_init(str, yyext: &yyextra.core_yy_extra, keywords: ScanKeywords, num_keywords: NumScanKeywords); |
| 105 | yyextra.have_lookahead = false; |
| 106 | |
| 107 | while(true) { |
| 108 | YYSTYPE type; |
| 109 | YYLTYPE loc; |
| 110 | int token; |
| 111 | try { |
| 112 | token = base_yylex(lvalp: &type, llocp: &loc, yyscanner); |
| 113 | } catch(...) { |
| 114 | token = 0; |
| 115 | } |
| 116 | if (token == 0) { |
| 117 | break; |
| 118 | } |
| 119 | PGSimplifiedToken current_token; |
| 120 | switch(token) { |
| 121 | case IDENT: |
| 122 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_IDENTIFIER; |
| 123 | break; |
| 124 | case ICONST: |
| 125 | case FCONST: |
| 126 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_NUMERIC_CONSTANT; |
| 127 | break; |
| 128 | case SCONST: |
| 129 | case BCONST: |
| 130 | case XCONST: |
| 131 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_STRING_CONSTANT; |
| 132 | break; |
| 133 | case Op: |
| 134 | case PARAM: |
| 135 | case COLON_EQUALS: |
| 136 | case EQUALS_GREATER: |
| 137 | case LESS_EQUALS: |
| 138 | case GREATER_EQUALS: |
| 139 | case NOT_EQUALS: |
| 140 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_OPERATOR; |
| 141 | break; |
| 142 | default: |
| 143 | if (token >= 255) { |
| 144 | // non-ascii value, probably a keyword |
| 145 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_KEYWORD; |
| 146 | } else { |
| 147 | // ascii value, probably an operator |
| 148 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_OPERATOR; |
| 149 | } |
| 150 | break; |
| 151 | } |
| 152 | current_token.start = loc; |
| 153 | result.push_back(x: current_token); |
| 154 | } |
| 155 | |
| 156 | scanner_finish(yyscanner); |
| 157 | return result; |
| 158 | } |
| 159 | |
| 160 | |
| 161 | |
| 162 | /* |
| 163 | * Intermediate filter between parser and core lexer (core_yylex in scan.l). |
| 164 | * |
| 165 | * This filter is needed because in some cases the standard SQL grammar |
| 166 | * requires more than one token lookahead. We reduce these cases to one-token |
| 167 | * lookahead by replacing tokens here, in order to keep the grammar LALR(1). |
| 168 | * |
| 169 | * Using a filter is simpler than trying to recognize multiword tokens |
| 170 | * directly in scan.l, because we'd have to allow for comments between the |
| 171 | * words. Furthermore it's not clear how to do that without re-introducing |
| 172 | * scanner backtrack, which would cost more performance than this filter |
| 173 | * layer does. |
| 174 | * |
| 175 | * The filter also provides a convenient place to translate between |
| 176 | * the core_YYSTYPE and YYSTYPE representations (which are really the |
| 177 | * same thing anyway, but notationally they're different). |
| 178 | */ |
| 179 | int base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) { |
| 180 | base_yy_extra_type * = pg_yyget_extra(yyscanner); |
| 181 | int cur_token; |
| 182 | int next_token; |
| 183 | int cur_token_length; |
| 184 | YYLTYPE cur_yylloc; |
| 185 | |
| 186 | /* Get next token --- we might already have it */ |
| 187 | if (yyextra->have_lookahead) { |
| 188 | cur_token = yyextra->lookahead_token; |
| 189 | lvalp->core_yystype = yyextra->lookahead_yylval; |
| 190 | *llocp = yyextra->lookahead_yylloc; |
| 191 | *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; |
| 192 | yyextra->have_lookahead = false; |
| 193 | } else |
| 194 | cur_token = core_yylex(lvalp: &(lvalp->core_yystype), llocp, yyscanner); |
| 195 | |
| 196 | /* |
| 197 | * If this token isn't one that requires lookahead, just return it. If it |
| 198 | * does, determine the token length. (We could get that via strlen(), but |
| 199 | * since we have such a small set of possibilities, hardwiring seems |
| 200 | * feasible and more efficient.) |
| 201 | */ |
| 202 | switch (cur_token) { |
| 203 | case NOT: |
| 204 | cur_token_length = 3; |
| 205 | break; |
| 206 | case NULLS_P: |
| 207 | cur_token_length = 5; |
| 208 | break; |
| 209 | case WITH: |
| 210 | cur_token_length = 4; |
| 211 | break; |
| 212 | default: |
| 213 | return cur_token; |
| 214 | } |
| 215 | |
| 216 | /* |
| 217 | * Identify end+1 of current token. core_yylex() has temporarily stored a |
| 218 | * '\0' here, and will undo that when we call it again. We need to redo |
| 219 | * it to fully revert the lookahead call for error reporting purposes. |
| 220 | */ |
| 221 | yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf + *llocp + cur_token_length; |
| 222 | Assert(*(yyextra->lookahead_end) == '\0'); |
| 223 | |
| 224 | /* |
| 225 | * Save and restore *llocp around the call. It might look like we could |
| 226 | * avoid this by just passing &lookahead_yylloc to core_yylex(), but that |
| 227 | * does not work because flex actually holds onto the last-passed pointer |
| 228 | * internally, and will use that for error reporting. We need any error |
| 229 | * reports to point to the current token, not the next one. |
| 230 | */ |
| 231 | cur_yylloc = *llocp; |
| 232 | |
| 233 | /* Get next token, saving outputs into lookahead variables */ |
| 234 | next_token = core_yylex(lvalp: &(yyextra->lookahead_yylval), llocp, yyscanner); |
| 235 | yyextra->lookahead_token = next_token; |
| 236 | yyextra->lookahead_yylloc = *llocp; |
| 237 | |
| 238 | *llocp = cur_yylloc; |
| 239 | |
| 240 | /* Now revert the un-truncation of the current token */ |
| 241 | yyextra->lookahead_hold_char = *(yyextra->lookahead_end); |
| 242 | *(yyextra->lookahead_end) = '\0'; |
| 243 | |
| 244 | yyextra->have_lookahead = true; |
| 245 | |
| 246 | /* Replace cur_token if needed, based on lookahead */ |
| 247 | switch (cur_token) { |
| 248 | case NOT: |
| 249 | /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */ |
| 250 | switch (next_token) { |
| 251 | case BETWEEN: |
| 252 | case IN_P: |
| 253 | case LIKE: |
| 254 | case ILIKE: |
| 255 | case SIMILAR: |
| 256 | cur_token = NOT_LA; |
| 257 | break; |
| 258 | } |
| 259 | break; |
| 260 | |
| 261 | case NULLS_P: |
| 262 | /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */ |
| 263 | switch (next_token) { |
| 264 | case FIRST_P: |
| 265 | case LAST_P: |
| 266 | cur_token = NULLS_LA; |
| 267 | break; |
| 268 | } |
| 269 | break; |
| 270 | |
| 271 | case WITH: |
| 272 | /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */ |
| 273 | switch (next_token) { |
| 274 | case TIME: |
| 275 | case ORDINALITY: |
| 276 | cur_token = WITH_LA; |
| 277 | break; |
| 278 | } |
| 279 | break; |
| 280 | } |
| 281 | |
| 282 | return cur_token; |
| 283 | } |
| 284 | |
| 285 | } |