1 | /*-------------------------------------------------------------------- |
2 | * Symbols referenced in this file: |
3 | * - raw_parser |
4 | * - base_yylex |
5 | * - raw_parser |
6 | *-------------------------------------------------------------------- |
7 | */ |
8 | |
9 | /*------------------------------------------------------------------------- |
10 | * |
11 | * parser.c |
12 | * Main entry point/driver for PostgreSQL grammar |
13 | * |
14 | * Note that the grammar is not allowed to perform any table access |
15 | * (since we need to be able to do basic parsing even while inside an |
16 | * aborted transaction). Therefore, the data structures returned by |
17 | * the grammar are "raw" parsetrees that still need to be analyzed by |
18 | * analyze.c and related files. |
19 | * |
20 | * |
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
22 | * Portions Copyright (c) 1994, Regents of the University of California |
23 | * |
24 | * IDENTIFICATION |
25 | * src/backend/parser/parser.c |
26 | * |
27 | *------------------------------------------------------------------------- |
28 | */ |
29 | |
30 | #include "pg_functions.hpp" |
31 | |
32 | #include "parser/gramparse.hpp" |
33 | #include "parser/parser.hpp" |
34 | #include "parser/kwlist.hpp" |
35 | |
36 | namespace duckdb_libpgquery { |
37 | |
38 | /* |
39 | * raw_parser |
40 | * Given a query in string form, do lexical and grammatical analysis. |
41 | * |
42 | * Returns a list of raw (un-analyzed) parse trees. The immediate elements |
43 | * of the list are always PGRawStmt nodes. |
44 | */ |
45 | PGList *raw_parser(const char *str) { |
46 | core_yyscan_t yyscanner; |
47 | base_yy_extra_type ; |
48 | int yyresult; |
49 | |
50 | /* initialize the flex scanner */ |
51 | yyscanner = scanner_init(str, yyext: &yyextra.core_yy_extra, keywords: ScanKeywords, num_keywords: NumScanKeywords); |
52 | |
53 | /* base_yylex() only needs this much initialization */ |
54 | yyextra.have_lookahead = false; |
55 | |
56 | /* initialize the bison parser */ |
57 | parser_init(yyext: &yyextra); |
58 | |
59 | /* Parse! */ |
60 | yyresult = base_yyparse(yyscanner); |
61 | |
62 | /* Clean up (release memory) */ |
63 | scanner_finish(yyscanner); |
64 | |
65 | if (yyresult) /* error */ |
66 | return NIL; |
67 | |
68 | return yyextra.parsetree; |
69 | } |
70 | |
71 | bool is_keyword(const char *text) { |
72 | return ScanKeywordLookup(text, keywords: ScanKeywords, num_keywords: NumScanKeywords) != NULL; |
73 | } |
74 | |
75 | std::vector<PGKeyword> keyword_list() { |
76 | std::vector<PGKeyword> result; |
77 | for(size_t i = 0; i < NumScanKeywords; i++) { |
78 | PGKeyword keyword; |
79 | keyword.text = ScanKeywords[i].name; |
80 | switch(ScanKeywords[i].category) { |
81 | case UNRESERVED_KEYWORD: |
82 | keyword.category = PGKeywordCategory::PG_KEYWORD_UNRESERVED; |
83 | break; |
84 | case RESERVED_KEYWORD: |
85 | keyword.category = PGKeywordCategory::PG_KEYWORD_RESERVED; |
86 | break; |
87 | case TYPE_FUNC_NAME_KEYWORD: |
88 | keyword.category = PGKeywordCategory::PG_KEYWORD_TYPE_FUNC; |
89 | break; |
90 | case COL_NAME_KEYWORD: |
91 | keyword.category = PGKeywordCategory::PG_KEYWORD_COL_NAME; |
92 | break; |
93 | } |
94 | result.push_back(x: keyword); |
95 | } |
96 | return result; |
97 | } |
98 | |
99 | std::vector<PGSimplifiedToken> tokenize(const char *str) { |
100 | core_yyscan_t yyscanner; |
101 | base_yy_extra_type ; |
102 | |
103 | std::vector<PGSimplifiedToken> result; |
104 | yyscanner = scanner_init(str, yyext: &yyextra.core_yy_extra, keywords: ScanKeywords, num_keywords: NumScanKeywords); |
105 | yyextra.have_lookahead = false; |
106 | |
107 | while(true) { |
108 | YYSTYPE type; |
109 | YYLTYPE loc; |
110 | int token; |
111 | try { |
112 | token = base_yylex(lvalp: &type, llocp: &loc, yyscanner); |
113 | } catch(...) { |
114 | token = 0; |
115 | } |
116 | if (token == 0) { |
117 | break; |
118 | } |
119 | PGSimplifiedToken current_token; |
120 | switch(token) { |
121 | case IDENT: |
122 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_IDENTIFIER; |
123 | break; |
124 | case ICONST: |
125 | case FCONST: |
126 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_NUMERIC_CONSTANT; |
127 | break; |
128 | case SCONST: |
129 | case BCONST: |
130 | case XCONST: |
131 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_STRING_CONSTANT; |
132 | break; |
133 | case Op: |
134 | case PARAM: |
135 | case COLON_EQUALS: |
136 | case EQUALS_GREATER: |
137 | case LESS_EQUALS: |
138 | case GREATER_EQUALS: |
139 | case NOT_EQUALS: |
140 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_OPERATOR; |
141 | break; |
142 | default: |
143 | if (token >= 255) { |
144 | // non-ascii value, probably a keyword |
145 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_KEYWORD; |
146 | } else { |
147 | // ascii value, probably an operator |
148 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_OPERATOR; |
149 | } |
150 | break; |
151 | } |
152 | current_token.start = loc; |
153 | result.push_back(x: current_token); |
154 | } |
155 | |
156 | scanner_finish(yyscanner); |
157 | return result; |
158 | } |
159 | |
160 | |
161 | |
162 | /* |
163 | * Intermediate filter between parser and core lexer (core_yylex in scan.l). |
164 | * |
165 | * This filter is needed because in some cases the standard SQL grammar |
166 | * requires more than one token lookahead. We reduce these cases to one-token |
167 | * lookahead by replacing tokens here, in order to keep the grammar LALR(1). |
168 | * |
169 | * Using a filter is simpler than trying to recognize multiword tokens |
170 | * directly in scan.l, because we'd have to allow for comments between the |
171 | * words. Furthermore it's not clear how to do that without re-introducing |
172 | * scanner backtrack, which would cost more performance than this filter |
173 | * layer does. |
174 | * |
175 | * The filter also provides a convenient place to translate between |
176 | * the core_YYSTYPE and YYSTYPE representations (which are really the |
177 | * same thing anyway, but notationally they're different). |
178 | */ |
179 | int base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) { |
180 | base_yy_extra_type * = pg_yyget_extra(yyscanner); |
181 | int cur_token; |
182 | int next_token; |
183 | int cur_token_length; |
184 | YYLTYPE cur_yylloc; |
185 | |
186 | /* Get next token --- we might already have it */ |
187 | if (yyextra->have_lookahead) { |
188 | cur_token = yyextra->lookahead_token; |
189 | lvalp->core_yystype = yyextra->lookahead_yylval; |
190 | *llocp = yyextra->lookahead_yylloc; |
191 | *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; |
192 | yyextra->have_lookahead = false; |
193 | } else |
194 | cur_token = core_yylex(lvalp: &(lvalp->core_yystype), llocp, yyscanner); |
195 | |
196 | /* |
197 | * If this token isn't one that requires lookahead, just return it. If it |
198 | * does, determine the token length. (We could get that via strlen(), but |
199 | * since we have such a small set of possibilities, hardwiring seems |
200 | * feasible and more efficient.) |
201 | */ |
202 | switch (cur_token) { |
203 | case NOT: |
204 | cur_token_length = 3; |
205 | break; |
206 | case NULLS_P: |
207 | cur_token_length = 5; |
208 | break; |
209 | case WITH: |
210 | cur_token_length = 4; |
211 | break; |
212 | default: |
213 | return cur_token; |
214 | } |
215 | |
216 | /* |
217 | * Identify end+1 of current token. core_yylex() has temporarily stored a |
218 | * '\0' here, and will undo that when we call it again. We need to redo |
219 | * it to fully revert the lookahead call for error reporting purposes. |
220 | */ |
221 | yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf + *llocp + cur_token_length; |
222 | Assert(*(yyextra->lookahead_end) == '\0'); |
223 | |
224 | /* |
225 | * Save and restore *llocp around the call. It might look like we could |
226 | * avoid this by just passing &lookahead_yylloc to core_yylex(), but that |
227 | * does not work because flex actually holds onto the last-passed pointer |
228 | * internally, and will use that for error reporting. We need any error |
229 | * reports to point to the current token, not the next one. |
230 | */ |
231 | cur_yylloc = *llocp; |
232 | |
233 | /* Get next token, saving outputs into lookahead variables */ |
234 | next_token = core_yylex(lvalp: &(yyextra->lookahead_yylval), llocp, yyscanner); |
235 | yyextra->lookahead_token = next_token; |
236 | yyextra->lookahead_yylloc = *llocp; |
237 | |
238 | *llocp = cur_yylloc; |
239 | |
240 | /* Now revert the un-truncation of the current token */ |
241 | yyextra->lookahead_hold_char = *(yyextra->lookahead_end); |
242 | *(yyextra->lookahead_end) = '\0'; |
243 | |
244 | yyextra->have_lookahead = true; |
245 | |
246 | /* Replace cur_token if needed, based on lookahead */ |
247 | switch (cur_token) { |
248 | case NOT: |
249 | /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */ |
250 | switch (next_token) { |
251 | case BETWEEN: |
252 | case IN_P: |
253 | case LIKE: |
254 | case ILIKE: |
255 | case SIMILAR: |
256 | cur_token = NOT_LA; |
257 | break; |
258 | } |
259 | break; |
260 | |
261 | case NULLS_P: |
262 | /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */ |
263 | switch (next_token) { |
264 | case FIRST_P: |
265 | case LAST_P: |
266 | cur_token = NULLS_LA; |
267 | break; |
268 | } |
269 | break; |
270 | |
271 | case WITH: |
272 | /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */ |
273 | switch (next_token) { |
274 | case TIME: |
275 | case ORDINALITY: |
276 | cur_token = WITH_LA; |
277 | break; |
278 | } |
279 | break; |
280 | } |
281 | |
282 | return cur_token; |
283 | } |
284 | |
285 | } |