1/*--------------------------------------------------------------------
2 * Symbols referenced in this file:
3 * - raw_parser
4 * - base_yylex
5 * - raw_parser
6 *--------------------------------------------------------------------
7 */
8
9/*-------------------------------------------------------------------------
10 *
11 * parser.c
12 * Main entry point/driver for PostgreSQL grammar
13 *
14 * Note that the grammar is not allowed to perform any table access
15 * (since we need to be able to do basic parsing even while inside an
16 * aborted transaction). Therefore, the data structures returned by
17 * the grammar are "raw" parsetrees that still need to be analyzed by
18 * analyze.c and related files.
19 *
20 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
22 * Portions Copyright (c) 1994, Regents of the University of California
23 *
24 * IDENTIFICATION
25 * src/backend/parser/parser.c
26 *
27 *-------------------------------------------------------------------------
28 */
29
30#include "pg_functions.hpp"
31
32#include "parser/gramparse.hpp"
33#include "parser/parser.hpp"
34#include "parser/kwlist.hpp"
35
36namespace duckdb_libpgquery {
37
38/*
39 * raw_parser
40 * Given a query in string form, do lexical and grammatical analysis.
41 *
42 * Returns a list of raw (un-analyzed) parse trees. The immediate elements
43 * of the list are always PGRawStmt nodes.
44 */
45PGList *raw_parser(const char *str) {
46 core_yyscan_t yyscanner;
47 base_yy_extra_type yyextra;
48 int yyresult;
49
50 /* initialize the flex scanner */
51 yyscanner = scanner_init(str, yyext: &yyextra.core_yy_extra, keywords: ScanKeywords, num_keywords: NumScanKeywords);
52
53 /* base_yylex() only needs this much initialization */
54 yyextra.have_lookahead = false;
55
56 /* initialize the bison parser */
57 parser_init(yyext: &yyextra);
58
59 /* Parse! */
60 yyresult = base_yyparse(yyscanner);
61
62 /* Clean up (release memory) */
63 scanner_finish(yyscanner);
64
65 if (yyresult) /* error */
66 return NIL;
67
68 return yyextra.parsetree;
69}
70
71bool is_keyword(const char *text) {
72 return ScanKeywordLookup(text, keywords: ScanKeywords, num_keywords: NumScanKeywords) != NULL;
73}
74
75std::vector<PGKeyword> keyword_list() {
76 std::vector<PGKeyword> result;
77 for(size_t i = 0; i < NumScanKeywords; i++) {
78 PGKeyword keyword;
79 keyword.text = ScanKeywords[i].name;
80 switch(ScanKeywords[i].category) {
81 case UNRESERVED_KEYWORD:
82 keyword.category = PGKeywordCategory::PG_KEYWORD_UNRESERVED;
83 break;
84 case RESERVED_KEYWORD:
85 keyword.category = PGKeywordCategory::PG_KEYWORD_RESERVED;
86 break;
87 case TYPE_FUNC_NAME_KEYWORD:
88 keyword.category = PGKeywordCategory::PG_KEYWORD_TYPE_FUNC;
89 break;
90 case COL_NAME_KEYWORD:
91 keyword.category = PGKeywordCategory::PG_KEYWORD_COL_NAME;
92 break;
93 }
94 result.push_back(x: keyword);
95 }
96 return result;
97}
98
99std::vector<PGSimplifiedToken> tokenize(const char *str) {
100 core_yyscan_t yyscanner;
101 base_yy_extra_type yyextra;
102
103 std::vector<PGSimplifiedToken> result;
104 yyscanner = scanner_init(str, yyext: &yyextra.core_yy_extra, keywords: ScanKeywords, num_keywords: NumScanKeywords);
105 yyextra.have_lookahead = false;
106
107 while(true) {
108 YYSTYPE type;
109 YYLTYPE loc;
110 int token;
111 try {
112 token = base_yylex(lvalp: &type, llocp: &loc, yyscanner);
113 } catch(...) {
114 token = 0;
115 }
116 if (token == 0) {
117 break;
118 }
119 PGSimplifiedToken current_token;
120 switch(token) {
121 case IDENT:
122 current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_IDENTIFIER;
123 break;
124 case ICONST:
125 case FCONST:
126 current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_NUMERIC_CONSTANT;
127 break;
128 case SCONST:
129 case BCONST:
130 case XCONST:
131 current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_STRING_CONSTANT;
132 break;
133 case Op:
134 case PARAM:
135 case COLON_EQUALS:
136 case EQUALS_GREATER:
137 case LESS_EQUALS:
138 case GREATER_EQUALS:
139 case NOT_EQUALS:
140 current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_OPERATOR;
141 break;
142 default:
143 if (token >= 255) {
144 // non-ascii value, probably a keyword
145 current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_KEYWORD;
146 } else {
147 // ascii value, probably an operator
148 current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_OPERATOR;
149 }
150 break;
151 }
152 current_token.start = loc;
153 result.push_back(x: current_token);
154 }
155
156 scanner_finish(yyscanner);
157 return result;
158}
159
160
161
162/*
163 * Intermediate filter between parser and core lexer (core_yylex in scan.l).
164 *
165 * This filter is needed because in some cases the standard SQL grammar
166 * requires more than one token lookahead. We reduce these cases to one-token
167 * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
168 *
169 * Using a filter is simpler than trying to recognize multiword tokens
170 * directly in scan.l, because we'd have to allow for comments between the
171 * words. Furthermore it's not clear how to do that without re-introducing
172 * scanner backtrack, which would cost more performance than this filter
173 * layer does.
174 *
175 * The filter also provides a convenient place to translate between
176 * the core_YYSTYPE and YYSTYPE representations (which are really the
177 * same thing anyway, but notationally they're different).
178 */
179int base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) {
180 base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
181 int cur_token;
182 int next_token;
183 int cur_token_length;
184 YYLTYPE cur_yylloc;
185
186 /* Get next token --- we might already have it */
187 if (yyextra->have_lookahead) {
188 cur_token = yyextra->lookahead_token;
189 lvalp->core_yystype = yyextra->lookahead_yylval;
190 *llocp = yyextra->lookahead_yylloc;
191 *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
192 yyextra->have_lookahead = false;
193 } else
194 cur_token = core_yylex(lvalp: &(lvalp->core_yystype), llocp, yyscanner);
195
196 /*
197 * If this token isn't one that requires lookahead, just return it. If it
198 * does, determine the token length. (We could get that via strlen(), but
199 * since we have such a small set of possibilities, hardwiring seems
200 * feasible and more efficient.)
201 */
202 switch (cur_token) {
203 case NOT:
204 cur_token_length = 3;
205 break;
206 case NULLS_P:
207 cur_token_length = 5;
208 break;
209 case WITH:
210 cur_token_length = 4;
211 break;
212 default:
213 return cur_token;
214 }
215
216 /*
217 * Identify end+1 of current token. core_yylex() has temporarily stored a
218 * '\0' here, and will undo that when we call it again. We need to redo
219 * it to fully revert the lookahead call for error reporting purposes.
220 */
221 yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf + *llocp + cur_token_length;
222 Assert(*(yyextra->lookahead_end) == '\0');
223
224 /*
225 * Save and restore *llocp around the call. It might look like we could
226 * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
227 * does not work because flex actually holds onto the last-passed pointer
228 * internally, and will use that for error reporting. We need any error
229 * reports to point to the current token, not the next one.
230 */
231 cur_yylloc = *llocp;
232
233 /* Get next token, saving outputs into lookahead variables */
234 next_token = core_yylex(lvalp: &(yyextra->lookahead_yylval), llocp, yyscanner);
235 yyextra->lookahead_token = next_token;
236 yyextra->lookahead_yylloc = *llocp;
237
238 *llocp = cur_yylloc;
239
240 /* Now revert the un-truncation of the current token */
241 yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
242 *(yyextra->lookahead_end) = '\0';
243
244 yyextra->have_lookahead = true;
245
246 /* Replace cur_token if needed, based on lookahead */
247 switch (cur_token) {
248 case NOT:
249 /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
250 switch (next_token) {
251 case BETWEEN:
252 case IN_P:
253 case LIKE:
254 case ILIKE:
255 case SIMILAR:
256 cur_token = NOT_LA;
257 break;
258 }
259 break;
260
261 case NULLS_P:
262 /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
263 switch (next_token) {
264 case FIRST_P:
265 case LAST_P:
266 cur_token = NULLS_LA;
267 break;
268 }
269 break;
270
271 case WITH:
272 /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
273 switch (next_token) {
274 case TIME:
275 case ORDINALITY:
276 cur_token = WITH_LA;
277 break;
278 }
279 break;
280 }
281
282 return cur_token;
283}
284
285}