| 1 | /*-------------------------------------------------------------------- |
| 2 | * Symbols referenced in this file: |
| 3 | * - raw_parser |
| 4 | * - base_yylex |
| 5 | * - raw_parser |
| 6 | *-------------------------------------------------------------------- |
| 7 | */ |
| 8 | |
| 9 | /*------------------------------------------------------------------------- |
| 10 | * |
| 11 | * parser.c |
| 12 | * Main entry point/driver for PostgreSQL grammar |
| 13 | * |
| 14 | * Note that the grammar is not allowed to perform any table access |
| 15 | * (since we need to be able to do basic parsing even while inside an |
| 16 | * aborted transaction). Therefore, the data structures returned by |
| 17 | * the grammar are "raw" parsetrees that still need to be analyzed by |
| 18 | * analyze.c and related files. |
| 19 | * |
| 20 | * |
| 21 |  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group |
| 22 | * Portions Copyright (c) 1994, Regents of the University of California |
| 23 | * |
| 24 | * IDENTIFICATION |
| 25 | * src/backend/parser/parser.c |
| 26 | * |
| 27 | *------------------------------------------------------------------------- |
| 28 | */ |
| 29 | |
| 30 | #include "pg_functions.hpp" |
| 31 | |
| 32 | #include "parser/gramparse.hpp" |
| 33 | #include "parser/parser.hpp" |
| 34 | #include "parser/kwlist.hpp" |
| 35 | |
| 36 | namespace duckdb_libpgquery { |
| 37 | |
| 38 | /* |
| 39 | * raw_parser |
| 40 | * Given a query in string form, do lexical and grammatical analysis. |
| 41 | * |
| 42 | * Returns a list of raw (un-analyzed) parse trees. The immediate elements |
| 43 | * of the list are always PGRawStmt nodes. |
| 44 | */ |
| 45 | PGList *raw_parser(const char *str) { |
| 46 | core_yyscan_t yyscanner; |
| 47 | base_yy_extra_type ; |
| 48 | int yyresult; |
| 49 | |
| 50 | /* initialize the flex scanner */ |
| 51 | yyscanner = scanner_init(str, yyext: &yyextra.core_yy_extra, keywords: ScanKeywords, num_keywords: NumScanKeywords); |
| 52 | |
| 53 | /* base_yylex() only needs this much initialization */ |
| 54 | yyextra.have_lookahead = false; |
| 55 | |
| 56 | /* initialize the bison parser */ |
| 57 | parser_init(yyext: &yyextra); |
| 58 | |
| 59 | /* Parse! */ |
| 60 | yyresult = base_yyparse(yyscanner); |
| 61 | |
| 62 | /* Clean up (release memory) */ |
| 63 | scanner_finish(yyscanner); |
| 64 | |
| 65 | if (yyresult) /* error */ |
| 66 | return NIL; |
| 67 | |
| 68 | return yyextra.parsetree; |
| 69 | } |
| 70 | |
| 71 | bool is_keyword(const char *text) { |
| 72 | return ScanKeywordLookup(text, keywords: ScanKeywords, num_keywords: NumScanKeywords) != NULL; |
| 73 | } |
| 74 | |
| 75 | std::vector<PGKeyword> keyword_list() { |
| 76 | std::vector<PGKeyword> result; |
| 77 | for(size_t i = 0; i < NumScanKeywords; i++) { |
| 78 | PGKeyword keyword; |
| 79 | keyword.text = ScanKeywords[i].name; |
| 80 | switch(ScanKeywords[i].category) { |
| 81 | case UNRESERVED_KEYWORD: |
| 82 | keyword.category = PGKeywordCategory::PG_KEYWORD_UNRESERVED; |
| 83 | break; |
| 84 | case RESERVED_KEYWORD: |
| 85 | keyword.category = PGKeywordCategory::PG_KEYWORD_RESERVED; |
| 86 | break; |
| 87 | case TYPE_FUNC_NAME_KEYWORD: |
| 88 | keyword.category = PGKeywordCategory::PG_KEYWORD_TYPE_FUNC; |
| 89 | break; |
| 90 | case COL_NAME_KEYWORD: |
| 91 | keyword.category = PGKeywordCategory::PG_KEYWORD_COL_NAME; |
| 92 | break; |
| 93 | } |
| 94 | result.push_back(x: keyword); |
| 95 | } |
| 96 | return result; |
| 97 | } |
| 98 | |
| 99 | std::vector<PGSimplifiedToken> tokenize(const char *str) { |
| 100 | core_yyscan_t yyscanner; |
| 101 | base_yy_extra_type ; |
| 102 | |
| 103 | std::vector<PGSimplifiedToken> result; |
| 104 | yyscanner = scanner_init(str, yyext: &yyextra.core_yy_extra, keywords: ScanKeywords, num_keywords: NumScanKeywords); |
| 105 | yyextra.have_lookahead = false; |
| 106 | |
| 107 | while(true) { |
| 108 | YYSTYPE type; |
| 109 | YYLTYPE loc; |
| 110 | int token; |
| 111 | try { |
| 112 | token = base_yylex(lvalp: &type, llocp: &loc, yyscanner); |
| 113 | } catch(...) { |
| 114 | token = 0; |
| 115 | } |
| 116 | if (token == 0) { |
| 117 | break; |
| 118 | } |
| 119 | PGSimplifiedToken current_token; |
| 120 | switch(token) { |
| 121 | case IDENT: |
| 122 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_IDENTIFIER; |
| 123 | break; |
| 124 | case ICONST: |
| 125 | case FCONST: |
| 126 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_NUMERIC_CONSTANT; |
| 127 | break; |
| 128 | case SCONST: |
| 129 | case BCONST: |
| 130 | case XCONST: |
| 131 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_STRING_CONSTANT; |
| 132 | break; |
| 133 | case Op: |
| 134 | case PARAM: |
| 135 | case COLON_EQUALS: |
| 136 | case EQUALS_GREATER: |
| 137 | case LESS_EQUALS: |
| 138 | case GREATER_EQUALS: |
| 139 | case NOT_EQUALS: |
| 140 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_OPERATOR; |
| 141 | break; |
| 142 | default: |
| 143 | if (token >= 255) { |
| 144 | // non-ascii value, probably a keyword |
| 145 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_KEYWORD; |
| 146 | } else { |
| 147 | // ascii value, probably an operator |
| 148 | current_token.type = PGSimplifiedTokenType::PG_SIMPLIFIED_TOKEN_OPERATOR; |
| 149 | } |
| 150 | break; |
| 151 | } |
| 152 | current_token.start = loc; |
| 153 | result.push_back(x: current_token); |
| 154 | } |
| 155 | |
| 156 | scanner_finish(yyscanner); |
| 157 | return result; |
| 158 | } |
| 159 | |
| 160 | |
| 161 | |
| 162 | /* |
| 163 | * Intermediate filter between parser and core lexer (core_yylex in scan.l). |
| 164 | * |
| 165 | * This filter is needed because in some cases the standard SQL grammar |
| 166 | * requires more than one token lookahead. We reduce these cases to one-token |
| 167 | * lookahead by replacing tokens here, in order to keep the grammar LALR(1). |
| 168 | * |
| 169 | * Using a filter is simpler than trying to recognize multiword tokens |
| 170 | * directly in scan.l, because we'd have to allow for comments between the |
| 171 | * words. Furthermore it's not clear how to do that without re-introducing |
| 172 | * scanner backtrack, which would cost more performance than this filter |
| 173 | * layer does. |
| 174 | * |
| 175 | * The filter also provides a convenient place to translate between |
| 176 | * the core_YYSTYPE and YYSTYPE representations (which are really the |
| 177 | * same thing anyway, but notationally they're different). |
| 178 | */ |
| 179 | int base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) { |
| 180 | base_yy_extra_type * = pg_yyget_extra(yyscanner); |
| 181 | int cur_token; |
| 182 | int next_token; |
| 183 | int cur_token_length; |
| 184 | YYLTYPE cur_yylloc; |
| 185 | |
| 186 | /* Get next token --- we might already have it */ |
| 187 | if (yyextra->have_lookahead) { |
| 188 | cur_token = yyextra->lookahead_token; |
| 189 | lvalp->core_yystype = yyextra->lookahead_yylval; |
| 190 | *llocp = yyextra->lookahead_yylloc; |
| 191 | *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; |
| 192 | yyextra->have_lookahead = false; |
| 193 | } else |
| 194 | cur_token = core_yylex(lvalp: &(lvalp->core_yystype), llocp, yyscanner); |
| 195 | |
| 196 | /* |
| 197 | * If this token isn't one that requires lookahead, just return it. If it |
| 198 | * does, determine the token length. (We could get that via strlen(), but |
| 199 | * since we have such a small set of possibilities, hardwiring seems |
| 200 | * feasible and more efficient.) |
| 201 | */ |
| 202 | switch (cur_token) { |
| 203 | case NOT: |
| 204 | cur_token_length = 3; |
| 205 | break; |
| 206 | case NULLS_P: |
| 207 | cur_token_length = 5; |
| 208 | break; |
| 209 | case WITH: |
| 210 | cur_token_length = 4; |
| 211 | break; |
| 212 | default: |
| 213 | return cur_token; |
| 214 | } |
| 215 | |
| 216 | /* |
| 217 | * Identify end+1 of current token. core_yylex() has temporarily stored a |
| 218 | * '\0' here, and will undo that when we call it again. We need to redo |
| 219 | * it to fully revert the lookahead call for error reporting purposes. |
| 220 | */ |
| 221 | yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf + *llocp + cur_token_length; |
| 222 | Assert(*(yyextra->lookahead_end) == '\0'); |
| 223 | |
| 224 | /* |
| 225 | * Save and restore *llocp around the call. It might look like we could |
| 226 | * avoid this by just passing &lookahead_yylloc to core_yylex(), but that |
| 227 | * does not work because flex actually holds onto the last-passed pointer |
| 228 | * internally, and will use that for error reporting. We need any error |
| 229 | * reports to point to the current token, not the next one. |
| 230 | */ |
| 231 | cur_yylloc = *llocp; |
| 232 | |
| 233 | /* Get next token, saving outputs into lookahead variables */ |
| 234 | next_token = core_yylex(lvalp: &(yyextra->lookahead_yylval), llocp, yyscanner); |
| 235 | yyextra->lookahead_token = next_token; |
| 236 | yyextra->lookahead_yylloc = *llocp; |
| 237 | |
| 238 | *llocp = cur_yylloc; |
| 239 | |
| 240 | /* Now revert the un-truncation of the current token */ |
| 241 | yyextra->lookahead_hold_char = *(yyextra->lookahead_end); |
| 242 | *(yyextra->lookahead_end) = '\0'; |
| 243 | |
| 244 | yyextra->have_lookahead = true; |
| 245 | |
| 246 | /* Replace cur_token if needed, based on lookahead */ |
| 247 | switch (cur_token) { |
| 248 | case NOT: |
| 249 | /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */ |
| 250 | switch (next_token) { |
| 251 | case BETWEEN: |
| 252 | case IN_P: |
| 253 | case LIKE: |
| 254 | case ILIKE: |
| 255 | case SIMILAR: |
| 256 | cur_token = NOT_LA; |
| 257 | break; |
| 258 | } |
| 259 | break; |
| 260 | |
| 261 | case NULLS_P: |
| 262 | /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */ |
| 263 | switch (next_token) { |
| 264 | case FIRST_P: |
| 265 | case LAST_P: |
| 266 | cur_token = NULLS_LA; |
| 267 | break; |
| 268 | } |
| 269 | break; |
| 270 | |
| 271 | case WITH: |
| 272 | /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */ |
| 273 | switch (next_token) { |
| 274 | case TIME: |
| 275 | case ORDINALITY: |
| 276 | cur_token = WITH_LA; |
| 277 | break; |
| 278 | } |
| 279 | break; |
| 280 | } |
| 281 | |
| 282 | return cur_token; |
| 283 | } |
| 284 | |
| 285 | } |