/*
 * Internal yylex function.  This wraps the core lexer and adds one feature:
 * a token pushback stack.  We also make a couple of trivial single-token
 * translations from what the core lexer does to what we want, in particular
 * interfacing from the core_YYSTYPE to YYSTYPE union.
 */
static int
internal_yylex(TokenAuxData *auxdata)
{
    int         token;
    const char *yytext;

    if (num_pushbacks > 0)
    {
        num_pushbacks--;
        token = pushback_token[num_pushbacks];
        *auxdata = pushback_auxdata[num_pushbacks];
    }
    else
    {
        token = core_yylex(&auxdata->lval.core_yystype,
                           &auxdata->lloc,
                           yyscanner);

        /* remember the length of yytext before it gets changed */
        yytext = core_yy.scanbuf + auxdata->lloc;
        auxdata->leng = strlen(yytext);

        /* Check for << >> and #, which the core considers operators */
        if (token == Op)
        {
            if (strcmp(auxdata->lval.str, "<<") == 0)
                token = LESS_LESS;
            else if (strcmp(auxdata->lval.str, ">>") == 0)
                token = GREATER_GREATER;
            else if (strcmp(auxdata->lval.str, "#") == 0)
                token = '#';
        }

        /* The core returns PARAM as ival, but we treat it like IDENT */
        else if (token == PARAM)
        {
            auxdata->lval.str = pstrdup(yytext);
        }
    }

    return token;
}
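/*
 * Illustrative sketch (not part of the original source): a minimal,
 * self-contained version of the pushback mechanism used above.  The names
 * (push_back_token, simple_lex, pushback_stack) are hypothetical and an int
 * payload stands in for TokenAuxData; the point is only to show how a filter
 * can "un-read" a token so that a later lex call returns it before reading
 * any new input.
 */
#define MAX_PUSHBACKS 4

static int  pushback_stack[MAX_PUSHBACKS];
static int  num_pushed;

/* Un-read a token; the next call to simple_lex() will return it first */
static void
push_back_token(int token)
{
    if (num_pushed < MAX_PUSHBACKS)
        pushback_stack[num_pushed++] = token;
}

/* Return a pushed-back token if any, otherwise fetch a fresh one */
static int
simple_lex(int (*core_lex) (void))
{
    if (num_pushed > 0)
        return pushback_stack[--num_pushed];
    return core_lex();
}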
/*
 * norm_yylex: core_yylex with some tokens replaced.
 */
static int
norm_yylex(char *str, core_YYSTYPE *yylval, YYLTYPE *yylloc,
           core_yyscan_t yyscanner)
{
    int         tok;

    PG_TRY();
    {
        tok = core_yylex(yylval, yylloc, yyscanner);
    }
    PG_CATCH();
    {
        /*
         * An error can occur while parsing a quoted token that has been
         * chopped off halfway.  Just ignore the rest of this query, even
         * though there might be other reasons for parsing to fail.
         */
        FlushErrorState();
        return -1;
    }
    PG_END_TRY();

    /*
     * A '?' on its own is assumed to be an IDENT.  A real '?' operator
     * would be misidentified here, but that is very unlikely.
     */
    if (tok == Op && str[*yylloc] == '?' &&
        strchr(OPCHARS, str[*yylloc + 1]) == NULL)
        tok = SCONST;

    /*
     * Replace the token with '=' if the operator consists solely of two or
     * more opchars.  Assuming that opchars never combine with non-opchars
     * to form a token, checking the first character is sufficient.
     */
    if (tok == Op && strchr(OPCHARS, str[*yylloc]) != NULL)
        tok = '=';

    return tok;
}
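/*
 * Illustrative sketch (not part of the original source): the normalization
 * rule applied above, written as a standalone helper with a hypothetical
 * name (normalize_operator_token).  The opchar set is passed in rather than
 * assuming the exact OPCHARS definition, and every character is checked
 * instead of relying on the first-character shortcut the code above takes.
 */
#include <string.h>

static int
normalize_operator_token(const char *token_text, const char *opchars,
                         int original_token)
{
    const char *p;

    for (p = token_text; *p != '\0'; p++)
    {
        if (strchr(opchars, *p) == NULL)
            return original_token;  /* contains non-opchars: leave it alone */
    }
    return '=';                     /* pure operator text: collapse to '=' */
}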
/*
 * Intermediate filter between parser and core lexer (core_yylex in scan.l).
 *
 * This filter is needed because in some cases the standard SQL grammar
 * requires more than one token lookahead.  We reduce these cases to one-token
 * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
 *
 * Using a filter is simpler than trying to recognize multiword tokens
 * directly in scan.l, because we'd have to allow for comments between the
 * words.  Furthermore it's not clear how to do that without re-introducing
 * scanner backtrack, which would cost more performance than this filter
 * layer does.
 *
 * The filter also provides a convenient place to translate between
 * the core_YYSTYPE and YYSTYPE representations (which are really the
 * same thing anyway, but notationally they're different).
 */
int
base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
{
    base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
    int         cur_token;
    int         next_token;
    int         cur_token_length;
    YYLTYPE     cur_yylloc;

    /* Get next token --- we might already have it */
    if (yyextra->have_lookahead)
    {
        cur_token = yyextra->lookahead_token;
        lvalp->core_yystype = yyextra->lookahead_yylval;
        *llocp = yyextra->lookahead_yylloc;
        *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
        yyextra->have_lookahead = false;
    }
    else
        cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);

    /*
     * If this token isn't one that requires lookahead, just return it.  If it
     * does, determine the token length.  (We could get that via strlen(), but
     * since we have such a small set of possibilities, hardwiring seems
     * feasible and more efficient.)
     */
    switch (cur_token)
    {
        case NOT:
            cur_token_length = 3;
            break;
        case NULLS_P:
            cur_token_length = 5;
            break;
        case WITH:
            cur_token_length = 4;
            break;
        default:
            return cur_token;
    }

    /*
     * Identify end+1 of current token.  core_yylex() has temporarily stored a
     * '\0' here, and will undo that when we call it again.  We need to redo
     * it to fully revert the lookahead call for error reporting purposes.
     */
    yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
        *llocp + cur_token_length;
    Assert(*(yyextra->lookahead_end) == '\0');

    /*
     * Save and restore *llocp around the call.  It might look like we could
     * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
     * does not work because flex actually holds onto the last-passed pointer
     * internally, and will use that for error reporting.  We need any error
     * reports to point to the current token, not the next one.
     */
    cur_yylloc = *llocp;

    /* Get next token, saving outputs into lookahead variables */
    next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
    yyextra->lookahead_token = next_token;
    yyextra->lookahead_yylloc = *llocp;

    *llocp = cur_yylloc;

    /* Now revert the un-truncation of the current token */
    yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
    *(yyextra->lookahead_end) = '\0';

    yyextra->have_lookahead = true;

    /* Replace cur_token if needed, based on lookahead */
    switch (cur_token)
    {
        case NOT:
            /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
            switch (next_token)
            {
                case BETWEEN:
                case IN_P:
                case LIKE:
                case ILIKE:
                case SIMILAR:
                    cur_token = NOT_LA;
                    break;
            }
            break;

        case NULLS_P:
            /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
            switch (next_token)
            {
                case FIRST_P:
                case LAST_P:
                    cur_token = NULLS_LA;
                    break;
            }
            break;

        case WITH:
            /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
            switch (next_token)
            {
                case TIME:
                case ORDINALITY:
                    cur_token = WITH_LA;
                    break;
            }
            break;
    }

    return cur_token;
}
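/*
 * Illustrative sketch (not part of the original source): the "hold
 * character" trick used by base_yylex above, demonstrated on a plain buffer
 * with hypothetical variable names.  The byte just past the current token is
 * overwritten with '\0' while the lookahead token is being held, so that the
 * buffer still reads as the current token for error reporting, and the saved
 * byte is put back before the lookahead is handed out.
 */
#include <stdio.h>

int
main(void)
{
    char        scanbuf[] = "NULLS FIRST";
    int         cur_token_length = 5;       /* length of "NULLS" */
    char       *lookahead_end = scanbuf + cur_token_length;
    char        lookahead_hold_char;

    /* truncate the buffer so that only the current token is visible */
    lookahead_hold_char = *lookahead_end;
    *lookahead_end = '\0';
    printf("current token text: %s\n", scanbuf);    /* prints "NULLS" */

    /* later, when the lookahead token is consumed, undo the truncation */
    *lookahead_end = lookahead_hold_char;
    printf("full buffer again:  %s\n", scanbuf);    /* prints "NULLS FIRST" */
    return 0;
}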
/*
 * Given a valid SQL string and an array of constant-location records,
 * fill in the textual lengths of those constants.
 *
 * The constants may use any allowed constant syntax, such as float literals,
 * bit-strings, single-quoted strings and dollar-quoted strings.  This is
 * accomplished by using the public API for the core scanner.
 *
 * It is the caller's job to ensure that the string is a valid SQL statement
 * with constants at the indicated locations.  Since in practice the string
 * has already been parsed, and the locations that the caller provides will
 * have originated from within the authoritative parser, this should not be
 * a problem.
 *
 * Duplicate constant pointers are possible, and will have their lengths
 * marked as '-1', so that they are later ignored.  (Actually, we assume the
 * lengths were initialized as -1 to start with, and don't change them here.)
 *
 * N.B. There is an assumption that a '-' character at a Const location begins
 * a negative numeric constant.  This precludes there ever being another
 * reason for a constant to start with a '-'.
 */
static void
fill_in_constant_lengths(pgssConstLocations *jstate, const char *query)
{
    pgssLocationLen *locs;
    core_yyscan_t yyscanner;
    core_yy_extra_type yyextra;
    core_YYSTYPE yylval;
    YYLTYPE     yylloc;
    int         last_loc = -1;
    int         i;

    /*
     * Sort the records by location so that we can process them in order
     * while scanning the query text.
     */
    if (jstate->clocations_count > 1)
        qsort(jstate->clocations, jstate->clocations_count,
              sizeof(pgssLocationLen), comp_location);
    locs = jstate->clocations;

    /* initialize the flex scanner --- should match raw_parser() */
    yyscanner = scanner_init(query,
                             &yyextra,
                             ScanKeywords,
                             NumScanKeywords);

    /* Search for each constant, in sequence */
    for (i = 0; i < jstate->clocations_count; i++)
    {
        int         loc = locs[i].location;
        int         tok;

        Assert(loc >= 0);

        if (loc <= last_loc)
            continue;           /* Duplicate constant, ignore */

        /* Lex tokens until we find the desired constant */
        for (;;)
        {
            tok = core_yylex(&yylval, &yylloc, yyscanner);

            /* We should not hit end-of-string, but if we do, behave sanely */
            if (tok == 0)
                break;          /* out of inner for-loop */

            /*
             * We should find the token position exactly, but if we somehow
             * run past it, work with that.
             */
            if (yylloc >= loc)
            {
                if (query[loc] == '-')
                {
                    /*
                     * It's a negative value - this is the one and only case
                     * where we replace more than a single token.
                     *
                     * Do not compensate for the core system's special-case
                     * adjustment of location to that of the leading '-'
                     * operator in the event of a negative constant.  It is
                     * also useful for our purposes to start from the minus
                     * symbol.  In this way, queries like "select * from foo
                     * where bar = 1" and "select * from foo where bar = -2"
                     * will have identical normalized query strings.
                     */
                    tok = core_yylex(&yylval, &yylloc, yyscanner);
                    if (tok == 0)
                        break;  /* out of inner for-loop */
                }

                /*
                 * We now rely on the assumption that flex has placed a zero
                 * byte after the text of the current token in scanbuf.
                 */
                locs[i].length = (int) strlen(yyextra.scanbuf + loc);

                /*
                 * Quoted string with Unicode escapes
                 *
                 * The lexer consumes trailing whitespace in order to find
                 * UESCAPE, but if there is no UESCAPE it has still consumed
                 * that whitespace - don't include it in the constant's
                 * length.
                 */
                if (locs[i].length > 4 &&   /* U&'' */
                    (yyextra.scanbuf[loc] == 'u' ||
                     yyextra.scanbuf[loc] == 'U') &&
                    yyextra.scanbuf[loc + 1] == '&' &&
                    yyextra.scanbuf[loc + 2] == '\'')
                {
                    int         j = locs[i].length - 1; /* Skip the \0 */

                    for (; j >= 0 &&
                         scanner_isspace(yyextra.scanbuf[loc + j]);
                         j--)
                        ;
                    locs[i].length = j + 1; /* Count the \0 */
                }

                break;          /* out of inner for-loop */
            }
        }

        /* If we hit end-of-string, give up, leaving remaining lengths -1 */
        if (tok == 0)
            break;

        last_loc = loc;
    }

    scanner_finish(yyscanner);
}
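/*
 * Illustrative sketch (not part of the original source): one plausible
 * consumer of the (location, length) pairs filled in above.  The names
 * (ConstSpan, normalize_query_sketch) are hypothetical; the function simply
 * copies the query text and substitutes a '?' placeholder for each constant
 * whose length is known, which is how a normalized query string can be
 * produced from these records.  Spans are assumed to be sorted by location,
 * with length -1 marking duplicates to be skipped.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct ConstSpan
{
    int         location;      /* byte offset of the constant */
    int         length;        /* length in bytes, or -1 if unknown */
} ConstSpan;

static char *
normalize_query_sketch(const char *query, const ConstSpan *spans, int nspans)
{
    char       *out = malloc(strlen(query) + 1);    /* output never grows */
    int         src = 0;
    int         dst = 0;
    int         i;

    for (i = 0; i < nspans; i++)
    {
        if (spans[i].length < 0)
            continue;           /* duplicate or unknown: leave text as-is */
        /* copy text up to the constant, then emit a placeholder */
        while (src < spans[i].location)
            out[dst++] = query[src++];
        out[dst++] = '?';
        src += spans[i].length; /* skip over the constant text */
    }
    while (query[src] != '\0')
        out[dst++] = query[src++];
    out[dst] = '\0';
    return out;
}

int
main(void)
{
    ConstSpan   spans[] = {{26, 1}};    /* the "1" in the query below */
    char       *norm = normalize_query_sketch(
        "select * from foo where x=1", spans, 1);

    printf("%s\n", norm);       /* select * from foo where x=? */
    free(norm);
    return 0;
}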
/*
 * Intermediate filter between parser and core lexer (core_yylex in scan.l).
 *
 * The filter is needed because in some cases the standard SQL grammar
 * requires more than one token lookahead.  We reduce these cases to one-token
 * lookahead by combining tokens here, in order to keep the grammar LALR(1).
 *
 * Using a filter is simpler than trying to recognize multiword tokens
 * directly in scan.l, because we'd have to allow for comments between the
 * words.  Furthermore it's not clear how to do it without re-introducing
 * scanner backtrack, which would cost more performance than this filter
 * layer does.
 *
 * The filter also provides a convenient place to translate between
 * the core_YYSTYPE and YYSTYPE representations (which are really the
 * same thing anyway, but notationally they're different).
 */
int
base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
{
    base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
    int         cur_token;
    int         next_token;
    core_YYSTYPE cur_yylval;
    YYLTYPE     cur_yylloc;

    /* Get next token --- we might already have it */
    if (yyextra->have_lookahead)
    {
        cur_token = yyextra->lookahead_token;
        lvalp->core_yystype = yyextra->lookahead_yylval;
        *llocp = yyextra->lookahead_yylloc;
        yyextra->have_lookahead = false;
    }
    else
        cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);

    /* Do we need to look ahead for a possible multiword token? */
    switch (cur_token)
    {
        case NULLS_P:
            /*
             * NULLS FIRST and NULLS LAST must be reduced to one token
             */
            cur_yylval = lvalp->core_yystype;
            cur_yylloc = *llocp;
            next_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
            switch (next_token)
            {
                case FIRST_P:
                    cur_token = NULLS_FIRST;
                    break;
                case LAST_P:
                    cur_token = NULLS_LAST;
                    break;
                default:
                    /* save the lookahead token for next time */
                    yyextra->lookahead_token = next_token;
                    yyextra->lookahead_yylval = lvalp->core_yystype;
                    yyextra->lookahead_yylloc = *llocp;
                    yyextra->have_lookahead = true;
                    /* and back up the output info to cur_token */
                    lvalp->core_yystype = cur_yylval;
                    *llocp = cur_yylloc;
                    break;
            }
            break;

        case WITH:
            /*
             * WITH TIME must be reduced to one token
             */
            cur_yylval = lvalp->core_yystype;
            cur_yylloc = *llocp;
            next_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
            switch (next_token)
            {
                case TIME:
                    cur_token = WITH_TIME;
                    break;
                default:
                    /* save the lookahead token for next time */
                    yyextra->lookahead_token = next_token;
                    yyextra->lookahead_yylval = lvalp->core_yystype;
                    yyextra->lookahead_yylloc = *llocp;
                    yyextra->have_lookahead = true;
                    /* and back up the output info to cur_token */
                    lvalp->core_yystype = cur_yylval;
                    *llocp = cur_yylloc;
                    break;
            }
            break;

        default:
            break;
    }

    return cur_token;
}
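/*
 * Illustrative sketch (not part of the original source): the same
 * one-token-lookahead idea applied end-to-end to a toy token stream, with
 * hypothetical token codes and a filter_lex_sketch() wrapper.  When TOK_NULLS
 * is followed by TOK_FIRST the two are combined into TOK_NULLS_FIRST;
 * otherwise the second token is stashed and handed out on the next call,
 * just as the filters above stash their lookahead in yyextra.
 */
#include <stdio.h>

enum
{
    TOK_EOF = 0, TOK_NULLS, TOK_FIRST, TOK_IDENT, TOK_NULLS_FIRST
};

static const int fake_stream[] = {TOK_NULLS, TOK_FIRST, TOK_NULLS, TOK_IDENT, TOK_EOF};
static int  stream_pos;

static int  lookahead_token;
static int  have_lookahead;

/* stand-in for the core lexer: hand out the canned token stream */
static int
core_lex_sketch(void)
{
    return fake_stream[stream_pos++];
}

static int
filter_lex_sketch(void)
{
    int         cur;
    int         next;

    if (have_lookahead)
    {
        have_lookahead = 0;
        cur = lookahead_token;
    }
    else
        cur = core_lex_sketch();

    if (cur == TOK_NULLS)
    {
        next = core_lex_sketch();
        if (next == TOK_FIRST)
            cur = TOK_NULLS_FIRST;      /* combine the two words */
        else
        {
            lookahead_token = next;     /* save it for the next call */
            have_lookahead = 1;
        }
    }
    return cur;
}

int
main(void)
{
    int         tok;

    /* prints 4 (NULLS_FIRST), 1 (NULLS), 3 (IDENT) */
    while ((tok = filter_lex_sketch()) != TOK_EOF)
        printf("token %d\n", tok);
    return 0;
}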