blob: 59aff9c9a7a2f5a6d672ffc434debbd43b0ad638 [file] [log] [blame]
%{
/*-------------------------------------------------------------------------
*
* psqlscan.l
* lexical scanner for psql
*
* This code is mainly needed to determine where the end of a SQL statement
* is: we are looking for semicolons that are not within quotes, comments,
* or parentheses. The most reliable way to handle this is to borrow the
* backend's flex lexer rules, lock, stock, and barrel. The rules below
* are (except for a few) the same as the backend's, but their actions are
* just ECHO whereas the backend's actions generally do other things.
*
* XXX The rules in this file must be kept in sync with the backend lexer!!!
*
* XXX Avoid creating backtracking cases --- see the backend lexer for info.
*
* The most difficult aspect of this code is that we need to work in multibyte
* encodings that are not ASCII-safe. A "safe" encoding is one in which each
* byte of a multibyte character has the high bit set (it's >= 0x80). Since
* all our lexing rules treat all high-bit-set characters alike, we don't
* really need to care whether such a byte is part of a sequence or not.
* In an "unsafe" encoding, we still expect the first byte of a multibyte
* sequence to be >= 0x80, but later bytes might not be. If we scan such
* a sequence as-is, the lexing rules could easily be fooled into matching
* such bytes to ordinary ASCII characters. Our solution for this is to
* substitute 0xFF for each non-first byte within the data presented to flex.
* The flex rules will then pass the FF's through unmolested. The emit()
* subroutine is responsible for looking back to the original string and
* replacing FF's with the corresponding original bytes.
*
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/bin/psql/psqlscan.l
*
*-------------------------------------------------------------------------
*/
#include "postgres_fe.h"
#include "psqlscan.h"
#include <ctype.h>
#include "common.h"
#include "settings.h"
#include "variables.h"
/*
 * Pack a three-part version number into a single integer so that two
 * versions can be compared with ordinary relational operators.  Every
 * argument is parenthesized so that expression arguments (not just plain
 * literals or identifiers) expand with the intended precedence.
 */
#define unify_version(a,b,c) (((a) << 16) + ((b) << 8) + (c))

/*
 * For flex versions below 2.5.35, declare the generated accessor functions
 * ourselves.  NOTE(review): presumably those releases emit the definitions
 * without prototypes; confirm against the flex version in use.
 */
#if unify_version(YY_FLEX_MAJOR_VERSION,YY_FLEX_MINOR_VERSION,YY_FLEX_SUBMINOR_VERSION) < unify_version(2,5,35)
int yyget_lineno (void);
FILE *yyget_in (void);
FILE *yyget_out (void);
int yyget_leng (void);
char *yyget_text (void);
void yyset_lineno (int line_number );
void yyset_in (FILE * in_str );
void yyset_out (FILE * out_str );
int yyget_debug (void);
void yyset_debug (int bdebug );
int yylex_destroy (void);
#endif
/*
* We use a stack of flex buffers to handle substitution of psql variables.
* Each stacked buffer contains the as-yet-unread text from one psql variable.
* When we pop the stack all the way, we resume reading from the outer buffer
* identified by scanbufhandle.
*/
/*
 * One entry in the stack of flex buffers that hold text substituted for a
 * psql variable reference.
 */
typedef struct StackElem
{
YY_BUFFER_STATE buf; /* flex input control structure */
char *bufstring; /* data actually being scanned by flex */
/* When origstring is NULL, the <<EOF>> rule uses bufstring as the reference text */
char *origstring; /* copy of original data, if needed */
char *varname; /* name of variable providing data, or NULL */
struct StackElem *next; /* next entry in the stack, if any */
} StackElem;
/*
* All working state of the lexer must be stored in PsqlScanStateData
* between calls. This allows us to have multiple open lexer operations,
* which is needed for nested include files. The lexer itself is not
* recursive, but it must be re-entrant.
*/
/*
 * Complete working state of one lexing operation.  psql_scan() and the
 * other entry points copy a pointer to this struct into the file-level
 * static cur_state before calling yylex().
 */
typedef struct PsqlScanStateData
{
StackElem *buffer_stack; /* stack of variable expansion buffers */
/*
* These variables always refer to the outer buffer, never to any
* stacked variable-expansion buffer.
*/
/* scanbufhandle is NULL whenever no input line is set up (see psql_scan_setup/finish) */
YY_BUFFER_STATE scanbufhandle;
char *scanbuf; /* start of outer-level input buffer */
const char *scanline; /* current input line at outer level */
/* safe_encoding, curline, refline are used by emit() to replace FFs */
int encoding; /* encoding being used now */
bool safe_encoding; /* is current encoding "safe"? */
const char *curline; /* actual flex input string for cur buf */
const char *refline; /* original data for cur buffer */
/*
* All this state lives across successive input lines, until explicitly
* reset by psql_scan_reset.
*/
int start_state; /* saved YY_START */
int paren_depth; /* depth of nesting in parentheses */
int xcdepth; /* depth of nesting in slash-star comments */
/* dolqstart is a pg_strdup'd copy; it is freed and set to NULL when the quote closes */
char *dolqstart; /* current $foo$ quote start string */
} PsqlScanStateData;
/*
 * File-level statics through which the entry points pass context to the
 * rule actions; each entry point assigns them before calling yylex().
 */
static PsqlScanState cur_state; /* current state while active */
static PQExpBuffer output_buf; /* current output buffer */
/* these variables do not need to be saved across calls */
/* option_type and option_quote are assigned by psql_scan_slash_option() */
static enum slash_option_type option_type;
static char *option_quote;
/* Return values from yylex() */
#define LEXRES_EOL 0 /* end of input */
#define LEXRES_SEMI 1 /* command-terminating semicolon found */
#define LEXRES_BACKSLASH 2 /* backslash command start */
#define LEXRES_OK 3 /* OK completion of backslash argument */
int yylex(void);
static void push_new_buffer(const char *newstr, const char *varname);
static void pop_buffer_stack(PsqlScanState state);
static bool var_is_current_source(PsqlScanState state, const char *varname);
static YY_BUFFER_STATE prepare_buffer(const char *txt, int len,
char **txtcopy);
static void emit(const char *txt, int len);
static void escape_variable(bool as_ident);
/*
 * Replace any existing ECHO definition so that matched text is handed to
 * emit() instead.
 */
#ifdef ECHO
#undef ECHO
#endif
#define ECHO emit(yytext, yyleng)
%}
%option 8bit
%option never-interactive
%option nodefault
%option noinput
%option nounput
%option noyywrap
/*
* All of the following definitions and rules should exactly match
* src/backend/parser/scan.l so far as the flex patterns are concerned.
* The rule bodies are just ECHO as opposed to what the backend does,
* however. (But be sure to duplicate code that affects the lexing process,
* such as BEGIN().) Also, psqlscan uses a single <<EOF>> rule whereas
* scan.l has a separate one for each exclusive state.
*/
/*
* OK, here is a short description of lex/flex rules behavior.
* The longest pattern which matches an input string is always chosen.
* For equal-length patterns, the first occurring in the rules list is chosen.
* INITIAL is the starting state, to which all non-conditional rules apply.
* Exclusive states change parsing rules while the state is active. When in
* an exclusive state, only those rules defined for that state apply.
*
* We use exclusive states for quoted strings, extended comments,
* and to eliminate parsing troubles for numeric strings.
* Exclusive states:
* <xb> bit string literal
* <xc> extended C-style comments
* <xd> delimited identifiers (double-quoted identifiers)
* <xh> hexadecimal numeric string
* <xq> standard quoted strings
* <xe> extended quoted strings (support backslash escape sequences)
* <xdolq> $foo$ quoted strings
* <xui> quoted identifier with Unicode escapes
* <xus> quoted string with Unicode escapes
*
* Note: we intentionally don't mimic the backend's <xeu> state; we have
* no need to distinguish it from <xe> state, and no good way to get out
* of it in error cases. The backend just throws yyerror() in those
* cases, but that's not an option here.
*/
%x xb
%x xc
%x xd
%x xh
%x xe
%x xq
%x xdolq
%x xui
%x xus
/* Additional exclusive states for psql only: lex backslash commands */
%x xslashcmd
%x xslasharg
%x xslashquote
%x xslashbackquote
%x xslashdefaultarg
%x xslashquotedarg
%x xslashwholeline
%x xslashend
/*
* In order to make the world safe for Windows and Mac clients as well as
* Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n
* sequence will be seen as two successive newlines, but that doesn't cause
* any problems. Comments that start with -- and extend to the next
* newline are treated as equivalent to a single whitespace character.
*
* NOTE a fine point: if there is no newline following --, we will absorb
* everything to the end of the input as a comment. This is correct. Older
* versions of Postgres failed to recognize -- as a comment if the input
* did not end with a newline.
*
* XXX perhaps \f (formfeed) should be treated as a newline as well?
*
* XXX if you change the set of whitespace characters, fix scanner_isspace()
* to agree, and see also the plpgsql lexer.
*/
space [ \t\n\r\f]
horiz_space [ \t\f]
newline [\n\r]
non_newline [^\n\r]
comment ("--"{non_newline}*)
whitespace ({space}+|{comment})
/*
* SQL requires at least one newline in the whitespace separating
* string literals that are to be concatenated. Silly, but who are we
* to argue? Note that {whitespace_with_newline} should not have * after
* it, whereas {whitespace} should generally have a * after it...
*/
special_whitespace ({space}+|{comment}{newline})
horiz_whitespace ({horiz_space}|{comment})
whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
/*
* To ensure that {quotecontinue} can be scanned without having to back up
* if the full pattern isn't matched, we include trailing whitespace in
* {quotestop}. This matches all cases where {quotecontinue} fails to match,
* except for {quote} followed by whitespace and just one "-" (not two,
* which would start a {comment}). To cover that we have {quotefail}.
* The actions for {quotestop} and {quotefail} must throw back characters
* beyond the quote proper.
*/
quote '
quotestop {quote}{whitespace}*
quotecontinue {quote}{whitespace_with_newline}{quote}
quotefail {quote}{whitespace}*"-"
/* Bit string
* It is tempting to scan the string for only those characters
* which are allowed. However, this leads to silently swallowed
* characters if illegal characters are included in the string.
* For example, if xbinside is [01] then B'ABCD' is interpreted
* as a zero-length string, and the ABCD' is lost!
* Better to pass the string forward and let the input routines
* validate the contents.
*/
xbstart [bB]{quote}
xbinside [^']*
/* Hexadecimal number */
xhstart [xX]{quote}
xhinside [^']*
/* National character */
xnstart [nN]{quote}
/* Quoted string that allows backslash escapes */
xestart [eE]{quote}
xeinside [^\\']+
xeescape [\\][^0-7]
xeoctesc [\\][0-7]{1,3}
xehexesc [\\]x[0-9A-Fa-f]{1,2}
xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
xeunicodefail [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
/* Extended quote
* xqdouble implements embedded quote, ''''
*/
xqstart {quote}
xqdouble {quote}{quote}
xqinside [^']+
/* $foo$ style quotes ("dollar quoting")
* The quoted string starts with $foo$ where "foo" is an optional string
* in the form of an identifier, except that it may not contain "$",
* and extends to the first occurrence of an identical string.
* There is *no* processing of the quoted text.
*
* {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
* fails to match its trailing "$".
*/
dolq_start [A-Za-z\200-\377_]
dolq_cont [A-Za-z\200-\377_0-9]
dolqdelim \$({dolq_start}{dolq_cont}*)?\$
dolqfailed \${dolq_start}{dolq_cont}*
dolqinside [^$]+
/* Double quote
* Allows embedded spaces and other special characters into identifiers.
*/
dquote \"
xdstart {dquote}
xdstop {dquote}
xddouble {dquote}{dquote}
xdinside [^"]+
/* Unicode escapes */
uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
/* error rule to avoid backup */
uescapefail ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU])
/* Quoted identifier with Unicode escapes */
xuistart [uU]&{dquote}
xuistop1 {dquote}{whitespace}*{uescapefail}?
xuistop2 {dquote}{whitespace}*{uescape}
/* Quoted string with Unicode escapes */
xusstart [uU]&{quote}
xusstop1 {quote}{whitespace}*{uescapefail}?
xusstop2 {quote}{whitespace}*{uescape}
/* error rule to avoid backup */
xufailed [uU]&
/* C-style comments
*
* The "extended comment" syntax closely resembles allowable operator syntax.
* The tricky part here is to get lex to recognize a string starting with
* slash-star as a comment, when interpreting it as an operator would produce
* a longer match --- remember lex will prefer a longer match! Also, if we
* have something like plus-slash-star, lex will think this is a 3-character
* operator whereas we want to see it as a + operator and a comment start.
* The solution is two-fold:
* 1. append {op_chars}* to xcstart so that it matches as much text as
* {operator} would. Then the tie-breaker (first matching rule of same
* length) ensures xcstart wins. We put back the extra stuff with yyless()
* in case it contains a star-slash that should terminate the comment.
* 2. In the operator rule, check for slash-star within the operator, and
* if found throw it back with yyless(). This handles the plus-slash-star
* problem.
* Dash-dash comments have similar interactions with the operator rule.
*/
xcstart \/\*{op_chars}*
xcstop \*+\/
xcinside [^*/]+
digit [0-9]
ident_start [A-Za-z\200-\377_]
ident_cont [A-Za-z\200-\377_0-9\$]
identifier {ident_start}{ident_cont}*
typecast "::"
dot_dot \.\.
colon_equals ":="
/*
* "self" is the set of chars that should be returned as single-character
* tokens. "op_chars" is the set of chars that can make up "Op" tokens,
* which can be one or more characters long (but if a single-char token
* appears in the "self" set, it is not to be returned as an Op). Note
* that the sets overlap, but each has some chars that are not in the other.
*
* If you change either set, adjust the character lists appearing in the
* rule for "operator"!
*/
self [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
op_chars [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
operator {op_chars}+
/* We no longer allow unary minus in numbers.
* Instead we pass it separately to the parser, where it gets
* coerced via doNegate() -- Leon aug 20 1999
*
* {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
*
* {realfail1} and {realfail2} are added to prevent the need for scanner
* backup when the {real} rule fails to match completely.
*/
integer {digit}+
decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*))
decimalfail {digit}+\.\.
real ({integer}|{decimal})[Ee][-+]?{digit}+
realfail1 ({integer}|{decimal})[Ee]
realfail2 ({integer}|{decimal})[Ee][-+]
param \${integer}
other .
/*
* Dollar quoted strings are totally opaque, and no escaping is done on them.
* Other quoted strings must allow some special characters such as single-quote
* and newline.
* Embedded single-quotes are implemented both in the SQL standard
* style of two adjacent single quotes "''" and in the Postgres/Java style
* of escaped-quote "\'".
* Other embedded escaped characters are matched explicitly and the leading
* backslash is dropped from the string.
* Note that xcstart must appear before operator, as explained above!
* Also whitespace (comment) must appear before operator.
*/
%%
{whitespace} {
	/*
	 * {whitespace} matches genuine whitespace as well as
	 * single-line ("--" style) comments.  It is copied to the
	 * output only when the query buffer already holds something
	 * and the match does not begin with a dash.  Hence leading
	 * whitespace is dropped, and so is every single-line
	 * comment; the latter is dubious but historical behavior.
	 */
	if (output_buf->len != 0 && yytext[0] != '-')
		ECHO;
}
{xcstart} {
	/* Outermost comment opener: the nesting depth starts over */
	cur_state->xcdepth = 0;
	BEGIN(xc);
	/* Keep only the two-character opener; the rest is rescanned */
	yyless(2);
	ECHO;
}
<xc>{xcstart} {
	/* An opener inside a comment goes one level deeper */
	cur_state->xcdepth += 1;
	/* Keep only the two-character opener; the rest is rescanned */
	yyless(2);
	ECHO;
}
<xc>{xcstop} {
	/*
	 * A closer pops one nesting level; when no nested level
	 * is open it ends the comment altogether.
	 */
	if (cur_state->xcdepth > 0)
		cur_state->xcdepth -= 1;
	else
		BEGIN(INITIAL);
	ECHO;
}
<xc>{xcinside} |
<xc>{op_chars} |
<xc>\*+ {
	/* Comment text is copied through untouched */
	ECHO;
}
{xbstart} {
/* Opening b' or B': enter the bit-string state and copy the opener */
BEGIN(xb);
ECHO;
}
<xb>{quotestop} |
<xb>{quotefail} {
/* Closing quote: keep just the quote, give back whatever trailed it */
yyless(1);
BEGIN(INITIAL);
ECHO;
}
<xh>{xhinside} |
<xb>{xbinside} {
/* Literal body is copied without validation */
ECHO;
}
<xh>{quotecontinue} |
<xb>{quotecontinue} {
/* Quote, whitespace containing a newline, quote: the literal continues */
ECHO;
}
{xhstart} {
/* Hexadecimal bit type.
* At some point we should simply pass the string
* forward to the parser and label it there.
* In the meantime, place a leading "x" on the string
* to mark it for the input routine as a hex string.
*
* NOTE(review): the remark above appears to be inherited
* from the backend lexer; the action here only enters <xh>
* and echoes the matched text.
*/
BEGIN(xh);
ECHO;
}
<xh>{quotestop} |
<xh>{quotefail} {
/* Closing quote: keep just the quote, give back whatever trailed it */
yyless(1);
BEGIN(INITIAL);
ECHO;
}
{xnstart} {
yyless(1); /* eat only 'n' this time */
ECHO;
}
{xqstart} {
/* A plain quote opens <xq> or <xe>, depending on standard_strings() */
if (standard_strings())
BEGIN(xq);
else
BEGIN(xe);
ECHO;
}
{xestart} {
/* e' or E' always opens an extended (backslash-escape) string */
BEGIN(xe);
ECHO;
}
{xusstart} {
/* u&' or U&' opens a string with Unicode escapes */
BEGIN(xus);
ECHO;
}
<xq,xe>{quotestop} |
<xq,xe>{quotefail} {
/* Closing quote: keep just the quote, give back whatever trailed it */
yyless(1);
BEGIN(INITIAL);
ECHO;
}
<xus>{xusstop1} {
/* Closing quote with no complete UESCAPE clause: keep only the quote */
yyless(1);
BEGIN(INITIAL);
ECHO;
}
<xus>{xusstop2} {
/* Closing quote plus a complete UESCAPE clause: copy all of it */
BEGIN(INITIAL);
ECHO;
}
<xq,xe,xus>{xqdouble} {
/* A doubled quote is an embedded quote, not the end of the string */
ECHO;
}
<xq,xus>{xqinside} {
ECHO;
}
<xe>{xeinside} {
ECHO;
}
<xe>{xeunicode} {
ECHO;
}
<xe>{xeunicodefail} {
ECHO;
}
<xe>{xeescape} {
ECHO;
}
<xe>{xeoctesc} {
ECHO;
}
<xe>{xehexesc} {
ECHO;
}
<xq,xe,xus>{quotecontinue} {
/* Quote, whitespace containing a newline, quote: the literal continues */
ECHO;
}
<xe>. {
/* This is only needed for \ just before EOF */
ECHO;
}
{dolqdelim} {
/* Remember the opening delimiter so the matching closer can be recognized */
cur_state->dolqstart = pg_strdup(yytext);
BEGIN(xdolq);
ECHO;
}
{dolqfailed} {
/* throw back all but the initial "$" */
yyless(1);
ECHO;
}
<xdolq>{dolqdelim} {
/* Only a delimiter identical to the saved opener ends the quote */
if (strcmp(yytext, cur_state->dolqstart) == 0)
{
free(cur_state->dolqstart);
cur_state->dolqstart = NULL;
BEGIN(INITIAL);
}
else
{
/*
* When we fail to match $...$ to dolqstart, transfer
* the $... part to the output, but put back the final
* $ for rescanning. Consider $delim$...$junk$delim$
*/
yyless(yyleng-1);
}
ECHO;
}
<xdolq>{dolqinside} {
ECHO;
}
<xdolq>{dolqfailed} {
ECHO;
}
<xdolq>. {
/* This is only needed for $ inside the quoted text */
ECHO;
}
{xdstart} {
/* A double quote opens a delimited identifier */
BEGIN(xd);
ECHO;
}
{xuistart} {
/* u&" or U&" opens a delimited identifier with Unicode escapes */
BEGIN(xui);
ECHO;
}
<xd>{xdstop} {
BEGIN(INITIAL);
ECHO;
}
<xui>{xuistop1} {
/* Closing quote with no complete UESCAPE clause: keep only the quote */
yyless(1);
BEGIN(INITIAL);
ECHO;
}
<xui>{xuistop2} {
/* Closing quote plus a complete UESCAPE clause: copy all of it */
BEGIN(INITIAL);
ECHO;
}
<xd,xui>{xddouble} {
/* A doubled double-quote is an embedded quote, not the end */
ECHO;
}
<xd,xui>{xdinside} {
ECHO;
}
{xufailed} {
/* throw back all but the initial u/U */
yyless(1);
ECHO;
}
{typecast} {
ECHO;
}
{dot_dot} {
ECHO;
}
{colon_equals} {
ECHO;
}
/*
 * psql-only rules: count parenthesis nesting and detect the semicolon
 * that ends a command.  They are listed ahead of {self} so that they
 * win over it for these single characters.
 */
"(" {
	/* One more level of open parentheses */
	cur_state->paren_depth += 1;
	ECHO;
}
")" {
	/* An unbalanced closer must not drive the depth below zero */
	if (cur_state->paren_depth > 0)
		cur_state->paren_depth -= 1;
	ECHO;
}
";" {
	ECHO;
	/*
	 * Outside all parentheses the semicolon ends the command,
	 * so stop lexing for now and report it.
	 */
	if (cur_state->paren_depth == 0)
		return LEXRES_SEMI;
}
/*
 * psql-only rules for backslash commands and variable substitution.
 * These, too, are listed ahead of {self}.
 */
"\\"[;:] {
	/* Backslash-semicolon or backslash-colon: emit the bare character */
	emit(yytext + 1, 1);
}
"\\" {
	/* Start of a backslash command: stop lexing for now */
	return LEXRES_BACKSLASH;
}
:[A-Za-z0-9_]+ {
	/* Candidate psql variable reference; the name follows the colon */
	const char *varname = yytext + 1;
	const char *value = GetVariable(pset.vars, varname);

	if (value == NULL)
	{
		/* No such variable: copy the text unchanged */
		ECHO;
	}
	else if (var_is_current_source(cur_state, varname))
	{
		/* Expanding it would recurse; complain and copy as is */
		psql_error("skipping recursive expansion of variable \"%s\"\n",
				   varname);
		ECHO;
	}
	else
	{
		/*
		 * Substitute: scan the value in place of the reference.
		 * (yy_scan_string already made the new buffer active.)
		 */
		push_new_buffer(value, varname);
	}
}
:'[A-Za-z0-9_]+' {
	/* Quoted-literal form of a variable reference */
	escape_variable(false);
}
:\"[A-Za-z0-9_]+\" {
	/* Quoted-identifier form of a variable reference */
	escape_variable(true);
}
/*
* Back to backend-compatible rules.
*/
{self} {
	ECHO;
}
{operator} {
	/*
	 * An operator must stop at an embedded slash-star or
	 * dash-dash, since those start comments.  (When either
	 * appears at the very first character, an earlier rule
	 * matches instead of this one.)
	 */
	int			keep = yyleng;
	char	   *comment_at = strstr(yytext, "/*");
	char	   *dashes_at = strstr(yytext, "--");

	/* Of the two possible comment starts, the earlier one counts */
	if (dashes_at != NULL &&
		(comment_at == NULL || dashes_at < comment_at))
		comment_at = dashes_at;
	if (comment_at != NULL)
		keep = comment_at - yytext;

	/*
	 * For SQL compatibility, a multi-char operator may end in
	 * '+' or '-' only if it also contains a char that cannot
	 * occur in SQL operators.  So '=-' lexes as two operators,
	 * while a name such as '?-' stays whole.
	 */
	for (;;)
	{
		int			pos;

		if (keep <= 1)
			break;
		if (yytext[keep - 1] != '+' && yytext[keep - 1] != '-')
			break;

		/* Search the preceding chars for one that makes it OK */
		pos = keep - 2;
		while (pos >= 0 &&
			   strchr("~!@#^&|`?%", yytext[pos]) == NULL)
			pos--;
		if (pos >= 0)
			break;

		/* None found: drop the trailing +/- and look again */
		keep--;
	}

	if (keep < yyleng)
	{
		/* Give the unwanted tail back to the scanner */
		yyless(keep);
	}
	ECHO;
}
{param} {
ECHO;
}
{integer} {
ECHO;
}
{decimal} {
ECHO;
}
{decimalfail} {
/* throw back the .., and treat as integer */
yyless(yyleng-2);
ECHO;
}
{real} {
ECHO;
}
{realfail1} {
/*
* throw back the [Ee], and treat as {decimal}. Note
* that it is possible the input is actually {integer},
* but since this case will almost certainly lead to a
* syntax error anyway, we don't bother to distinguish.
*/
yyless(yyleng-1);
ECHO;
}
{realfail2} {
/* throw back the [Ee][+-], and proceed as above */
yyless(yyleng-2);
ECHO;
}
{identifier} {
ECHO;
}
{other} {
/* Catch-all for any single character no earlier rule matched */
ECHO;
}
/*
* Everything from here down is psql-specific.
*/
<<EOF>> {
	/*
	 * With no variable-expansion buffer stacked, running out of
	 * input means the outer line itself is exhausted.
	 */
	if (cur_state->buffer_stack == NULL)
		return LEXRES_EOL;

	/*
	 * Otherwise a variable's text has been used up: discard its
	 * buffer and carry on with whatever lies beneath it.
	 */
	pop_buffer_stack(cur_state);

	if (cur_state->buffer_stack == NULL)
	{
		/* Nothing left on the stack: back to the outer input */
		yy_switch_to_buffer(cur_state->scanbufhandle);
		cur_state->curline = cur_state->scanbuf;
		cur_state->refline = cur_state->scanline;
	}
	else
	{
		StackElem  *top = cur_state->buffer_stack;

		yy_switch_to_buffer(top->buf);
		cur_state->curline = top->bufstring;
		cur_state->refline = top->origstring ? top->origstring : top->bufstring;
	}
}
/*
* Exclusive lexer states to handle backslash command lexing
*/
<xslashcmd>{
/* command name ends at whitespace or backslash; eat all else */
{space}|"\\" {
/* Leave the terminator unread for whoever scans next */
yyless(0);
return LEXRES_OK;
}
/* GPDB: This rule removes the need for a space after the "!" command */
"!" {
/*
* The "!" becomes part of the command name and ends it.
* Note that this fires on any "!" met while scanning the
* name, not only on one in first position.
*/
ECHO;
return LEXRES_OK;
}
{other} { ECHO; }
}
<xslasharg>{
/* eat any whitespace, then decide what to do at first nonblank */
{space}+ { }
"\\" {
/*
* backslash is end of command or next command, do not eat
*
* XXX this means we can't conveniently accept options
* that start with a backslash; therefore, option
* processing that encourages use of backslashes is rather
* broken.
*/
yyless(0);
return LEXRES_OK;
}
{quote} {
/* Opening single quote: record it and drop it from the output */
*option_quote = '\'';
BEGIN(xslashquote);
}
"`" {
if (option_type == OT_VERBATIM)
{
/* in verbatim mode, backquote is not special */
ECHO;
BEGIN(xslashdefaultarg);
}
else
{
/* Opening backquote: record it and drop it from the output */
*option_quote = '`';
BEGIN(xslashbackquote);
}
}
:[A-Za-z0-9_]* {
/* Possible psql variable substitution */
if (option_type == OT_VERBATIM)
ECHO;
else
{
const char *value;
value = GetVariable(pset.vars, yytext + 1);
/*
* The variable value is just emitted without any
* further examination. This is consistent with the
* pre-8.0 code behavior, if not with the way that
* variables are handled outside backslash commands.
* Note that we needn't guard against recursion here.
*/
if (value)
appendPQExpBufferStr(output_buf, value);
}
/* In either mode the argument ends here and is marked with ':' */
*option_quote = ':';
return LEXRES_OK;
}
:'[A-Za-z0-9_]+' {
/*
* NOTE(review): in OT_VERBATIM mode this only echoes; it
* neither returns nor leaves <xslasharg>, unlike the
* verbatim backquote case and the plain-variable rule.
* Whitespace that follows would then be eaten by the
* {space}+ rule of this state. Confirm this is intended.
*/
if (option_type == OT_VERBATIM)
ECHO;
else
{
escape_variable(false);
return LEXRES_OK;
}
}
:\"[A-Za-z0-9_]+\" {
/* NOTE(review): same verbatim-mode remark as the rule above */
if (option_type == OT_VERBATIM)
ECHO;
else
{
escape_variable(true);
return LEXRES_OK;
}
}
"|" {
ECHO;
if (option_type == OT_FILEPIPE)
{
/* treat like whole-string case */
BEGIN(xslashwholeline);
}
else
{
/* treat like default case */
BEGIN(xslashdefaultarg);
}
}
{dquote} {
/* Double quote is recorded and, unlike the other quotes, kept */
*option_quote = '"';
ECHO;
BEGIN(xslashquotedarg);
}
{other} {
/* Any other first character starts an ordinary argument */
ECHO;
BEGIN(xslashdefaultarg);
}
}
<xslashquote>{
/*
* single-quoted text: copy literally except for '' and backslash
* sequences
*
* The closing quote ends the argument and is not copied. Octal and
* hex escapes are converted with strtol() and narrowed to char.
* Any other backslash sequence yields the character after the
* backslash.
*/
{quote} { return LEXRES_OK; }
{xqdouble} { appendPQExpBufferChar(output_buf, '\''); }
"\\n" { appendPQExpBufferChar(output_buf, '\n'); }
"\\t" { appendPQExpBufferChar(output_buf, '\t'); }
"\\b" { appendPQExpBufferChar(output_buf, '\b'); }
"\\r" { appendPQExpBufferChar(output_buf, '\r'); }
"\\f" { appendPQExpBufferChar(output_buf, '\f'); }
{xeoctesc} {
/* octal case */
appendPQExpBufferChar(output_buf,
(char) strtol(yytext + 1, NULL, 8));
}
{xehexesc} {
/* hex case */
appendPQExpBufferChar(output_buf,
(char) strtol(yytext + 2, NULL, 16));
}
"\\". { emit(yytext + 1, 1); }
{other}|\n { ECHO; }
}
<xslashbackquote>{
/*
* backticked text: copy everything until next backquote or end of line.
* Invocation of the command will happen in psql_scan_slash_option.
*
* The closing backquote ends the argument and is not copied.
*/
"`" { return LEXRES_OK; }
{other}|\n { ECHO; }
}
<xslashdefaultarg>{
/*
* Copy everything until unquoted whitespace or end of line. Quotes
* do not get stripped yet.
*/
{space} {
/* Whitespace ends the argument and is left unread */
yyless(0);
return LEXRES_OK;
}
"\\" {
/*
* unquoted backslash is end of command or next command,
* do not eat
*
* (this was not the behavior pre-8.0, but it seems
* consistent)
*/
yyless(0);
return LEXRES_OK;
}
{dquote} {
/* Double-quoted section inside the argument; the quote is kept */
*option_quote = '"';
ECHO;
BEGIN(xslashquotedarg);
}
{other} { ECHO; }
}
<xslashquotedarg>{
/*
* double-quoted text within a default-type argument: copy
*
* The closing quote is copied too, and scanning then continues in
* <xslashdefaultarg>.
*/
{dquote} {
ECHO;
BEGIN(xslashdefaultarg);
}
{other}|\n { ECHO; }
}
<xslashwholeline>{
/* copy everything until end of input line */
/* but suppress leading whitespace */
{space}+ {
/* Whitespace counts as leading while nothing has been output yet */
if (output_buf->len > 0)
ECHO;
}
{other} { ECHO; }
}
<xslashend>{
/* at end of command, eat a double backslash, but not anything else */
"\\\\" { return LEXRES_OK; }
{other}|\n {
/* Anything else is left unread */
yyless(0);
return LEXRES_OK;
}
}
%%
/*
 * Allocate and initialize a lexer working state struct.
 *
 * The struct comes back zero-filled from pg_malloc_zero() and is then put
 * into its start conditions by psql_scan_reset().  Release it with
 * psql_scan_destroy().
 */
PsqlScanState
psql_scan_create(void)
{
	PsqlScanState state = (PsqlScanStateData *) pg_malloc_zero(sizeof(PsqlScanStateData));

	psql_scan_reset(state);

	return state;
}
/*
 * Release a lexer working state struct and everything it owns.
 *
 * The state pointer is invalid once this returns.
 */
void
psql_scan_destroy(PsqlScanState state)
{
	/* Let go of any input buffers that are still set up */
	psql_scan_finish(state);

	/* Resetting also frees a saved dollar-quote delimiter, if any */
	psql_scan_reset(state);

	free(state);
}
/*
* Set up to perform lexing of the given input line.
*
* The text at *line, extending for line_len bytes, will be scanned by
* subsequent calls to the psql_scan routines. psql_scan_finish should
* be called when scanning is complete. Note that the lexer retains
* a pointer to the storage at *line --- this string must not be altered
* or freed until after psql_scan_finish is called.
*/
void
psql_scan_setup(PsqlScanState state,
const char *line, int line_len)
{
/* Mustn't be scanning already */
psql_assert(state->scanbufhandle == NULL);
psql_assert(state->buffer_stack == NULL);
/* Do we need to hack the character set encoding? */
/* The encoding is sampled from pset once per input line */
state->encoding = pset.encoding;
state->safe_encoding = pg_valid_server_encoding_id(state->encoding);
/* needed for prepare_buffer */
cur_state = state;
/* Set up flex input buffer with appropriate translation and padding */
/* prepare_buffer stores the buffer's text pointer into state->scanbuf */
state->scanbufhandle = prepare_buffer(line, line_len,
&state->scanbuf);
/* Only the caller's pointer is kept here; the text is not copied */
state->scanline = line;
/* Set lookaside data in case we have to map unsafe encoding */
state->curline = state->scanbuf;
state->refline = state->scanline;
}
/*
* Do lexical analysis of SQL command text.
*
* The text previously passed to psql_scan_setup is scanned, and appended
* (possibly with transformation) to query_buf.
*
* The return value indicates the condition that stopped scanning:
*
* PSCAN_SEMICOLON: found a command-ending semicolon. (The semicolon is
* transferred to query_buf.) The command accumulated in query_buf should
* be executed, then clear query_buf and call again to scan the remainder
* of the line.
*
* PSCAN_BACKSLASH: found a backslash that starts a psql special command.
* Any previous data on the line has been transferred to query_buf.
* The caller will typically next call psql_scan_slash_command(),
* perhaps psql_scan_slash_option(), and psql_scan_slash_command_end().
*
* PSCAN_INCOMPLETE: the end of the line was reached, but we have an
* incomplete SQL command. *prompt is set to the appropriate prompt type.
*
* PSCAN_EOL: the end of the line was reached, and there is no lexical
* reason to consider the command incomplete. The caller may or may not
* choose to send it. *prompt is set to the appropriate prompt type if
* the caller chooses to collect more input.
*
* In the PSCAN_INCOMPLETE and PSCAN_EOL cases, psql_scan_finish() should
* be called next, then the cycle may be repeated with a fresh input line.
*
* In all cases, *prompt is set to an appropriate prompt type code for the
* next line-input operation.
*/
PsqlScanResult
psql_scan(PsqlScanState state,
		  PQExpBuffer query_buf,
		  promptStatus_t *prompt)
{
	PsqlScanResult result;
	int			lexresult;

	/* psql_scan_setup() must have been called first */
	psql_assert(state->scanbufhandle);

	/* Hand the per-call context to the rule actions via the statics */
	cur_state = state;
	output_buf = query_buf;

	/* Resume in the innermost variable-expansion buffer, if there is one */
	yy_switch_to_buffer(state->buffer_stack != NULL ?
						state->buffer_stack->buf :
						state->scanbufhandle);

	/* Continue in whatever start condition the previous call ended in */
	BEGIN(state->start_state);

	lexresult = yylex();

	/* Save the start condition for the next call */
	state->start_state = YY_START;

	/* The two "stopped early" outcomes need no further analysis */
	switch (lexresult)
	{
		case LEXRES_SEMI:		/* semicolon */
			*prompt = PROMPT_READY;
			return PSCAN_SEMICOLON;

		case LEXRES_BACKSLASH:	/* backslash */
			*prompt = PROMPT_READY;
			return PSCAN_BACKSLASH;

		case LEXRES_EOL:		/* end of input */
			break;

		default:
			/* can't get here */
			fprintf(stderr, "invalid yylex result\n");
			exit(1);
	}

	/*
	 * End of input.  The start condition we stopped in says whether the
	 * command is lexically complete and which prompt fits.  This switch
	 * must cover all non-slash-command states.
	 */
	result = PSCAN_INCOMPLETE;
	switch (state->start_state)
	{
		case INITIAL:
			if (state->paren_depth > 0)
				*prompt = PROMPT_PAREN;
			else if (query_buf->len > 0)
			{
				result = PSCAN_EOL;
				*prompt = PROMPT_CONTINUE;
			}
			else
			{
				/* never bother to send an empty buffer */
				*prompt = PROMPT_READY;
			}
			break;

		case xb:
		case xh:
		case xe:
		case xq:
		case xus:
			*prompt = PROMPT_SINGLEQUOTE;
			break;

		case xc:
			*prompt = PROMPT_COMMENT;
			break;

		case xd:
		case xui:
			*prompt = PROMPT_DOUBLEQUOTE;
			break;

		case xdolq:
			*prompt = PROMPT_DOLLARQUOTE;
			break;

		default:
			/* can't get here */
			fprintf(stderr, "invalid YY_START\n");
			exit(1);
	}

	return result;
}
/*
* Clean up after scanning a string. This flushes any unread input and
* releases resources (but not the PsqlScanState itself). Note however
* that this does not reset the lexer scan state; that can be done by
* psql_scan_reset(), which is an orthogonal operation.
*
* It is legal to call this when not scanning anything (makes it easier
* to deal with error recovery).
*/
void
psql_scan_finish(PsqlScanState state)
{
	/* Throw away every variable-expansion buffer still on the stack */
	while (state->buffer_stack != NULL)
		pop_buffer_stack(state);

	/* Release the flex buffer for the outer input, if one is set up */
	if (state->scanbufhandle != NULL)
	{
		yy_delete_buffer(state->scanbufhandle);
		state->scanbufhandle = NULL;
	}

	/* free(NULL) is a no-op, so the text copy needs no guard */
	free(state->scanbuf);
	state->scanbuf = NULL;
}
/*
* Reset lexer scanning state to start conditions. This is appropriate
* for executing \r psql commands (or any other time that we discard the
* prior contents of query_buf). It is not, however, necessary to do this
* when we execute and clear the buffer after getting a PSCAN_SEMICOLON or
* PSCAN_EOL scan result, because the scan state must be INITIAL when those
* conditions are returned.
*
* Note that this is unrelated to flushing unread input; that task is
* done by psql_scan_finish().
*/
void
psql_scan_reset(PsqlScanState state)
{
	/* Forget any dollar-quote delimiter saved earlier (free(NULL) is OK) */
	free(state->dolqstart);
	state->dolqstart = NULL;

	/* Back to the top-level start condition with nothing open */
	state->start_state = INITIAL;
	state->paren_depth = 0;
	state->xcdepth = 0;			/* not really necessary */
}
/*
* Return true if lexer is currently in an "inside quotes" state.
*
* This is pretty grotty but is needed to preserve the old behavior
* that mainloop.c drops blank lines not inside quotes without even
* echoing them.
*/
bool
psql_scan_in_quote(PsqlScanState state)
{
	/* Every saved start condition other than INITIAL counts as "in quote" */
	if (state->start_state == INITIAL)
		return false;

	return true;
}
/*
 * Scan the command name of a psql backslash command.  This should be called
 * after psql_scan() returns PSCAN_BACKSLASH.  It is assumed that the input
 * has been consumed through the leading backslash.
 *
 * The return value is a malloc'd copy of the command name, as parsed off
 * from the input.
 */
char *
psql_scan_slash_command(PsqlScanState state)
{
    PQExpBufferData mybuf;

    /* Must be scanning already */
    psql_assert(state->scanbufhandle);

    /* Build a local buffer that we'll return the data of */
    initPQExpBuffer(&mybuf);

    /* Set up static variables that will be used by yylex */
    cur_state = state;
    output_buf = &mybuf;

    /* Resume from the innermost stacked buffer, else the outer scan buffer */
    if (state->buffer_stack != NULL)
        yy_switch_to_buffer(state->buffer_stack->buf);
    else
        yy_switch_to_buffer(state->scanbufhandle);

    BEGIN(xslashcmd);

    /*
     * And lex.  There are no possible errors in this lex state, so the
     * result code is deliberately ignored.
     */
    (void) yylex();

    return mybuf.data;
}
/*
 * Parse off the next argument for a backslash command, and return it as a
 * malloc'd string.  If there are no more arguments, returns NULL.
 *
 * type tells what processing, if any, to perform on the option string;
 * for example, if it's a SQL identifier, we want to downcase any unquoted
 * letters.
 *
 * if quote is not NULL, *quote is set to 0 if no quoting was found, else
 * the quote symbol.
 *
 * if semicolon is true, unquoted trailing semicolon(s) that would otherwise
 * be taken as part of the option string will be stripped.
 *
 * NOTE: the only possible syntax errors for backslash options are unmatched
 * quotes, which are detected when we run out of input.  Therefore, on a
 * syntax error we just throw away the string and return NULL; there is no
 * need to worry about flushing remaining input.
 */
char *
psql_scan_slash_option(PsqlScanState state,
                       enum slash_option_type type,
                       char *quote,
                       bool semicolon)
{
    PQExpBufferData mybuf;
    int         lexresult;
    char        local_quote;
    bool        badarg;

    /* Must be scanning already */
    psql_assert(state->scanbufhandle);

    /* Give the lexer somewhere to report the quote symbol in any case */
    if (quote == NULL)
        quote = &local_quote;
    *quote = 0;

    /* Build a local buffer that we'll return the data of */
    initPQExpBuffer(&mybuf);

    /* Set up static variables that will be used by yylex */
    cur_state = state;
    output_buf = &mybuf;
    option_type = type;
    option_quote = quote;

    /* Resume from the innermost stacked buffer, else the outer scan buffer */
    if (state->buffer_stack != NULL)
        yy_switch_to_buffer(state->buffer_stack->buf);
    else
        yy_switch_to_buffer(state->scanbufhandle);

    if (type == OT_WHOLE_LINE)
        BEGIN(xslashwholeline);
    else
        BEGIN(xslasharg);

    /* And lex. */
    lexresult = yylex();

    /*
     * Check the lex result: we should have gotten back either LEXRES_OK
     * or LEXRES_EOL (the latter indicating end of string).  If we were inside
     * a quoted string, as indicated by YY_START, EOL is an error.
     */
    psql_assert(lexresult == LEXRES_EOL || lexresult == LEXRES_OK);

    badarg = false;
    switch (YY_START)
    {
        case xslasharg:
            /* empty arg, or possibly a psql variable substitution */
            break;
        case xslashquote:
            if (lexresult != LEXRES_OK)
                badarg = true;      /* hit EOL not ending quote */
            break;
        case xslashbackquote:
            if (lexresult != LEXRES_OK)
                badarg = true;      /* hit EOL not ending quote */
            else
            {
                /* Perform evaluation of backticked command */
                char       *cmd = mybuf.data;
                FILE       *fd;
                bool        error = false;
                PQExpBufferData output;
                char        buf[512];
                size_t      result;

                fd = popen(cmd, PG_BINARY_R);
                if (!fd)
                {
                    psql_error("%s: %s\n", cmd, strerror(errno));
                    error = true;
                }

                /* Collect the command's entire output */
                initPQExpBuffer(&output);

                if (!error)
                {
                    do
                    {
                        result = fread(buf, 1, sizeof(buf), fd);
                        if (ferror(fd))
                        {
                            psql_error("%s: %s\n", cmd, strerror(errno));
                            error = true;
                            break;
                        }
                        appendBinaryPQExpBuffer(&output, buf, result);
                    } while (!feof(fd));
                }

                if (fd && pclose(fd) == -1)
                {
                    psql_error("%s: %s\n", cmd, strerror(errno));
                    error = true;
                }

                if (PQExpBufferBroken(&output))
                {
                    psql_error("%s: out of memory\n", cmd);
                    error = true;
                }

                /* Now done with cmd, transfer result to mybuf */
                resetPQExpBuffer(&mybuf);

                if (!error)
                {
                    /* strip any trailing newline */
                    if (output.len > 0 &&
                        output.data[output.len - 1] == '\n')
                        output.len--;
                    appendBinaryPQExpBuffer(&mybuf, output.data, output.len);
                }

                termPQExpBuffer(&output);
            }
            break;
        case xslashdefaultarg:
            /* Strip any trailing semi-colons if requested */
            if (semicolon)
            {
                while (mybuf.len > 0 &&
                       mybuf.data[mybuf.len - 1] == ';')
                {
                    mybuf.data[--mybuf.len] = '\0';
                }
            }

            /*
             * If SQL identifier processing was requested, then we strip out
             * excess double quotes and downcase unquoted letters.
             * Doubled double-quotes become output double-quotes, per spec.
             *
             * Note that a string like FOO"BAR"BAZ will be converted to
             * fooBARbaz; this is somewhat inconsistent with the SQL spec,
             * which would have us parse it as several identifiers.  But
             * for psql's purposes, we want a string like "foo"."bar" to
             * be treated as one option, so there's little choice.
             */
            if (type == OT_SQLID || type == OT_SQLIDHACK)
            {
                bool        inquotes = false;
                char       *cp = mybuf.data;

                while (*cp)
                {
                    if (*cp == '"')
                    {
                        if (inquotes && cp[1] == '"')
                        {
                            /* Keep the first quote, remove the second */
                            cp++;
                        }
                        inquotes = !inquotes;
                        /* Collapse out quote at *cp (moves the NUL too) */
                        memmove(cp, cp + 1, strlen(cp));
                        mybuf.len--;
                        /* do not advance cp */
                    }
                    else
                    {
                        int         mblen;

                        if (!inquotes && type == OT_SQLID)
                            *cp = pg_tolower((unsigned char) *cp);

                        /*
                         * Step over the whole character, but never past the
                         * string terminator: if the string ends in a
                         * truncated multibyte sequence, the length reported
                         * for the lead byte would otherwise carry cp beyond
                         * the end of the buffer.
                         */
                        mblen = PQmblen(cp, pset.encoding);
                        do
                        {
                            cp++;
                        } while (--mblen > 0 && *cp != '\0');
                    }
                }
            }
            break;
        case xslashquotedarg:
            /* must have hit EOL inside double quotes */
            badarg = true;
            break;
        case xslashwholeline:
            /* always okay */
            break;
        default:
            /* can't get here */
            fprintf(stderr, "invalid YY_START\n");
            exit(1);
    }

    if (badarg)
    {
        psql_error("unterminated quoted string\n");
        termPQExpBuffer(&mybuf);
        return NULL;
    }

    /*
     * An unquoted empty argument isn't possible unless we are at end of
     * command.  Return NULL instead.
     */
    if (mybuf.len == 0 && *quote == 0)
    {
        termPQExpBuffer(&mybuf);
        return NULL;
    }

    /* Else return the completed string. */
    return mybuf.data;
}
/*
 * Eat up any unused \\ to complete a backslash command.
 */
void
psql_scan_slash_command_end(PsqlScanState state)
{
    /* Must be scanning already */
    psql_assert(state->scanbufhandle);

    /* Set up static variables that will be used by yylex */
    cur_state = state;
    output_buf = NULL;

    /* Resume from the innermost stacked buffer, else the outer scan buffer */
    if (state->buffer_stack != NULL)
        yy_switch_to_buffer(state->buffer_stack->buf);
    else
        yy_switch_to_buffer(state->scanbufhandle);

    BEGIN(xslashend);

    /*
     * And lex.  There are no possible errors in this lex state, so the
     * result code is deliberately ignored.
     */
    (void) yylex();
}
/*
* Push the given string onto the stack of stuff to scan.
*
* cur_state must point to the active PsqlScanState.
*
* NOTE SIDE EFFECT: the new buffer is made the active flex input buffer.
*/
static void
push_new_buffer(const char *newstr, const char *varname)
{
StackElem *stackelem;
stackelem = (StackElem *) pg_malloc(sizeof(StackElem));
/*
* In current usage, the passed varname points at the current flex
* input buffer; we must copy it before calling prepare_buffer()
* because that will change the buffer state.
*/
stackelem->varname = varname ? pg_strdup(varname) : NULL;
stackelem->buf = prepare_buffer(newstr, strlen(newstr),
&stackelem->bufstring);
cur_state->curline = stackelem->bufstring;
if (cur_state->safe_encoding)
{
stackelem->origstring = NULL;
cur_state->refline = stackelem->bufstring;
}
else
{
stackelem->origstring = pg_strdup(newstr);
cur_state->refline = stackelem->origstring;
}
stackelem->next = cur_state->buffer_stack;
cur_state->buffer_stack = stackelem;
}
/*
 * Remove and free the topmost buffer stack element; the stack must not be
 * empty.
 *
 * NB: the flex input state is unspecified afterwards, so the caller has to
 * switch to an appropriate buffer before lexing any further.
 */
static void
pop_buffer_stack(PsqlScanState state)
{
    StackElem  *top = state->buffer_stack;

    /* Unlink first, then tear the element down. */
    state->buffer_stack = top->next;

    yy_delete_buffer(top->buf);

    if (top->varname != NULL)
        free(top->varname);
    if (top->origstring != NULL)
        free(top->origstring);
    free(top->bufstring);
    free(top);
}
/*
 * Report whether the named variable is the source of any string that is
 * currently on the scan buffer stack.
 */
static bool
var_is_current_source(PsqlScanState state, const char *varname)
{
    StackElem  *entry = state->buffer_stack;

    while (entry != NULL)
    {
        /* Elements with no variable name can never match. */
        if (entry->varname != NULL &&
            strcmp(entry->varname, varname) == 0)
            return true;

        entry = entry->next;
    }

    return false;
}
/*
 * Set up a flex input buffer to scan the given data.  We always make a
 * copy of the data.  If working in an unsafe encoding, the copy has
 * multibyte sequences replaced by FFs to avoid fooling the lexer rules.
 *
 * txt/len give the data to scan; *txtcopy receives the malloc'd copy that
 * the returned flex buffer refers to.
 *
 * cur_state must point to the active PsqlScanState.
 *
 * NOTE SIDE EFFECT: the new buffer is made the active flex input buffer.
 */
static YY_BUFFER_STATE
prepare_buffer(const char *txt, int len, char **txtcopy)
{
    char       *newtxt;

    /* Flex wants two \0 characters after the actual data */
    newtxt = pg_malloc(len + 2);
    *txtcopy = newtxt;
    newtxt[len] = newtxt[len + 1] = YY_END_OF_BUFFER_CHAR;

    if (cur_state->safe_encoding)
        memcpy(newtxt, txt, len);
    else
    {
        /* Gotta do it the hard way */
        int         i = 0;

        while (i < len)
        {
            int         thislen = PQmblen(txt + i, cur_state->encoding);

            /* first byte should always be okay... */
            newtxt[i] = txt[i];
            i++;

            /*
             * Replace the remaining bytes of the character with FF, but stop
             * at the end of the data: if the input ends in a truncated
             * multibyte sequence, the reported length would otherwise make
             * us overwrite the two terminators and possibly run past the
             * allocation.
             */
            while (--thislen > 0 && i < len)
                newtxt[i++] = (char) 0xFF;
        }
    }

    return yy_scan_buffer(newtxt, len + 2);
}
/*
 * emit() --- body for ECHO macro
 *
 * NB: use this for ALL of the text copied from the flex input data, and
 * ONLY for that.  Passing anything that is not part of the yytext string is
 * a mistake.  Text generated internally can simply be appended to
 * output_buf directly.
 */
static void
emit(const char *txt, int len)
{
    const char *orig;
    int         pos;

    /* In a safe encoding the scanned text is already the real text. */
    if (cur_state->safe_encoding)
    {
        appendBinaryPQExpBuffer(output_buf, txt, len);
        return;
    }

    /*
     * Otherwise find the same offset in the reference line, and take each
     * FF byte from there instead of from the scanned text.
     */
    orig = cur_state->refline + (txt - cur_state->curline);

    for (pos = 0; pos < len; pos++)
    {
        char        out = txt[pos];

        if (out == (char) 0xFF)
            out = orig[pos];
        appendPQExpBufferChar(output_buf, out);
    }
}
/*
 * Append the escaped value of the variable named by the current token to
 * output_buf, escaping it as an identifier if as_ident is true and as a
 * literal otherwise.
 *
 * The variable name is taken to be yytext without its first two characters
 * and its last one (presumably the colon and surrounding quote marks ---
 * confirm against the lexer rules that call this).
 *
 * If the variable is not set, there is no database connection, or escaping
 * fails, the original token text is emitted unchanged instead.
 */
static void
escape_variable(bool as_ident)
{
    char        last_char = yytext[yyleng - 1];
    const char *value;

    /* Temporarily cut off the final character so the name is terminated. */
    yytext[yyleng - 1] = '\0';
    value = GetVariable(pset.vars, yytext + 2);

    if (value != NULL)
    {
        if (pset.db == NULL)
            psql_error("can't escape without active connection\n");
        else
        {
            char       *quoted;

            quoted = as_ident
                ? PQescapeIdentifier(pset.db, value, strlen(value))
                : PQescapeLiteral(pset.db, value, strlen(value));

            if (quoted != NULL)
            {
                /* Success: hand over the escaped text and we're done. */
                appendPQExpBufferStr(output_buf, quoted);
                PQfreemem(quoted);
                return;
            }

            psql_error("%s", PQerrorMessage(pset.db));
        }
    }

    /*
     * Getting here means some kind of error occurred.  Put the token back
     * the way it was and copy it to the output as-is.
     */
    yytext[yyleng - 1] = last_char;
    emit(yytext, yyleng);
}