* scan.l
* lexical scanner for PostgreSQL
*
- * XXX The rules in this file must be kept in sync with psql's lexer!!!
+ * NOTE NOTE NOTE:
+ *
+ * The rules in this file must be kept in sync with psql's lexer!!!
+ *
+ * The rules are designed so that the scanner never has to backtrack,
+ * in the sense that there is always a rule that can match the input
+ * consumed so far (the rule action may internally throw back some input
+ * with yyless(), however). As explained in the flex manual, this makes
+ * for a useful speed increase --- about a third faster than a plain -CF
+ * lexer, in simple testing. The extra complexity is mostly in the rules
+ * for handling float numbers and continued string literals. If you change
+ * the lexical rules, verify that you haven't broken the no-backtrack
+ * property by running flex with the "-b" option and checking that the
+ * resulting "lex.backup" file says that no backing up is needed.
+ *
*
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.121 2005/03/11 19:13:42 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.122 2005/05/26 01:24:29 tgl Exp $
*
*-------------------------------------------------------------------------
*/
horiz_whitespace ({horiz_space}|{comment})
whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
+/*
+ * To ensure that {quotecontinue} can be scanned without having to back up
+ * if the full pattern isn't matched, we include trailing whitespace in
+ * {quotestop}. This matches all cases where {quotecontinue} fails to match,
+ * except for {quote} followed by whitespace and just one "-" (not two,
+ * which would start a {comment}). To cover that we have {quotefail}.
+ * The actions for {quotestop} and {quotefail} must throw back (with
+ * yyless(1)) every character beyond the quote proper.
+ */
+quote '
+quotestop {quote}{whitespace}*
+quotecontinue {quote}{whitespace_with_newline}{quote}
+quotefail {quote}{whitespace}*"-"
+
/* Bit string
* It is tempting to scan the string for only those characters
* which are allowed. However, this leads to silently swallowed
* validate the contents.
*/
xbstart [bB]{quote}
-xbstop {quote}
xbinside [^']*
-xbcat {quote}{whitespace_with_newline}{quote}
/* Hexadecimal number
*/
xhstart [xX]{quote}
-xhstop {quote}
xhinside [^']*
-xhcat {quote}{whitespace_with_newline}{quote}
/* National character
*/
/* Extended quote
* xqdouble implements embedded quote
- * xqcat allows strings to cross input lines
*/
-quote '
xqstart {quote}
-xqstop {quote}
xqdouble {quote}{quote}
xqinside [^\\']+
xqescape [\\][^0-7]
xqoctesc [\\][0-7]{1,3}
-xqcat {quote}{whitespace_with_newline}{quote}
/* $foo$ style quotes ("dollar quoting")
* The quoted string starts with $foo$ where "foo" is an optional string
* in the form of an identifier, except that it may not contain "$",
* and extends to the first occurrence of an identical string.
* There is *no* processing of the quoted text.
+ *
+ * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
+ * fails to match its trailing "$".
*/
dolq_start [A-Za-z\200-\377_]
dolq_cont [A-Za-z\200-\377_0-9]
dolqdelim \$({dolq_start}{dolq_cont}*)?\$
+dolqfailed \${dolq_start}{dolq_cont}*
dolqinside [^$]+
/* Double quote
/* we no longer allow unary minus in numbers.
* instead we pass it separately to parser. there it gets
- * coerced via doNegate() -- Leon aug 20 1999
+ * coerced via doNegate() -- Leon aug 20 1999
+ *
+ * {realfail1} and {realfail2} are added to prevent the need for scanner
+ * backup when the {real} rule fails to match completely.
*/
integer {digit}+
decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*))
-real ((({digit}*\.{digit}+)|({digit}+\.{digit}*)|({digit}+))([Ee][-+]?{digit}+))
+real ({integer}|{decimal})[Ee][-+]?{digit}+
+realfail1 ({integer}|{decimal})[Ee]
+realfail2 ({integer}|{decimal})[Ee][-+]
param \${integer}
/* ignore */
}
+\*+ {
+ /* ignore */
+ }
+
<> { yyerror("unterminated /* comment"); }
{xbstart} {
startlit();
addlitchar('b');
}
-{xbstop} {
+{quotestop} |
+{quotefail} {
+ yyless(1);
BEGIN(INITIAL);
yylval.str = litbufdup();
return BCONST;
{xbinside} {
addlit(yytext, yyleng);
}
-{xhcat} |
-{xbcat} {
+{quotecontinue} |
+{quotecontinue} {
/* ignore */
}
<> { yyerror("unterminated bit string literal"); }
startlit();
addlitchar('x');
}
-{xhstop} {
+{quotestop} |
+{quotefail} {
+ yyless(1);
BEGIN(INITIAL);
yylval.str = litbufdup();
return XCONST;
*/
const ScanKeyword *keyword;
- /* This had better be a keyword! */
+ yyless(1); /* eat only 'n' this time */
+ /* nchar had better be a keyword! */
keyword = ScanKeywordLookup("nchar");
Assert(keyword != NULL);
yylval.keyword = keyword->name;
- token_start = yytext;
- BEGIN(xq);
- startlit();
return keyword->value;
}
BEGIN(xq);
startlit();
}
-{xqstop} {
+{quotestop} |
+{quotefail} {
+ yyless(1);
BEGIN(INITIAL);
yylval.str = litbufdup();
return SCONST;
unsigned char c = strtoul(yytext+1, NULL, 8);
addlitchar(c);
}
-{xqcat} {
+{quotecontinue} {
/* ignore */
}
. {
BEGIN(xdolq);
startlit();
}
+{dolqfailed} {
+ /* throw back all but the initial "$" */
+ yyless(1);
+ /* and treat it as {other} */
+ return yytext[0];
+ }
{dolqdelim} {
if (strcmp(yytext, dolqstart) == 0)
{
{dolqinside} {
addlit(yytext, yyleng);
}
+{dolqfailed} {
+ /*
+  * {dolqfailed} is "$" plus identifier characters with no closing
+  * "$", so it is not a delimiter: append the matched text to the
+  * literal being collected.
+  */
+ addlit(yytext, yyleng);
+ }
. {
/* This is only needed for $ inside the quoted text */
addlitchar(yytext[0]);
yylval.str = pstrdup(yytext);
return FCONST;
}
+{realfail1} {
+ /*
+ * throw back the [Ee], and treat as {decimal}. Note
+ * that it is possible the input is actually {integer},
+ * but since this case will almost certainly lead to a
+ * syntax error anyway, we don't bother to distinguish.
+ */
+ yyless(yyleng-1);
+ yylval.str = pstrdup(yytext);
+ return FCONST;
+ }
+{realfail2} {
+ /* throw back the [Ee][+-], and proceed as above */
+ yyless(yyleng-2);
+ yylval.str = pstrdup(yytext);
+ return FCONST;
+ }
{identifier} {
* are (except for a few) the same as the backend's, but their actions are
* just ECHO whereas the backend's actions generally do other things.
*
- * XXX The rules in this file must be kept in sync with the main parser!!!
+ * XXX The rules in this file must be kept in sync with the backend lexer!!!
+ *
+ * XXX Avoid creating backtracking cases --- see the backend lexer for info.
*
* The most difficult aspect of this code is that we need to work in multibyte
* encodings that are not ASCII-safe. A "safe" encoding is one in which each
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.9 2004/12/31 22:03:15 pgsql Exp $
+ * $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.10 2005/05/26 01:24:29 tgl Exp $
*
*-------------------------------------------------------------------------
*/
horiz_whitespace ({horiz_space}|{comment})
whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
+/*
+ * To ensure that {quotecontinue} can be scanned without having to back up
+ * if the full pattern isn't matched, we include trailing whitespace in
+ * {quotestop}. This matches all cases where {quotecontinue} fails to match,
+ * except for {quote} followed by whitespace and just one "-" (not two,
+ * which would start a {comment}). To cover that we have {quotefail}.
+ * The actions for {quotestop} and {quotefail} must throw back (with
+ * yyless(1)) every character beyond the quote proper.
+ */
+quote '
+quotestop {quote}{whitespace}*
+quotecontinue {quote}{whitespace_with_newline}{quote}
+quotefail {quote}{whitespace}*"-"
+
/* Bit string
* It is tempting to scan the string for only those characters
* which are allowed. However, this leads to silently swallowed
* validate the contents.
*/
xbstart [bB]{quote}
-xbstop {quote}
xbinside [^']*
-xbcat {quote}{whitespace_with_newline}{quote}
/* Hexadecimal number
*/
xhstart [xX]{quote}
-xhstop {quote}
xhinside [^']*
-xhcat {quote}{whitespace_with_newline}{quote}
/* National character
*/
/* Extended quote
* xqdouble implements embedded quote
- * xqcat allows strings to cross input lines
*/
-quote '
xqstart {quote}
-xqstop {quote}
xqdouble {quote}{quote}
xqinside [^\\']+
xqescape [\\][^0-7]
xqoctesc [\\][0-7]{1,3}
-xqcat {quote}{whitespace_with_newline}{quote}
/* $foo$ style quotes ("dollar quoting")
* The quoted string starts with $foo$ where "foo" is an optional string
* in the form of an identifier, except that it may not contain "$",
* and extends to the first occurrence of an identical string.
* There is *no* processing of the quoted text.
+ *
+ * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
+ * fails to match its trailing "$".
*/
dolq_start [A-Za-z\200-\377_]
dolq_cont [A-Za-z\200-\377_0-9]
dolqdelim \$({dolq_start}{dolq_cont}*)?\$
+dolqfailed \${dolq_start}{dolq_cont}*
dolqinside [^$]+
/* Double quote
/* we no longer allow unary minus in numbers.
* instead we pass it separately to parser. there it gets
- * coerced via doNegate() -- Leon aug 20 1999
+ * coerced via doNegate() -- Leon aug 20 1999
+ *
+ * {realfail1} and {realfail2} are added to prevent the need for scanner
+ * backup when the {real} rule fails to match completely.
*/
integer {digit}+
decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*))
-real ((({digit}*\.{digit}+)|({digit}+\.{digit}*)|({digit}+))([Ee][-+]?{digit}+))
+real ({integer}|{decimal})[Ee][-+]?{digit}+
+realfail1 ({integer}|{decimal})[Ee]
+realfail2 ({integer}|{decimal})[Ee][-+]
param \${integer}
ECHO;
}
+\*+ {
+ ECHO;
+ }
+
{xbstart} {
BEGIN(xb);
ECHO;
}
-{xbstop} {
+{quotestop} |
+{quotefail} {
+ yyless(1);
BEGIN(INITIAL);
ECHO;
}
{xbinside} {
ECHO;
}
-{xhcat} |
-{xbcat} {
+{quotecontinue} |
+{quotecontinue} {
ECHO;
}
BEGIN(xh);
ECHO;
}
-{xhstop} {
+{quotestop} |
+{quotefail} {
+ yyless(1);
BEGIN(INITIAL);
ECHO;
}
{xnstart} {
- BEGIN(xq);
+ yyless(1); /* eat only 'n' this time */
ECHO;
}
BEGIN(xq);
ECHO;
}
-{xqstop} {
+{quotestop} |
+{quotefail} {
+ yyless(1);
BEGIN(INITIAL);
ECHO;
}
{xqoctesc} {
ECHO;
}
-{xqcat} {
+{quotecontinue} {
ECHO;
}
. {
BEGIN(xdolq);
ECHO;
}
+{dolqfailed} {
+ /* throw back all but the initial "$" */
+ yyless(1);
+ ECHO;
+ }
{dolqdelim} {
if (strcmp(yytext, cur_state->dolqstart) == 0)
{
{dolqinside} {
ECHO;
}
+{dolqfailed} {
+ /*
+  * {dolqfailed} is "$" plus identifier characters with no closing
+  * "$", so it is not a delimiter: echo the matched text unchanged.
+  */
+ ECHO;
+ }
. {
/* This is only needed for $ inside the quoted text */
ECHO;
{real} {
ECHO;
}
+{realfail1} {
+ /*
+ * throw back the [Ee], and treat as {decimal}. Note
+ * that it is possible the input is actually {integer},
+ * but since this case will almost certainly lead to a
+ * syntax error anyway, we don't bother to distinguish.
+ */
+ yyless(yyleng-1);
+ ECHO;
+ }
+{realfail2} {
+ /* throw back the [Ee][+-], and proceed as above */
+ yyless(yyleng-2);
+ ECHO;
+ }
{identifier} {
(char) strtol(yytext + 1, NULL, 0));
}
+"\\"0[xX] {
+ /*
+  * failed hex case: presumably reached when "\0x" is not
+  * followed by a hex digit.  Keep only the backslash and the
+  * "0" (yyless(2) throws back the x), then emit the character
+  * whose code strtol() parses from that remaining "0".
+  */
+ yyless(2);
+ appendPQExpBufferChar(output_buf,
+ (char) strtol(yytext + 1, NULL, 0));
+ }
+
"\\". { emit(yytext + 1, 1); }
{other}|\n { ECHO; }