Tweak the backend scanner (and psqlscan.l, which must track the backend scanner's rules) so that the scanner never has to backtrack.

author Tom Lane

Thu, 26 May 2005 01:24:29 +0000 (01:24 +0000)

committer Tom Lane

Thu, 26 May 2005 01:24:29 +0000 (01:24 +0000)
author Tom Lane
Thu, 26 May 2005 01:24:29 +0000 (01:24 +0000)
committer Tom Lane
Thu, 26 May 2005 01:24:29 +0000 (01:24 +0000)
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l

index a0635463bb632623c31d8ba2b0868c8d96497da8..ef5c1a639f1029b8e9709a4eedd8e83d048a41d7 100644 (file)
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -4,13 +4,27 @@
   * scan.l
   *   lexical scanner for PostgreSQL
   *
- * XXX The rules in this file must be kept in sync with psql's lexer!!!
+ * NOTE NOTE NOTE:
+ *
+ * The rules in this file must be kept in sync with psql's lexer!!!
+ *
+ * The rules are designed so that the scanner never has to backtrack,
+ * in the sense that there is always a rule that can match the input
+ * consumed so far (the rule action may internally throw back some input
+ * with yyless(), however).  As explained in the flex manual, this makes
+ * for a useful speed increase --- about a third faster than a plain -CF
+ * lexer, in simple testing.  The extra complexity is mostly in the rules
+ * for handling float numbers and continued string literals.  If you change
+ * the lexical rules, verify that you haven't broken the no-backtrack
+ * property by running flex with the "-b" option and checking that the
+ * resulting "lex.backup" file says that no backing up is needed.
+ *
   *
   * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.121 2005/03/11 19:13:42 momjian Exp $
+ *   $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.122 2005/05/26 01:24:29 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -138,6 +152,20 @@ special_whitespace     ({space}+|{comment}{newline})
  horiz_whitespace       ({horiz_space}|{comment})
  whitespace_with_newline    ({horiz_whitespace}*{newline}{special_whitespace}*)
  
+/*
+ * To ensure that {quotecontinue} can be scanned without having to back up
+ * if the full pattern isn't matched, we include trailing whitespace in
+ * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
+ * except for {quote} followed by whitespace and just one "-" (not two,
+ * which would start a {comment}).  To cover that we have {quotefail}.
+ * The actions for {quotestop} and {quotefail} must throw back characters
+ * beyond the quote proper.
+ */
+quote          '
+quotestop      {quote}{whitespace}*
+quotecontinue  {quote}{whitespace_with_newline}{quote}
+quotefail      {quote}{whitespace}*"-"
+
  /* Bit string
   * It is tempting to scan the string for only those characters
   * which are allowed. However, this leads to silently swallowed
@@ -148,16 +176,12 @@ whitespace_with_newline   ({horiz_whitespace}*{newline}{special_whitespace}*)
   * validate the contents.
   */
  xbstart            [bB]{quote}
-xbstop         {quote}
  xbinside       [^']*
-xbcat          {quote}{whitespace_with_newline}{quote}
  
  /* Hexadecimal number
   */
  xhstart            [xX]{quote}
-xhstop         {quote}
  xhinside       [^']*
-xhcat          {quote}{whitespace_with_newline}{quote}
  
  /* National character
   */
@@ -165,26 +189,26 @@ xnstart           [nN]{quote}
  
  /* Extended quote
   * xqdouble implements embedded quote
- * xqcat allows strings to cross input lines
   */
-quote          '
  xqstart            {quote}
-xqstop         {quote}
  xqdouble       {quote}{quote}
  xqinside       [^\\']+
  xqescape       [\\][^0-7]
  xqoctesc       [\\][0-7]{1,3}
-xqcat          {quote}{whitespace_with_newline}{quote}
  
  /* $foo$ style quotes ("dollar quoting")
   * The quoted string starts with $foo$ where "foo" is an optional string
   * in the form of an identifier, except that it may not contain "$", 
   * and extends to the first occurrence of an identical string.  
   * There is *no* processing of the quoted text.
+ *
+ * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
+ * fails to match its trailing "$".
   */
  dolq_start     [A-Za-z\200-\377_]
  dolq_cont      [A-Za-z\200-\377_0-9]
  dolqdelim      \$({dolq_start}{dolq_cont}*)?\$
+dolqfailed     \${dolq_start}{dolq_cont}*
  dolqinside     [^$]+
  
  /* Double quote
@@ -242,12 +266,17 @@ operator      {op_chars}+
  
  /* we no longer allow unary minus in numbers. 
   * instead we pass it separately to parser. there it gets
- * coerced via doNegate() -- Leon aug 20 1999 
+ * coerced via doNegate() -- Leon aug 20 1999
+ *
+ * {realfail1} and {realfail2} are added to prevent the need for scanner
+ * backup when the {real} rule fails to match completely.
   */
  
  integer            {digit}+
  decimal            (({digit}*\.{digit}+)|({digit}+\.{digit}*))
-real           ((({digit}*\.{digit}+)|({digit}+\.{digit}*)|({digit}+))([Ee][-+]?{digit}+))
+real           ({integer}|{decimal})[Ee][-+]?{digit}+
+realfail1      ({integer}|{decimal})[Ee]
+realfail2      ({integer}|{decimal})[Ee][-+]
  
  param          \${integer}
  
@@ -310,6 +339,10 @@ other          .
                     /* ignore */
                 }
  
+<xc>\*+            {
+                   /* ignore */
+               }
+
  <xc><<EOF>>        { yyerror("unterminated /* comment"); }
  
  {xbstart}      {
@@ -324,7 +357,9 @@ other           .
                     startlit();
                     addlitchar('b');
                 }
-<xb>{xbstop}   {
+<xb>{quotestop}    |
+<xb>{quotefail} {
+                   yyless(1);
                     BEGIN(INITIAL);
                     yylval.str = litbufdup();
                     return BCONST;
@@ -333,8 +368,8 @@ other           .
  <xb>{xbinside} {
                     addlit(yytext, yyleng);
                 }
-<xh>{xhcat}        |
-<xb>{xbcat}        {
+<xh>{quotecontinue}    |
+<xb>{quotecontinue}    {
                     /* ignore */
                 }
  <xb><<EOF>>        { yyerror("unterminated bit string literal"); }
@@ -351,7 +386,9 @@ other           .
                     startlit();
                     addlitchar('x');
                 }
-<xh>{xhstop}   {
+<xh>{quotestop}    |
+<xh>{quotefail} {
+                   yyless(1);
                     BEGIN(INITIAL);
                     yylval.str = litbufdup();
                     return XCONST;
@@ -365,13 +402,11 @@ other         .
                      */
                     const ScanKeyword *keyword;
  
-                   /* This had better be a keyword! */
+                   yyless(1);              /* eat only 'n' this time */
+                   /* nchar had better be a keyword! */
                     keyword = ScanKeywordLookup("nchar");
                     Assert(keyword != NULL);
                     yylval.keyword = keyword->name;
-                   token_start = yytext;
-                   BEGIN(xq);
-                   startlit();
                     return keyword->value;
                 }
  
@@ -380,7 +415,9 @@ other           .
                     BEGIN(xq);
                     startlit();
                 }
-<xq>{xqstop}   {
+<xq>{quotestop}    |
+<xq>{quotefail} {
+                   yyless(1);
                     BEGIN(INITIAL);
                     yylval.str = litbufdup();
                     return SCONST;
@@ -398,7 +435,7 @@ other           .
                     unsigned char c = strtoul(yytext+1, NULL, 8);
                     addlitchar(c);
                 }
-<xq>{xqcat}        {
+<xq>{quotecontinue} {
                     /* ignore */
                 }
  <xq>.          {
@@ -413,6 +450,12 @@ other          .
                     BEGIN(xdolq);
                     startlit();
                 }
+{dolqfailed}   {
+                   /* throw back all but the initial "$" */
+                   yyless(1);
+                   /* and treat it as {other} */
+                   return yytext[0];
+               }
  <xdolq>{dolqdelim} {
                     if (strcmp(yytext, dolqstart) == 0)
                     {
@@ -435,6 +478,9 @@ other           .
  <xdolq>{dolqinside} {
                     addlit(yytext, yyleng);
                 }
+<xdolq>{dolqfailed} {
+                   addlit(yytext, yyleng);
+               }
  <xdolq>.       {
                     /* This is only needed for $ inside the quoted text */
                     addlitchar(yytext[0]);
@@ -576,6 +622,23 @@ other          .
                     yylval.str = pstrdup(yytext);
                     return FCONST;
                 }
+{realfail1}        {
+                   /*
+                    * throw back the [Ee], and treat as {decimal}.  Note
+                    * that it is possible the input is actually {integer},
+                    * but since this case will almost certainly lead to a
+                    * syntax error anyway, we don't bother to distinguish.
+                    */
+                   yyless(yyleng-1);
+                   yylval.str = pstrdup(yytext);
+                   return FCONST;
+               }
+{realfail2}        {
+                   /* throw back the [Ee][+-], and proceed as above */
+                   yyless(yyleng-2);
+                   yylval.str = pstrdup(yytext);
+                   return FCONST;
+               }
  
  
  {identifier}   {
diff --git a/src/bin/psql/psqlscan.l b/src/bin/psql/psqlscan.l

index 147d77872d8a6ee0f2ce018f6519b14796c652ef..88763d504bceac7e8d97e762155831500be5589d 100644 (file)
--- a/src/bin/psql/psqlscan.l
+++ b/src/bin/psql/psqlscan.l
@@ -11,7 +11,9 @@
   * are (except for a few) the same as the backend's, but their actions are
   * just ECHO whereas the backend's actions generally do other things.
   *
- * XXX The rules in this file must be kept in sync with the main parser!!!
+ * XXX The rules in this file must be kept in sync with the backend lexer!!!
+ *
+ * XXX Avoid creating backtracking cases --- see the backend lexer for info.
   *
   * The most difficult aspect of this code is that we need to work in multibyte
   * encodings that are not ASCII-safe.  A "safe" encoding is one in which each
@@ -31,7 +33,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.9 2004/12/31 22:03:15 pgsql Exp $
+ *   $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.10 2005/05/26 01:24:29 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -207,6 +209,20 @@ special_whitespace     ({space}+|{comment}{newline})
  horiz_whitespace       ({horiz_space}|{comment})
  whitespace_with_newline    ({horiz_whitespace}*{newline}{special_whitespace}*)
  
+/*
+ * To ensure that {quotecontinue} can be scanned without having to back up
+ * if the full pattern isn't matched, we include trailing whitespace in
+ * {quotestop}.  This matches all cases where {quotecontinue} fails to match,
+ * except for {quote} followed by whitespace and just one "-" (not two,
+ * which would start a {comment}).  To cover that we have {quotefail}.
+ * The actions for {quotestop} and {quotefail} must throw back characters
+ * beyond the quote proper.
+ */
+quote          '
+quotestop      {quote}{whitespace}*
+quotecontinue  {quote}{whitespace_with_newline}{quote}
+quotefail      {quote}{whitespace}*"-"
+
  /* Bit string
   * It is tempting to scan the string for only those characters
   * which are allowed. However, this leads to silently swallowed
@@ -217,16 +233,12 @@ whitespace_with_newline   ({horiz_whitespace}*{newline}{special_whitespace}*)
   * validate the contents.
   */
  xbstart            [bB]{quote}
-xbstop         {quote}
  xbinside       [^']*
-xbcat          {quote}{whitespace_with_newline}{quote}
  
  /* Hexadecimal number
   */
  xhstart            [xX]{quote}
-xhstop         {quote}
  xhinside       [^']*
-xhcat          {quote}{whitespace_with_newline}{quote}
  
  /* National character
   */
@@ -234,26 +246,26 @@ xnstart           [nN]{quote}
  
  /* Extended quote
   * xqdouble implements embedded quote
- * xqcat allows strings to cross input lines
   */
-quote          '
  xqstart            {quote}
-xqstop         {quote}
  xqdouble       {quote}{quote}
  xqinside       [^\\']+
  xqescape       [\\][^0-7]
  xqoctesc       [\\][0-7]{1,3}
-xqcat          {quote}{whitespace_with_newline}{quote}
  
  /* $foo$ style quotes ("dollar quoting")
   * The quoted string starts with $foo$ where "foo" is an optional string
   * in the form of an identifier, except that it may not contain "$", 
   * and extends to the first occurrence of an identical string.  
   * There is *no* processing of the quoted text.
+ *
+ * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
+ * fails to match its trailing "$".
   */
  dolq_start     [A-Za-z\200-\377_]
  dolq_cont      [A-Za-z\200-\377_0-9]
  dolqdelim      \$({dolq_start}{dolq_cont}*)?\$
+dolqfailed     \${dolq_start}{dolq_cont}*
  dolqinside     [^$]+
  
  /* Double quote
@@ -311,12 +323,17 @@ operator      {op_chars}+
  
  /* we no longer allow unary minus in numbers. 
   * instead we pass it separately to parser. there it gets
- * coerced via doNegate() -- Leon aug 20 1999 
+ * coerced via doNegate() -- Leon aug 20 1999
+ *
+ * {realfail1} and {realfail2} are added to prevent the need for scanner
+ * backup when the {real} rule fails to match completely.
   */
  
  integer            {digit}+
  decimal            (({digit}*\.{digit}+)|({digit}+\.{digit}*))
-real           ((({digit}*\.{digit}+)|({digit}+\.{digit}*)|({digit}+))([Ee][-+]?{digit}+))
+real           ({integer}|{decimal})[Ee][-+]?{digit}+
+realfail1      ({integer}|{decimal})[Ee]
+realfail2      ({integer}|{decimal})[Ee][-+]
  
  param          \${integer}
  
@@ -383,11 +400,17 @@ other         .
                     ECHO;
                 }
  
+<xc>\*+            {
+                   ECHO;
+               }
+
  {xbstart}      {
                     BEGIN(xb);
                     ECHO;
                 }
-<xb>{xbstop}   {
+<xb>{quotestop}    |
+<xb>{quotefail} {
+                   yyless(1);
                     BEGIN(INITIAL);
                     ECHO;
                 }
@@ -395,8 +418,8 @@ other           .
  <xb>{xbinside} {
                     ECHO;
                 }
-<xh>{xhcat}        |
-<xb>{xbcat}        {
+<xh>{quotecontinue}    |
+<xb>{quotecontinue}    {
                     ECHO;
                 }
  
@@ -410,13 +433,15 @@ other         .
                     BEGIN(xh);
                     ECHO;
                 }
-<xh>{xhstop}   {
+<xh>{quotestop}    |
+<xh>{quotefail} {
+                   yyless(1);
                     BEGIN(INITIAL);
                     ECHO;
                 }
  
  {xnstart}      {
-                   BEGIN(xq);
+                   yyless(1);              /* eat only 'n' this time */
                     ECHO;
                 }
  
@@ -424,7 +449,9 @@ other           .
                     BEGIN(xq);
                     ECHO;
                 }
-<xq>{xqstop}   {
+<xq>{quotestop}    |
+<xq>{quotefail} {
+                   yyless(1);
                     BEGIN(INITIAL);
                     ECHO;
                 }
@@ -440,7 +467,7 @@ other           .
  <xq>{xqoctesc}  {
                     ECHO;
                 }
-<xq>{xqcat}        {
+<xq>{quotecontinue} {
                     ECHO;
                 }
  <xq>.          {
@@ -453,6 +480,11 @@ other          .
                     BEGIN(xdolq);
                     ECHO;
                 }
+{dolqfailed}   {
+                   /* throw back all but the initial "$" */
+                   yyless(1);
+                   ECHO;
+               }
  <xdolq>{dolqdelim} {
                     if (strcmp(yytext, cur_state->dolqstart) == 0)
                     {
@@ -474,6 +506,9 @@ other           .
  <xdolq>{dolqinside} {
                     ECHO;
                 }
+<xdolq>{dolqfailed} {
+                   ECHO;
+               }
  <xdolq>.       {
                     /* This is only needed for $ inside the quoted text */
                     ECHO;
@@ -636,6 +671,21 @@ other          .
  {real}         {
                     ECHO;
                 }
+{realfail1}        {
+                   /*
+                    * throw back the [Ee], and treat as {decimal}.  Note
+                    * that it is possible the input is actually {integer},
+                    * but since this case will almost certainly lead to a
+                    * syntax error anyway, we don't bother to distinguish.
+                    */
+                   yyless(yyleng-1);
+                   ECHO;
+               }
+{realfail2}        {
+                   /* throw back the [Ee][+-], and proceed as above */
+                   yyless(yyleng-2);
+                   ECHO;
+               }
  
  
  {identifier}   {
@@ -817,6 +867,13 @@ other          .
                                           (char) strtol(yytext + 1, NULL, 0));
                 }
  
+"\\"0[xX]  {
+                   /* failed hex case */
+                   yyless(2);
+                   appendPQExpBufferChar(output_buf,
+                                         (char) strtol(yytext + 1, NULL, 0));
+               }
+
  "\\".          { emit(yytext + 1, 1); }
  
  {other}|\n     { ECHO; }
author	Tom Lane
	Thu, 26 May 2005 01:24:29 +0000 (01:24 +0000)
committer	Tom Lane
	Thu, 26 May 2005 01:24:29 +0000 (01:24 +0000)
src/backend/parser/scan.l		patch \| blob \| blame \| history
src/bin/psql/psqlscan.l		patch \| blob \| blame \| history