Change the way UESCAPE is lexed, to reduce the size of the flex tables.
author: Heikki Linnakangas
Thu, 14 Mar 2013 17:00:09 +0000 (19:00 +0200)
committer: Heikki Linnakangas
Thu, 14 Mar 2013 17:04:43 +0000 (19:04 +0200)
The error rule used to avoid backtracking with the U&'...' UESCAPE 'x'
syntax bloated the flex tables, so refactor that. This patch makes the error
rule shorter, by introducing a new exclusive flex state that's entered after
parsing U&'...'. This shrinks the postgres binary by about 220kB.

src/backend/parser/scan.l

index 23c83c4fd9030dc46a8bae130df248c266e08b83..92f38a2a07ae2fcc748d69c08a0582c328180af0 100644 (file)
@@ -97,6 +97,7 @@ static bool is_utf16_surrogate_first(pg_wchar c);
 static bool is_utf16_surrogate_second(pg_wchar c);
 static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
 static void addunicode(pg_wchar c, yyscan_t yyscanner);
+static bool check_uescapechar(unsigned char escape);
 
 #define yyerror(msg)  scanner_yyerror(msg, yyscanner)
 
@@ -150,7 +151,9 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner);
  *   extended quoted strings (support backslash escape sequences)
  *   $foo$ quoted strings
  *   quoted identifier with Unicode escapes
+ *   end of a quoted identifier with Unicode escapes, UESCAPE can follow
  *   quoted string with Unicode escapes
+ *   end of a quoted string with Unicode escapes, UESCAPE can follow
  *   Unicode surrogate pair in extended quoted string
  */
 
@@ -162,7 +165,9 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner);
 %x xq
 %x xdolq
 %x xui
+%x xuiend
 %x xus
+%x xusend
 %x xeu
 
 /*
@@ -279,17 +284,17 @@ xdinside      [^"]+
 /* Unicode escapes */
 uescape            [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
 /* error rule to avoid backup */
-uescapefail        ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU])
+uescapefail        [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
 
 /* Quoted identifier with Unicode escapes */
 xuistart       [uU]&{dquote}
-xuistop1       {dquote}{whitespace}*{uescapefail}?
-xuistop2       {dquote}{whitespace}*{uescape}
 
 /* Quoted string with Unicode escapes */
 xusstart       [uU]&{quote}
-xusstop1       {quote}{whitespace}*{uescapefail}?
-xusstop2       {quote}{whitespace}*{uescape}
+
+/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
+xustop1        {uescapefail}?
+xustop2        {uescape}
 
 /* error rule to avoid backup */
 xufailed       [uU]&
@@ -536,15 +541,31 @@ other         .
                    yylval->str = litbufdup(yyscanner);
                    return SCONST;
                }
-<xus>{xusstop1} {
+<xus>{quotestop} |
+<xus>{quotefail} {
                    /* throw back all but the quote */
                    yyless(1);
+                   /* handle possible UESCAPE in xusend mode */
+                   BEGIN(xusend);
+               }
+<xusend>{whitespace}
+<xusend>{other} |
+<xusend>{xustop1} {
+                   /* no UESCAPE after the quote, throw back everything */
+                   yyless(0);
                    BEGIN(INITIAL);
                    yylval->str = litbuf_udeescape('\\', yyscanner);
                    return SCONST;
                }
-<xus>{xusstop2} {
+<xusend>{xustop2} {
+                   /* found UESCAPE after the end quote */
                    BEGIN(INITIAL);
+                   if (!check_uescapechar(yytext[yyleng-2]))
+                   {
+                       SET_YYLLOC();
+                       ADVANCE_YYLLOC(yyleng-2);
+                       yyerror("invalid Unicode escape character");
+                   }
                    yylval->str = litbuf_udeescape(yytext[yyleng-2], yyscanner);
                    return SCONST;
                }
@@ -702,9 +723,19 @@ other          .
                    yylval->str = ident;
                    return IDENT;
                }
-<xui>{xuistop1}    {
+<xui>{dquote} {
+                   yyless(1);
+                   /* handle possible UESCAPE in xuiend mode */
+                   BEGIN(xuiend);
+               }
+<xuiend>{whitespace} { }
+<xuiend>{other} |
+<xuiend>{xustop1} {
+                   /* no UESCAPE after the quote, throw back everything */
                    char           *ident;
 
+                   yyless(0);
+
                    BEGIN(INITIAL);
                    if (yyextra->literallen == 0)
                        yyerror("zero-length delimited identifier");
@@ -712,16 +743,21 @@ other         .
                    if (yyextra->literallen >= NAMEDATALEN)
                        truncate_identifier(ident, yyextra->literallen, true);
                    yylval->str = ident;
-                   /* throw back all but the quote */
-                   yyless(1);
                    return IDENT;
                }
-<xui>{xuistop2}    {
+<xuiend>{xustop2}  {
+                   /* found UESCAPE after the end quote */
                    char           *ident;
 
                    BEGIN(INITIAL);
                    if (yyextra->literallen == 0)
                        yyerror("zero-length delimited identifier");
+                   if (!check_uescapechar(yytext[yyleng-2]))
+                   {
+                       SET_YYLLOC();
+                       ADVANCE_YYLLOC(yyleng-2);
+                       yyerror("invalid Unicode escape character");
+                   }
                    ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
                    if (yyextra->literallen >= NAMEDATALEN)
                        truncate_identifier(ident, yyextra->literallen, true);
@@ -1203,22 +1239,29 @@ addunicode(pg_wchar c, core_yyscan_t yyscanner)
    addlit(buf, pg_mblen(buf), yyscanner);
 }
 
-static char *
-litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
+/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
+static bool
+check_uescapechar(unsigned char escape)
 {
-   char *new;
-   char *litbuf, *in, *out;
-   pg_wchar pair_first = 0;
-
    if (isxdigit(escape)
        || escape == '+'
        || escape == '\''
        || escape == '"'
        || scanner_isspace(escape))
    {
-       ADVANCE_YYLLOC(yyextra->literallen + yyleng + 1);
-       yyerror("invalid Unicode escape character");
+       return false;
    }
+   else
+       return true;
+}
+
+/* like litbufdup, but handle unicode escapes */
+static char *
+litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
+{
+   char *new;
+   char *litbuf, *in, *out;
+   pg_wchar pair_first = 0;
 
    /* Make literalbuf null-terminated to simplify the scanning loop */
    litbuf = yyextra->literalbuf;