Unicode escapes in strings and identifiers

author Peter Eisentraut

Wed, 29 Oct 2008 08:04:54 +0000 (08:04 +0000)

committer Peter Eisentraut

Wed, 29 Oct 2008 08:04:54 +0000 (08:04 +0000)
author Peter Eisentraut
Wed, 29 Oct 2008 08:04:54 +0000 (08:04 +0000)
committer Peter Eisentraut
Wed, 29 Oct 2008 08:04:54 +0000 (08:04 +0000)
diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml

index 0efba278c55b5dc8c74a776ae87f9d4c9493782a..6c988011b7ca5d8bf9b06d8cc518c59038be589b 100644 (file)
--- a/doc/src/sgml/syntax.sgml
+++ b/doc/src/sgml/syntax.sgml
@@ -1,4 +1,4 @@
-
+
  
  
   SQL Syntax
@@ -189,6 +189,57 @@ UPDATE "my_table" SET "a" = 5;
      ampersands.  The length limitation still applies.
     
  
+   
+    Unicode escapein
+    identifiers A variant of quoted
+    identifiers allows including escaped Unicode characters identified
+    by their code points.  This variant starts
+    with U& (upper or lower case U followed by
+    ampersand) immediately before the opening double quote, without
+    any spaces in between, for example U&"foo".
+    (Note that this creates an ambiguity with the
+    operator &.  Use spaces around the operator to
+    avoid this problem.)  Inside the quotes, Unicode characters can be
+    specified in escaped form by writing a backslash followed by the
+    four-digit hexadecimal code point number or alternatively a
+    backslash followed by a plus sign followed by a six-digit
+    hexadecimal code point number.  For example, the
+    identifier "data" could be written as
+
+U&"d\0061t\+000061"
+
+    The following less trivial example writes the Russian
+    word slon (elephant) in Cyrillic letters:
+
+U&"\0441\043B\043E\043D"
+
+   
+
+   
+    If a different escape character than backslash is desired, it can
+    be specified using
+    the UESCAPEUESCAPE
+    clause after the string, for example:
+
+U&"d!0061t!+000061" UESCAPE '!'
+
+    The escape character can be any single character other than a
+    hexadecimal digit, the plus sign, a single quote, a double quote,
+    or a whitespace character.  Note that the escape character is
+    written in single quotes, not double quotes.
+   
+
+   
+    To include the escape character in the identifier literally, write
+    it twice.
+   
+
+   
+    The Unicode escape syntax works only when the server encoding is
+    UTF8.  When other server encodings are used, only code points in
+    the ASCII range (up to \007F) can be specified.
+   
+
     
      Quoting an identifier also makes it case-sensitive, whereas
      unquoted names are always folded to lower case.  For example, the
@@ -245,7 +296,7 @@ UPDATE "my_table" SET "a" = 5;
       write two adjacent single quotes, e.g.
       'Dianne''s horse'.
       Note that this is not the same as a double-quote
-     character (").
+     character ("). 
      
  
      
@@ -269,14 +320,19 @@ SELECT 'foo'      'bar';
       by SQL; PostgreSQL is
       following the standard.)
      
+   
  
-    
-     
+   
+    String Constants with C-Style Escapes
+
+     
        escape string syntax
       
-     
+      zone="sql-syntax-strings-escape">
        backslash escapes
       
+
+    
       PostgreSQL also accepts escape
       string constants, which are an extension to the SQL standard.
       An escape string constant is specified by writing the letter
@@ -287,7 +343,8 @@ SELECT 'foo'      'bar';
       Within an escape string, a backslash character (\) begins a
       C-like backslash escape sequence, in which the combination
       of backslash and following character(s) represent a special byte
-     value:
+     value, as shown in .
+    
  
       
        Backslash Escape Sequences
@@ -341,14 +398,24 @@ SELECT 'foo'      'bar';
        
       
  
-     It is your responsibility that the byte sequences you create are
-     valid characters in the server character set encoding. Any other
+    
+     Any other
       character following a backslash is taken literally. Thus, to
       include a backslash character, write two backslashes (\\).
       Also, a single quote can be included in an escape string by writing
       \', in addition to the normal way of ''.
      
  
+    
+     It is your responsibility that the byte sequences you create are
+     valid characters in the server character set encoding.  When the
+     server encoding is UTF-8, then the alternative Unicode escape
+     syntax, explained in ,
+     should be used instead.  (The alternative would be doing the
+     UTF-8 encoding by hand and writing out the bytes, which would be
+     very cumbersome.)
+    
+
      
      
       If the configuration parameter
@@ -379,6 +446,65 @@ SELECT 'foo'      'bar';
      
     
  
+   
+    String Constants with Unicode Escapes
+
+    
+     Unicode escape
+     in string constants
+    
+
+    
+     PostgreSQL also supports another type
+     of escape syntax for strings that allows specifying arbitrary
+     Unicode characters by code point.  A Unicode escape string
+     constant starts with U& (upper or lower case
+     letter U followed by ampersand) immediately before the opening
+     quote, without any spaces in between, for
+     example U&'foo'.  (Note that this creates an
+     ambiguity with the operator &.  Use spaces
+     around the operator to avoid this problem.)  Inside the quotes,
+     Unicode characters can be specified in escaped form by writing a
+     backslash followed by the four-digit hexadecimal code point
+     number or alternatively a backslash followed by a plus sign
+     followed by a six-digit hexadecimal code point number.  For
+     example, the string 'data' could be written as
+
+U&'d\0061t\+000061'
+
+     The following less trivial example writes the Russian
+     word slon (elephant) in Cyrillic letters:
+
+U&'\0441\043B\043E\043D'
+
+    
+
+    
+     If a different escape character than backslash is desired, it can
+     be specified using
+     the UESCAPEUESCAPE
+     clause after the string, for example:
+
+          U&'d!0061t!+000061' UESCAPE '!'
+
+     The escape character can be any single character other than a
+     hexadecimal digit, the plus sign, a single quote, a double quote,
+     or a whitespace character.
+    
+
+    
+     The Unicode escape syntax works only when the server encoding is
+     UTF8.  When other server encodings are used, only code points in
+     the ASCII range (up to \007F) can be
+     specified.
+    
+
+    
+     To include the escape character in the string literally, write it
+     twice.
+    
+   
+
     
      Dollar-Quoted String Constants
  
diff --git a/src/backend/catalog/sql_features.txt b/src/backend/catalog/sql_features.txt

index b795a70f3cfa4ec0d4e7421755af35310261bdff..707a0710836f0444aa839fc509779b3c574a1cf6 100644 (file)
--- a/src/backend/catalog/sql_features.txt
+++ b/src/backend/catalog/sql_features.txt
@@ -238,8 +238,8 @@ F381    Extended schema manipulation    02  ALTER TABLE statement: ADD CONSTRAINT claus
  F381   Extended schema manipulation    03  ALTER TABLE statement: DROP CONSTRAINT clause   YES 
  F382   Alter column data type          YES 
  F391   Long identifiers            YES 
-F392   Unicode escapes in identifiers          NO  
-F393   Unicode escapes in literals         NO  
+F392   Unicode escapes in identifiers          YES 
+F393   Unicode escapes in literals         YES 
  F394   Optional normal form specification          NO  
  F401   Extended joined table           YES 
  F401   Extended joined table   01  NATURAL JOIN    YES 
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l

index dec0669d8baa196febb7f1847786b4786621013e..424907e3c5377ac7111c4609f4170a8e24e8c737 100644 (file)
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -24,7 +24,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.146 2008/09/01 20:42:45 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.147 2008/10/29 08:04:52 petere Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -76,6 +76,7 @@ static int        literalalloc;   /* current allocated buffer size */
  static void addlit(char *ytext, int yleng);
  static void addlitchar(unsigned char ychar);
  static char *litbufdup(void);
+static char *litbuf_udeescape(unsigned char escape);
  
  #define lexer_errposition()  scanner_errposition(yylloc)
  
@@ -125,6 +126,8 @@ static unsigned char unescape_single_char(unsigned char c);
   *   standard quoted strings
   *   extended quoted strings (support backslash escape sequences)
   *   $foo$ quoted strings
+ *   quoted identifier with Unicode escapes
+ *   quoted string with Unicode escapes
   */
  
  %x xb
@@ -134,6 +137,8 @@ static unsigned char unescape_single_char(unsigned char c);
  %x xe
  %x xq
  %x xdolq
+%x xui
+%x xus
  
  /*
   * In order to make the world safe for Windows and Mac clients as well as
@@ -244,6 +249,25 @@ xdstop         {dquote}
  xddouble       {dquote}{dquote}
  xdinside       [^"]+
  
+/* Unicode escapes */
+uescape            [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
+/* error rule to avoid backup */
+uescapefail        ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU])
+
+/* Quoted identifier with Unicode escapes */
+xuistart       [uU]&{dquote}
+xuistop1       {dquote}{whitespace}*{uescapefail}?
+xuistop2       {dquote}{whitespace}*{uescape}
+
+/* Quoted string with Unicode escapes */
+xusstart       [uU]&{quote}
+xusstop1       {quote}{whitespace}*{uescapefail}?
+xusstop2       {quote}{whitespace}*{uescape}
+
+/* error rule to avoid backup */
+xufailed       [uU]&
+
+
  /* C-style comments
   *
   * The "extended comment" syntax closely resembles allowable operator syntax.
@@ -444,6 +468,11 @@ other          .
                     BEGIN(xe);
                     startlit();
                 }
+{xusstart}     {
+                   SET_YYLLOC();
+                   BEGIN(xus);
+                   startlit();
+               }
  {quotestop} |
  {quotefail} {
                     yyless(1);
@@ -456,10 +485,22 @@ other         .
                     yylval.str = litbufdup();
                     return SCONST;
                 }
-{xqdouble} {
+{xusstop1} {
+                   /* throw back all but the quote */
+                   yyless(1);
+                   BEGIN(INITIAL);
+                   yylval.str = litbuf_udeescape('\\');
+                   return SCONST;
+               }
+{xusstop2} {
+                   BEGIN(INITIAL);
+                   yylval.str = litbuf_udeescape(yytext[yyleng-2]);
+                   return SCONST;
+               }
+{xqdouble} {
                     addlitchar('\'');
                 }
-{xqinside}  {
+,xus>{xqinside}  {
                     addlit(yytext, yyleng);
                 }
  {xeinside}  {
@@ -496,14 +537,14 @@ other         .
                     if (IS_HIGHBIT_SET(c))
                         saw_high_bit = true;
                 }
-{quotecontinue} {
+,xus>{quotecontinue} {
                     /* ignore */
                 }
  .          {
                     /* This is only needed for \ just before EOF */
                     addlitchar(yytext[0]);
                 }
-<>     { yyerror("unterminated quoted string"); }
+,xus><>     { yyerror("unterminated quoted string"); }
  
  {dolqdelim}        {
                     SET_YYLLOC();
@@ -553,6 +594,11 @@ other          .
                     BEGIN(xd);
                     startlit();
                 }
+{xuistart}     {
+                   SET_YYLLOC();
+                   BEGIN(xui);
+                   startlit();
+               }
  {xdstop}   {
                     char           *ident;
  
@@ -565,13 +611,46 @@ other         .
                     yylval.str = ident;
                     return IDENT;
                 }
-{xddouble} {
+{xuistop1}    {
+                   char           *ident;
+
+                   BEGIN(INITIAL);
+                   if (literallen == 0)
+                       yyerror("zero-length delimited identifier");
+                   ident = litbuf_udeescape('\\');
+                   if (literallen >= NAMEDATALEN)
+                       truncate_identifier(ident, literallen, true);
+                   yylval.str = ident;
+                   /* throw back all but the quote */
+                   yyless(1);
+                   return IDENT;
+               }
+{xuistop2}    {
+                   char           *ident;
+
+                   BEGIN(INITIAL);
+                   if (literallen == 0)
+                       yyerror("zero-length delimited identifier");
+                   ident = litbuf_udeescape(yytext[yyleng - 2]);
+                   if (literallen >= NAMEDATALEN)
+                       truncate_identifier(ident, literallen, true);
+                   yylval.str = ident;
+                   return IDENT;
+               }
+{xddouble} {
                     addlitchar('"');
                 }
-{xdinside} {
+,xui>{xdinside} {
                     addlit(yytext, yyleng);
                 }
-<>        { yyerror("unterminated quoted identifier"); }
+<>        { yyerror("unterminated quoted identifier"); }
+
+{xufailed} {
+                   /* throw back all but the initial u/U */
+                   yyless(1);
+                   /* and treat it as {other} */
+                   return yytext[0];
+               }
  
  {typecast}     {
                     SET_YYLLOC();
@@ -908,6 +987,99 @@ litbufdup(void)
     return new;
  }
  
+static int
+hexval(unsigned char c)
+{
+   if (c >= '0' && c <= '9')
+       return c - '0';
+   if (c >= 'a' && c <= 'f')
+       return c - 'a' + 0xA;
+   if (c >= 'A' && c <= 'F')
+       return c - 'A' + 0xA;
+   elog(ERROR, "invalid hexadecimal digit");
+   return 0; /* not reached */
+}
+
+static void
+check_unicode_value(pg_wchar c, char * loc)
+{
+   if (GetDatabaseEncoding() == PG_UTF8)
+       return;
+
+   if (c > 0x7F)
+   {
+       yylloc += (char *) loc - literalbuf + 3;   /* 3 for U&" */
+       yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
+   }
+}
+
+static char *
+litbuf_udeescape(unsigned char escape)
+{
+   char *new;
+   char *in, *out;
+
+   if (isxdigit(escape)
+       || escape == '+'
+       || escape == '\''
+       || escape == '"'
+       || scanner_isspace(escape))
+   {
+       yylloc += literallen + yyleng + 1;
+       yyerror("invalid Unicode escape character");
+   }
+
+   /*
+    * This relies on the subtle assumption that a UTF-8 expansion
+    * cannot be longer than its escaped representation.
+    */
+   new = palloc(literallen + 1);
+
+   in = literalbuf;
+   out = new;
+   while (*in)
+   {
+       if (in[0] == escape)
+       {
+           if (in[1] == escape)
+           {
+               *out++ = escape;
+               in += 2;
+           }
+           else if (isxdigit(in[1]) && isxdigit(in[2]) && isxdigit(in[3]) && isxdigit(in[4]))
+           {
+               pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
+               check_unicode_value(unicode, in);
+               unicode_to_utf8(unicode, (unsigned char *) out);
+               in += 5;
+               out += pg_mblen(out);
+           }
+           else if (in[1] == '+'
+                    && isxdigit(in[2]) && isxdigit(in[3])
+                    && isxdigit(in[4]) && isxdigit(in[5])
+                    && isxdigit(in[6]) && isxdigit(in[7]))
+           {
+               pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
+                                   + hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
+               check_unicode_value(unicode, in);
+               unicode_to_utf8(unicode, (unsigned char *) out);
+               in += 8;
+               out += pg_mblen(out);
+           }
+           else
+           {
+               yylloc += in - literalbuf + 3;   /* 3 for U&" */
+               yyerror("invalid Unicode escape value");
+           }
+       }
+       else
+           *out++ = *in++;
+   }
+
+   *out = '\0';
+   pg_verifymbstr(new, out - new, false);
+   return new;
+}
  
  static unsigned char
  unescape_single_char(unsigned char c)
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c

index e728e1254f51710dbdce5783eb437826822d452b..c346299caa89641b310c23d336d8902c082c3b0c 100644 (file)
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/backend/utils/adt/xml.c,v 1.79 2008/10/14 17:12:33 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/adt/xml.c,v 1.80 2008/10/29 08:04:53 petere Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -1497,28 +1497,7 @@ unicode_to_sqlchar(pg_wchar c)
  {
     static unsigned char utf8string[5]; /* need trailing zero */
  
-   if (c <= 0x7F)
-   {
-       utf8string[0] = c;
-   }
-   else if (c <= 0x7FF)
-   {
-       utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
-       utf8string[1] = 0x80 | (c & 0x3F);
-   }
-   else if (c <= 0xFFFF)
-   {
-       utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
-       utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
-       utf8string[2] = 0x80 | (c & 0x3F);
-   }
-   else
-   {
-       utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
-       utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
-       utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
-       utf8string[3] = 0x80 | (c & 0x3F);
-   }
+   unicode_to_utf8(c, utf8string);
  
     return (char *) pg_do_encoding_conversion(utf8string,
                                               pg_mblen((char *) utf8string),
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c

index 2f11b3aa9b0db267c19cd2925886162640034c93..2c6c3f3ff1cd30fba5fccf660b581f8ea82f8ee1 100644 (file)
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -1,7 +1,7 @@
  /*
   * conversion functions between pg_wchar and multibyte streams.
   * Tatsuo Ishii
- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.67 2008/10/27 19:37:22 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.68 2008/10/29 08:04:53 petere Exp $
   *
   */
  /* can be used in either frontend or backend */
@@ -419,6 +419,41 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     return cnt;
  }
  
+
+/*
+ * Map a Unicode code point to UTF-8.  utf8string must have 4 bytes of
+ * space allocated.
+ */
+unsigned char *
+unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
+{
+   if (c <= 0x7F)
+   {
+       utf8string[0] = c;
+   }
+   else if (c <= 0x7FF)
+   {
+       utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
+       utf8string[1] = 0x80 | (c & 0x3F);
+   }
+   else if (c <= 0xFFFF)
+   {
+       utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
+       utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
+       utf8string[2] = 0x80 | (c & 0x3F);
+   }
+   else
+   {
+       utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
+       utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
+       utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
+       utf8string[3] = 0x80 | (c & 0x3F);
+   }
+
+   return utf8string;
+}
+
+
  /*
   * Return the byte length of a UTF8 character pointed to by s
   *
diff --git a/src/bin/psql/psqlscan.l b/src/bin/psql/psqlscan.l

index b5f1149cdc44a2dde9ed59b2ba0b32a03c53002d..02329b9e7578fe97a6a1ae2db3568f01b2653c0c 100644 (file)
--- a/src/bin/psql/psqlscan.l
+++ b/src/bin/psql/psqlscan.l
@@ -33,7 +33,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.25 2008/05/09 15:36:31 petere Exp $
+ *   $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.26 2008/10/29 08:04:53 petere Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -156,6 +156,8 @@ static void emit(const char *txt, int len);
   *   standard quoted strings
   *   extended quoted strings (support backslash escape sequences)
   *   $foo$ quoted strings
+ *   quoted identifier with Unicode escapes
+ *   quoted string with Unicode escapes
   */
  
  %x xb
@@ -165,6 +167,8 @@ static void emit(const char *txt, int len);
  %x xe
  %x xq
  %x xdolq
+%x xui
+%x xus
  /* Additional exclusive states for psql only: lex backslash commands */
  %x xslashcmd
  %x xslasharg
@@ -281,6 +285,25 @@ xdstop         {dquote}
  xddouble       {dquote}{dquote}
  xdinside       [^"]+
  
+/* Unicode escapes */
+uescape            [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
+/* error rule to avoid backup */
+uescapefail        ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU])
+
+/* Quoted identifier with Unicode escapes */
+xuistart       [uU]&{dquote}
+xuistop1       {dquote}{whitespace}*{uescapefail}?
+xuistop2       {dquote}{whitespace}*{uescape}
+
+/* Quoted string with Unicode escapes */
+xusstart       [uU]&{quote}
+xusstop1       {quote}{whitespace}*{uescapefail}?
+xusstop2       {quote}{whitespace}*{uescape}
+
+/* error rule to avoid backup */
+xufailed       [uU]&
+
+
  /* C-style comments
   *
   * The "extended comment" syntax closely resembles allowable operator syntax.
@@ -460,16 +483,29 @@ other         .
                     BEGIN(xe);
                     ECHO;
                 }
+{xusstart}     {
+                   BEGIN(xus);
+                   ECHO;
+               }
  {quotestop} |
  {quotefail} {
                     yyless(1);
                     BEGIN(INITIAL);
                     ECHO;
                 }
-{xqdouble} {
+{xusstop1} {
+                   yyless(1);
+                   BEGIN(INITIAL);
+                   ECHO;
+               }
+{xusstop2} {
+                   BEGIN(INITIAL);
+                   ECHO;
+               }
+{xqdouble} {
                     ECHO;
                 }
-{xqinside}  {
+,xus>{xqinside}  {
                     ECHO;
                 }
  {xeinside}  {
@@ -484,7 +520,7 @@ other           .
  {xehexesc}  {
                     ECHO;
                 }
-{quotecontinue} {
+,xus>{quotecontinue} {
                     ECHO;
                 }
  .          {
@@ -535,14 +571,33 @@ other         .
                     BEGIN(xd);
                     ECHO;
                 }
+{xuistart}     {
+                   BEGIN(xui);
+                   ECHO;
+               }
  {xdstop}   {
                     BEGIN(INITIAL);
                     ECHO;
                 }
-{xddouble} {
+{xuistop1}    {
+                   yyless(1);
+                   BEGIN(INITIAL);
+                   ECHO;
+               }
+{xuistop2}    {
+                   BEGIN(INITIAL);
                     ECHO;
                 }
-{xdinside} {
+{xddouble} {
+                   ECHO;
+               }
+{xdinside} {
+                   ECHO;
+               }
+
+{xufailed} {
+                   /* throw back all but the initial u/U */
+                   yyless(1);
                     ECHO;
                 }
  
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h

index b29552fdeaf0856bf8ee3b12d5ac471066d547e4..75003ad52760608fffbef2a4faa24fd0b0ffb440 100644 (file)
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -6,7 +6,7 @@
   * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.79 2008/06/18 18:42:54 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.80 2008/10/29 08:04:53 petere Exp $
   *
   * NOTES
   *     This is used both by the backend and by libpq, but should not be
@@ -380,6 +380,7 @@ extern const char *GetDatabaseEncodingName(void);
  extern int pg_valid_client_encoding(const char *name);
  extern int pg_valid_server_encoding(const char *name);
  
+extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
  extern int pg_utf_mblen(const unsigned char *);
  extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len,
                           int src_encoding,
diff --git a/src/interfaces/ecpg/preproc/pgc.l b/src/interfaces/ecpg/preproc/pgc.l

index a08eb8e2035827b3b9b6e5b6aa339fadafdd58a4..c744c46acaf267fb578eaf24bfb891faa9278e14 100644 (file)
--- a/src/interfaces/ecpg/preproc/pgc.l
+++ b/src/interfaces/ecpg/preproc/pgc.l
@@ -12,7 +12,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/interfaces/ecpg/preproc/pgc.l,v 1.166 2008/05/20 23:17:32 meskes Exp $
+ *   $PostgreSQL: pgsql/src/interfaces/ecpg/preproc/pgc.l,v 1.167 2008/10/29 08:04:53 petere Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -103,6 +103,8 @@ static struct _if_value
   *  extended quoted strings (support backslash escape sequences)
   *  national character quoted strings
   *   $foo$ quoted strings
+ *   quoted identifier with Unicode escapes
+ *   quoted string with Unicode escapes
   */
  
  %x xb
@@ -117,6 +119,8 @@ static struct _if_value
  %x xdolq
  %x xcond
  %x xskip
+%x xui
+%x xus
  
  /* Bit string
   */
@@ -172,6 +176,18 @@ xdstop         {dquote}
  xddouble       {dquote}{dquote}
  xdinside       [^"]+
  
+/* Unicode escapes */
+/* (The ecpg scanner is not backup-free, so the fail rules in scan.l are not needed here, but could be added if desired.) */
+uescape            [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
+
+/* Quoted identifier with Unicode escapes */
+xuistart       [uU]&{dquote}
+xuistop            {dquote}({whitespace}*{uescape})?
+
+/* Quoted string with Unicode escapes */
+xusstart       [uU]&{quote}
+xusstop            {quote}({whitespace}*{uescape})?
+
  /* special stuff for C strings */
  xdcqq          \\\\
  xdcqdq         \\\"
@@ -433,6 +449,13 @@ cppline            {space}*#(.*\\{space})*.*{newline}
                 BEGIN(xe);
                 startlit();
             }
+{xusstart}    {
+               token_start = yytext;
+               state_before = YYSTATE;
+               BEGIN(xus);
+               startlit();
+               addlit(yytext, yyleng);
+           }
  {quotestop} |
  {quotefail} {
                 yyless(1);
@@ -454,22 +477,28 @@ cppline           {space}*#(.*\\{space})*.*{newline}
                 yylval.str = mm_strdup(literalbuf);
                 return NCONST;
             }
-{xqdouble}   { addlitchar('\''); }
+{xusstop} {
+               addlit(yytext, yyleng);
+               BEGIN(state_before);
+               yylval.str = mm_strdup(literalbuf);
+               return UCONST;
+           }
+{xqdouble}   { addlitchar('\''); }
  {xqcquote}        {
                 addlitchar('\\');
                 addlitchar('\'');
             }
-{xqinside}  { addlit(yytext, yyleng); }
+,xus>{xqinside}  { addlit(yytext, yyleng); }
  {xeinside}     { addlit(yytext, yyleng); }
  {xeescape}     { addlit(yytext, yyleng); }
  {xeoctesc}     { addlit(yytext, yyleng); }
  {xehexesc}     { addlit(yytext, yyleng); }
-{quotecontinue}  { /* ignore */ }
+,xus>{quotecontinue}  { /* ignore */ }
  .                   {
                /* This is only needed for \ just before EOF */
                addlitchar(yytext[0]);
             }
-<>  { mmerror(PARSE_ERROR, ET_FATAL, "unterminated quoted string"); }
+,xus><>  { mmerror(PARSE_ERROR, ET_FATAL, "unterminated quoted string"); }
  {dolqfailed}  {
                 /* throw back all but the initial "$" */
                 yyless(1);
@@ -515,6 +544,12 @@ cppline            {space}*#(.*\\{space})*.*{newline}
                         BEGIN(xd);
                         startlit();
                     }
+{xuistart}        {
+                       state_before = YYSTATE;
+                       BEGIN(xui);
+                       startlit();
+                       addlit(yytext, yyleng);
+                   }
  {xdstop}       {
                         BEGIN(state_before);
                         if (literallen == 0)
@@ -528,9 +563,18 @@ cppline            {space}*#(.*\\{space})*.*{newline}
                         yylval.str = mm_strdup(literalbuf);
                         return CSTRING;
                     }
-{xddouble}     { addlitchar('"'); }
-{xdinside}     { addlit(yytext, yyleng); }
-<>        { mmerror(PARSE_ERROR, ET_FATAL, "unterminated quoted identifier"); }
+{xuistop}     {
+                       BEGIN(state_before);
+                       if (literallen == 2) /* "U&" */
+                           mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier");
+                       /* The backend will truncate the idnetifier here. We do not as it does not change the result. */
+                       addlit(yytext, yyleng);
+                       yylval.str = mm_strdup(literalbuf);
+                       return UIDENT;
+                   }
+{xddouble}     { addlitchar('"'); }
+{xdinside}     { addlit(yytext, yyleng); }
+<>        { mmerror(PARSE_ERROR, ET_FATAL, "unterminated quoted identifier"); }
  {xdstart}   {
                         state_before = YYSTATE;
                         BEGIN(xdc);
diff --git a/src/interfaces/ecpg/preproc/preproc.y b/src/interfaces/ecpg/preproc/preproc.y

index 2fbbbd94fc7cd3ac6d61060110f50b50988a0ded..f5fe0a979331acd1c929c27bd3d1e7e213d7fb57 100644 (file)
--- a/src/interfaces/ecpg/preproc/preproc.y
+++ b/src/interfaces/ecpg/preproc/preproc.y
@@ -1,4 +1,4 @@
-/* $PostgreSQL: pgsql/src/interfaces/ecpg/preproc/preproc.y,v 1.379 2008/10/28 14:09:45 petere Exp $ */
+/* $PostgreSQL: pgsql/src/interfaces/ecpg/preproc/preproc.y,v 1.380 2008/10/29 08:04:53 petere Exp $ */
  
  /* Copyright comment */
  %{
@@ -509,7 +509,7 @@ add_typedef(char *name, char * dimension, char * length, enum ECPGttype type_enu
  
  /* Special token types, not actually keywords - see the "lex" file */
  %token    IDENT SCONST Op CSTRING CVARIABLE CPP_LINE IP BCONST
-%token    XCONST DOLCONST ECONST NCONST
+%token    XCONST DOLCONST ECONST NCONST UCONST UIDENT
  %token   ICONST PARAM
  %token   FCONST
  
@@ -4966,6 +4966,10 @@ Sconst:  SCONST
             $$[strlen($1)+3]='\0';
             free($1);
         }
+   | UCONST
+       {
+           $$ = $1; 
+       }
     | DOLCONST
         {
             $$ = $1; 
@@ -7013,6 +7017,7 @@ cvariable:    CVARIABLE
         ;
  ident: IDENT               { $$ = $1; }
         | CSTRING       { $$ = make3_str(make_str("\""), $1, make_str("\"")); }
+       | UIDENT        { $$ = $1; }
         ;
  
  quoted_ident_stringvar: name
diff --git a/src/interfaces/ecpg/test/ecpg_schedule b/src/interfaces/ecpg/test/ecpg_schedule

index c478ed126b9310b3ba0a7f39b0a2727193308561..14fcd41a4640591c9926cb532301e8027803c091 100644 (file)
--- a/src/interfaces/ecpg/test/ecpg_schedule
+++ b/src/interfaces/ecpg/test/ecpg_schedule
@@ -18,6 +18,7 @@ test: preproc/autoprep
  test: preproc/comment
  test: preproc/define
  test: preproc/init
+test: preproc/strings
  test: preproc/type
  test: preproc/variable
  test: preproc/whenever
diff --git a/src/interfaces/ecpg/test/ecpg_schedule_tcp b/src/interfaces/ecpg/test/ecpg_schedule_tcp

index 5dbca9dd169b8a63f84b15452c2b12bfb76b4d63..814347324439c772371458f87052aaca17955347 100644 (file)
--- a/src/interfaces/ecpg/test/ecpg_schedule_tcp
+++ b/src/interfaces/ecpg/test/ecpg_schedule_tcp
@@ -18,6 +18,7 @@ test: preproc/autoprep
  test: preproc/comment
  test: preproc/define
  test: preproc/init
+test: preproc/strings
  test: preproc/type
  test: preproc/variable
  test: preproc/whenever
diff --git a/src/interfaces/ecpg/test/expected/preproc-strings.c b/src/interfaces/ecpg/test/expected/preproc-strings.c

new file mode 100644 (file)

index 0000000..9a99dad
--- /dev/null
+++ b/src/interfaces/ecpg/test/expected/preproc-strings.c
@@ -0,0 +1,62 @@
+/* Processed by ecpg (regression mode) */
+/* These include files are added by the preprocessor */
+#include 
+#include 
+#include 
+/* End of automatic include section */
+#define ECPGdebug(X,Y) ECPGdebug((X)+100,(Y))
+
+#line 1 "strings.pgc"
+#include 
+
+
+#line 1 "regression.h"
+
+
+
+
+
+
+#line 3 "strings.pgc"
+
+
+/* exec sql begin declare section */
+      
+
+#line 6 "strings.pgc"
+ char * s1    , * s2    , * s3    , * s4    , * s5    , * s6    ;
+/* exec sql end declare section */
+#line 7 "strings.pgc"
+
+
+int main(void)
+{
+  ECPGdebug(1, stderr);
+
+  { ECPGconnect(__LINE__, 0, "regress1" , NULL, NULL , NULL, 0); }
+#line 13 "strings.pgc"
+
+
+  { ECPGdo(__LINE__, 0, 1, NULL, 0, ECPGst_normal, "select  'abcdef' , N'abcdef' as foo , E'abc\\bdef' as \"foo\" , U&'d\\0061t\\0061' as U&\"foo\" , U&'d!+000061t!+000061' uescape '!' , $foo$abc$def$foo$     ", ECPGt_EOIT, 
+   ECPGt_char,&(s1),(long)0,(long)1,(1)*sizeof(char), 
+   ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, 
+   ECPGt_char,&(s2),(long)0,(long)1,(1)*sizeof(char), 
+   ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, 
+   ECPGt_char,&(s3),(long)0,(long)1,(1)*sizeof(char), 
+   ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, 
+   ECPGt_char,&(s4),(long)0,(long)1,(1)*sizeof(char), 
+   ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, 
+   ECPGt_char,&(s5),(long)0,(long)1,(1)*sizeof(char), 
+   ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, 
+   ECPGt_char,&(s6),(long)0,(long)1,(1)*sizeof(char), 
+   ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, ECPGt_EORT);}
+#line 21 "strings.pgc"
+
+
+  printf("%s %s %s %s %s %s\n", s1, s2, s3, s4, s5, s6);
+
+  { ECPGdisconnect(__LINE__, "CURRENT");}
+#line 25 "strings.pgc"
+
+  exit (0);
+}
diff --git a/src/interfaces/ecpg/test/expected/preproc-strings.stderr b/src/interfaces/ecpg/test/expected/preproc-strings.stderr

new file mode 100644 (file)

index 0000000..021e280
--- /dev/null
+++ b/src/interfaces/ecpg/test/expected/preproc-strings.stderr
@@ -0,0 +1,36 @@
+[NO_PID]: ECPGdebug: set to 1
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ECPGconnect: opening database regress1 on  port   
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_execute on line 15: query: select  'abcdef' , N'abcdef' as foo , E'abc\bdef' as "foo" , U&'d\0061t\0061' as U&"foo" , U&'d!+000061t!+000061' uescape '!' , $foo$abc$def$foo$     ; with 0 parameter(s) on connection regress1
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_execute on line 15: using PQexec
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_execute on line 15: correctly got 1 tuples with 6 fields
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: abcdef offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: abcdef offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: abc\bdef offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: data offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: data offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_get_data on line 15: RESULT: abc$def offset: -1; array: yes
+[NO_PID]: sqlca: code: 0, state: 00000
+[NO_PID]: ecpg_finish: connection regress1 closed
+[NO_PID]: sqlca: code: 0, state: 00000
diff --git a/src/interfaces/ecpg/test/expected/preproc-strings.stdout b/src/interfaces/ecpg/test/expected/preproc-strings.stdout

new file mode 100644 (file)

index 0000000..730d72d
--- /dev/null
+++ b/src/interfaces/ecpg/test/expected/preproc-strings.stdout
@@ -0,0 +1 @@
+abcdef abcdef abc\bdef data data abc$def
diff --git a/src/interfaces/ecpg/test/preproc/Makefile b/src/interfaces/ecpg/test/preproc/Makefile

index 6928a1f3fe3e96e8447f660502c93a8ab901ac57..94b6779a417c13f02786313993224d308de088f0 100644 (file)
--- a/src/interfaces/ecpg/test/preproc/Makefile
+++ b/src/interfaces/ecpg/test/preproc/Makefile
@@ -9,6 +9,7 @@ TESTS = array_of_struct array_of_struct.c \
     comment comment.c \
     define define.c \
     init init.c \
+   strings strings.c \
     type type.c \
     variable variable.c \
     whenever whenever.c
diff --git a/src/interfaces/ecpg/test/preproc/strings.pgc b/src/interfaces/ecpg/test/preproc/strings.pgc

new file mode 100644 (file)

index 0000000..1a8c0d7
--- /dev/null
+++ b/src/interfaces/ecpg/test/preproc/strings.pgc
@@ -0,0 +1,27 @@
+#include 
+
+exec sql include ../regression;
+
+exec sql begin declare section;
+char *s1, *s2, *s3, *s4, *s5, *s6;
+exec sql end declare section;
+
+int main(void)
+{
+  ECPGdebug(1, stderr);
+
+  exec sql connect to REGRESSDB1;
+
+  exec sql select 'abcdef',
+                  N'abcdef' AS foo,
+                  E'abc\bdef' AS "foo",
+                  U&'d\0061t\0061' AS U&"foo",
+                  U&'d!+000061t!+000061' uescape '!',
+                  $foo$abc$def$foo$
+                  into :s1, :s2, :s3, :s4, :s5, :s6;
+
+  printf("%s %s %s %s %s %s\n", s1, s2, s3, s4, s5, s6);
+
+  exec sql disconnect;
+  exit (0);
+}
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out

index 742ec9291f94b6040ff389f97b94b075f966c39a..6b9dc5df9f46983a2d2e07eecd087e5ab6cbe2c2 100644 (file)
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -21,6 +21,31 @@ SELECT 'first line'
  ERROR:  syntax error at or near "' - third line'"
  LINE 3: ' - third line'
          ^
+-- Unicode escapes
+SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061";
+ data 
+------
+ data
+(1 row)
+
+SELECT U&'d!0061t\+000061' UESCAPE '!' AS U&"d*0061t\+000061" UESCAPE '*';
+ dat\+000061 
+-------------
+ dat\+000061
+(1 row)
+
+SELECT U&'wrong: \061';
+ERROR:  invalid Unicode escape value at or near "\061'"
+LINE 1: SELECT U&'wrong: \061';
+                         ^
+SELECT U&'wrong: \+0061';
+ERROR:  invalid Unicode escape value at or near "\+0061'"
+LINE 1: SELECT U&'wrong: \+0061';
+                         ^
+SELECT U&'wrong: +0061' UESCAPE '+';
+ERROR:  invalid Unicode escape character at or near "+'"
+LINE 1: SELECT U&'wrong: +0061' UESCAPE '+';
+                                         ^
  --
  -- test conversions between various string types
  -- E021-10 implicit casting among the character data types
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql

index c042f33b017360b2d3955fc16626037e643f6920..0da88c7b29e6f09effba211cb4f51903bd9156af 100644 (file)
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -16,6 +16,14 @@ SELECT 'first line'
  ' - third line'
     AS "Illegal comment within continuation";
  
+-- Unicode escapes
+SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061";
+SELECT U&'d!0061t\+000061' UESCAPE '!' AS U&"d*0061t\+000061" UESCAPE '*';
+
+SELECT U&'wrong: \061';
+SELECT U&'wrong: \+0061';
+SELECT U&'wrong: +0061' UESCAPE '+';
+
  --
  -- test conversions between various string types
  -- E021-10 implicit casting among the character data types
author	Peter Eisentraut
	Wed, 29 Oct 2008 08:04:54 +0000 (08:04 +0000)
committer	Peter Eisentraut
	Wed, 29 Oct 2008 08:04:54 +0000 (08:04 +0000)
doc/src/sgml/syntax.sgml		patch \| blob \| blame \| history
src/backend/catalog/sql_features.txt		patch \| blob \| blame \| history
src/backend/parser/scan.l		patch \| blob \| blame \| history
src/backend/utils/adt/xml.c		patch \| blob \| blame \| history
src/backend/utils/mb/wchar.c		patch \| blob \| blame \| history
src/bin/psql/psqlscan.l		patch \| blob \| blame \| history
src/include/mb/pg_wchar.h		patch \| blob \| blame \| history
src/interfaces/ecpg/preproc/pgc.l		patch \| blob \| blame \| history
src/interfaces/ecpg/preproc/preproc.y		patch \| blob \| blame \| history
src/interfaces/ecpg/test/ecpg_schedule		patch \| blob \| blame \| history
src/interfaces/ecpg/test/ecpg_schedule_tcp		patch \| blob \| blame \| history
src/interfaces/ecpg/test/expected/preproc-strings.c	[new file with mode: 0644]	patch \| blob
src/interfaces/ecpg/test/expected/preproc-strings.stderr	[new file with mode: 0644]	patch \| blob
src/interfaces/ecpg/test/expected/preproc-strings.stdout	[new file with mode: 0644]	patch \| blob
src/interfaces/ecpg/test/preproc/Makefile		patch \| blob \| blame \| history
src/interfaces/ecpg/test/preproc/strings.pgc	[new file with mode: 0644]	patch \| blob
src/test/regress/expected/strings.out		patch \| blob \| blame \| history
src/test/regress/sql/strings.sql		patch \| blob \| blame \| history