Unicode escapes in E'...' strings

author Peter Eisentraut

Tue, 22 Sep 2009 23:52:53 +0000 (23:52 +0000)

committer Peter Eisentraut

Tue, 22 Sep 2009 23:52:53 +0000 (23:52 +0000)
author Peter Eisentraut
Tue, 22 Sep 2009 23:52:53 +0000 (23:52 +0000)
committer Peter Eisentraut
Tue, 22 Sep 2009 23:52:53 +0000 (23:52 +0000)
diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml

index c805e2e7141b9572945ddf5ddb87e268e83bb105..73db3235bd6f15250536cf96e4c8a24c140c01dc 100644 (file)
--- a/doc/src/sgml/syntax.sgml
+++ b/doc/src/sgml/syntax.sgml
@@ -1,4 +1,4 @@
-
+
  
  
   SQL Syntax
@@ -398,6 +398,14 @@ SELECT 'foo'      'bar';
          
          hexadecimal byte value
         
+       
+        
+         \uxxxx,
+         \Uxxxxxxxx
+         (x = 0 - 9, A - F)
+        
+        16 or 32-bit hexadecimal Unicode character value
+       
        
        
       
@@ -411,13 +419,25 @@ SELECT 'foo'      'bar';
      
  
      
-     It is your responsibility that the byte sequences you create are
+     It is your responsibility that the byte sequences you create,
+     especially when using the octal or hexadecimal escapes, compose
       valid characters in the server character set encoding.  When the
-     server encoding is UTF-8, then the alternative Unicode escape
-     syntax, explained in <xref linkend="sql-syntax-strings-uescape">,
-     should be used instead.  (The alternative would be doing the
-     UTF-8 encoding by hand and writing out the bytes, which would be
-     very cumbersome.)
+     server encoding is UTF-8, then the Unicode escapes or the
+     alternative Unicode escape syntax, explained
+     in <xref linkend="sql-syntax-strings-uescape">, should be used
+     instead.  (The alternative would be doing the UTF-8 encoding by
+     hand and writing out the bytes, which would be very cumbersome.)
+    
+
+    
+     The Unicode escape syntax works fully only when the server
+     encoding is UTF-8.  When other server encodings are used, only
+     code points in the ASCII range (up to \u007F) can be
+     specified.  Both the 4-digit and the 8-digit form can be used to
+     specify UTF-16 surrogate pairs to compose characters with code
+     points larger than U+FFFF (although the
+     availability of the 8-digit form technically makes this
+     unnecessary).
      
  
      
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l

index d40bd9dd97e3b351de4c4148a9e022033890ca6b..fcfe2b3c403c4b0341b29c1e1aba9da04cfe5b44 100644 (file)
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -24,7 +24,7 @@
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
+ *   $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.159 2009/09/22 23:52:53 petere Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -80,6 +80,9 @@ static void addlitchar(unsigned char ychar, base_yyscan_t yyscanner);
  static char *litbufdup(base_yyscan_t yyscanner);
  static char *litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner);
  static unsigned char unescape_single_char(unsigned char c, base_yyscan_t yyscanner);
+static bool is_utf16_surrogate_first(pg_wchar c);
+static bool is_utf16_surrogate_second(pg_wchar c);
+static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
  
  #define yyerror(msg)  scanner_yyerror(msg, yyscanner)
  
@@ -97,6 +100,8 @@ static void check_escape_warning(base_yyscan_t yyscanner);
  extern int base_yyget_column(yyscan_t yyscanner);
  extern void base_yyset_column(int column_no, yyscan_t yyscanner);
  
+static void addunicode(pg_wchar c, yyscan_t yyscanner);
+
  %}
  
  %option reentrant
@@ -134,6 +139,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
   *   $foo$ quoted strings
   *   quoted identifier with Unicode escapes
   *   quoted string with Unicode escapes
+ *   Unicode surrogate pair in extended quoted string
   */
  
  %x xb
@@ -145,6 +151,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
  %x xdolq
  %x xui
  %x xus
+%x xeu
  
  /*
   * In order to make the world safe for Windows and Mac clients as well as
@@ -223,6 +230,8 @@ xeinside        [^\\']+
  xeescape       [\\][^0-7]
  xeoctesc       [\\][0-7]{1,3}
  xehexesc       [\\]x[0-9A-Fa-f]{1,2}
+xeunicode      [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
+xeunicodebad   [\\]([uU])
  
  /* Extended quote
   * xqdouble implements embedded quote, ''''
@@ -535,6 +544,45 @@ other          .
  {xeinside}  {
                     addlit(yytext, yyleng, yyscanner);
                 }
+{xeunicode} {
+                   pg_wchar c = strtoul(yytext+2, NULL, 16);
+
+                   check_escape_warning(yyscanner);
+
+                   if (is_utf16_surrogate_first(c))
+                   {
+                       yyextra->utf16_first_part = c;
+                       BEGIN(xeu);
+                   }
+                   else if (is_utf16_surrogate_second(c))
+                       yyerror("invalid Unicode surrogate pair");
+                   else
+                       addunicode(c, yyscanner);
+               }
+{xeunicode} {
+                   pg_wchar c = strtoul(yytext+2, NULL, 16);
+
+                   if (!is_utf16_surrogate_second(c))
+                       yyerror("invalid Unicode surrogate pair");
+
+                   c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
+
+                   addunicode(c, yyscanner);
+
+                   BEGIN(xe);
+               }
+.         |
+\n            |
+<>   { yyerror("invalid Unicode surrogate pair"); }
+
+{xeunicodebad} {
+                       ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+                                errmsg("invalid Unicode escape"),
+                                errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
+                                lexer_errposition()));
+                   }
+
  {xeescape}  {
                     if (yytext[1] == '\'')
                     {
@@ -1330,3 +1378,21 @@ base_yyfree(void *ptr, base_yyscan_t yyscanner)
     if (ptr)
         pfree(ptr);
  }
+
+static void /* addunicode: encode code point c with unicode_to_utf8() and hand the bytes to addlit() */
+addunicode(pg_wchar c, base_yyscan_t yyscanner)
+{
+   char buf[8]; /* receives the bytes written by unicode_to_utf8() */
+
+   if (c == 0 || c > 0x10FFFF) /* reject zero and anything above 0x10FFFF */
+       yyerror("invalid Unicode escape value");
+   if (c > 0x7F) /* value outside the 7-bit ASCII range */
+   {
+       if (GetDatabaseEncoding() != PG_UTF8) /* such values are accepted only when the database encoding is PG_UTF8 */
+           yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
+       yyextra->saw_non_ascii = true; /* flag kept in the scanner's extra state */
+   }
+   unicode_to_utf8(c, (unsigned char *)buf);
+   addlit(buf, pg_mblen(buf), yyscanner); /* byte count comes from pg_mblen() on the encoded character */
+}
+
diff --git a/src/include/parser/gramparse.h b/src/include/parser/gramparse.h

index 4b061e0504bb2126d8b574dfb2c81696329f74ce..df384a11caa83e7d1c100669fc1e4f4b20c27054 100644 (file)
--- a/src/include/parser/gramparse.h
+++ b/src/include/parser/gramparse.h
@@ -11,7 +11,7 @@
   * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.47 2009/07/14 20:24:10 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.48 2009/09/22 23:52:53 petere Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -71,6 +71,9 @@ typedef struct base_yy_extra_type
     int         xcdepth;        /* depth of nesting in slash-star comments */
     char       *dolqstart;      /* current $foo$ quote start string */
  
+   /* first part of UTF16 surrogate pair for Unicode escapes */
+   int32       utf16_first_part;
+
     /* state variables for literal-lexing warnings */
     bool        warn_on_first_escape;
     bool        saw_non_ascii;
author	Peter Eisentraut
	Tue, 22 Sep 2009 23:52:53 +0000 (23:52 +0000)
committer	Peter Eisentraut
	Tue, 22 Sep 2009 23:52:53 +0000 (23:52 +0000)
doc/src/sgml/syntax.sgml		patch \| blob \| blame \| history
src/backend/parser/scan.l		patch \| blob \| blame \| history
src/include/parser/gramparse.h		patch \| blob \| blame \| history