Surrogate pair support for U& string and identifier syntax
author Peter Eisentraut
Mon, 21 Sep 2009 22:22:07 +0000 (22:22 +0000)
committer Peter Eisentraut
Mon, 21 Sep 2009 22:22:07 +0000 (22:22 +0000)
This is mainly to make the functionality consistent with the proposed \u
escape syntax.

doc/src/sgml/syntax.sgml
src/backend/parser/scan.l

index c2dd31b98d37b591ff64774aeab6ed6d8fd46b8c..c805e2e7141b9572945ddf5ddb87e268e83bb105 100644 (file)
@@ -1,4 +1,4 @@
-
+
 
 
  SQL Syntax
@@ -238,6 +238,10 @@ U&"d!0061t!+000061" UESCAPE '!'
     The Unicode escape syntax works only when the server encoding is
     UTF8.  When other server encodings are used, only code points in
     the ASCII range (up to \007F) can be specified.
+    Both the 4-digit and the 6-digit form can be used to specify
+    UTF-16 surrogate pairs to compose characters with code points
+    larger than \FFFF (although the availability of
+    the 6-digit form technically makes this unnecessary).
    
 
    
@@ -497,6 +501,10 @@ U&'d!0061t!+000061' UESCAPE '!'
      UTF8.  When other server encodings are used, only code points in
      the ASCII range (up to \007F) can be
      specified.
+     Both the 4-digit and the 6-digit form can be used to specify
+     UTF-16 surrogate pairs to compose characters with code points
+     larger than \FFFF (although the availability
+     of the 6-digit form technically makes this unnecessary).
     
 
     
index a5ed54792b66ae3c0f83915c4a08854bf4487450..d40bd9dd97e3b351de4c4148a9e022033890ca6b 100644 (file)
@@ -24,7 +24,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.157 2009/07/14 20:24:10 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1097,11 +1097,30 @@ check_unicode_value(pg_wchar c, char *loc, base_yyscan_t yyscanner)
    }
 }
 
+static bool
+is_utf16_surrogate_first(pg_wchar c)
+{
+   return !(c < 0xD800 || c > 0xDBFF);    /* true iff c is in 0xD800..0xDBFF */
+}
+
+static bool
+is_utf16_surrogate_second(pg_wchar c)
+{
+   return !(c < 0xDC00 || c > 0xDFFF);    /* true iff c is in 0xDC00..0xDFFF */
+}
+
+static pg_wchar
+surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
+{
+   return (((first & 0x3FF) << 10) | (second & 0x3FF)) + 0x10000;  /* low 10 bits of each half, then add 0x10000 */
+}
+
 static char *
 litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
 {
    char *new;
    char *litbuf, *in, *out;
+   pg_wchar pair_first = 0;
 
    if (isxdigit(escape)
        || escape == '+'
@@ -1131,6 +1150,11 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
        {
            if (in[1] == escape)
            {
+               if (pair_first)
+               {
+                   ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
+                   yyerror("invalid Unicode surrogate pair");
+               }
                *out++ = escape;
                in += 2;
            }
@@ -1138,9 +1162,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
            {
                pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
                check_unicode_value(unicode, in, yyscanner);
-               unicode_to_utf8(unicode, (unsigned char *) out);
+               if (pair_first)
+               {
+                   if (is_utf16_surrogate_second(unicode))
+                   {
+                       unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+                       pair_first = 0;
+                   }
+                   else
+                   {
+                       ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
+                       yyerror("invalid Unicode surrogate pair");
+                   }
+               }
+               if (is_utf16_surrogate_first(unicode))
+                   pair_first = unicode;
+               else
+               {
+                   unicode_to_utf8(unicode, (unsigned char *) out);
+                   out += pg_mblen(out);
+               }
                in += 5;
-               out += pg_mblen(out);
            }
            else if (in[1] == '+'
                     && isxdigit(in[2]) && isxdigit(in[3])
@@ -1150,9 +1192,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
                pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
                                    + hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
                check_unicode_value(unicode, in, yyscanner);
-               unicode_to_utf8(unicode, (unsigned char *) out);
+               if (pair_first)
+               {
+                   if (is_utf16_surrogate_second(unicode))
+                   {
+                       unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+                       pair_first = 0;
+                   }
+                   else
+                   {
+                       ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
+                       yyerror("invalid Unicode surrogate pair");
+                   }
+               }
+               if (is_utf16_surrogate_first(unicode))
+                   pair_first = unicode;
+               else
+               {
+                   unicode_to_utf8(unicode, (unsigned char *) out);
+                   out += pg_mblen(out);
+               }
                in += 8;
-               out += pg_mblen(out);
            }
            else
            {
@@ -1161,7 +1221,14 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
            }
        }
        else
+       {
+           if (pair_first)
+           {
+               ADVANCE_YYLLOC(in - litbuf + 3);   /* 3 for U&" */
+               yyerror("invalid Unicode surrogate pair");
+           }
            *out++ = *in++;
+       }
    }
 
    *out = '\0';