-
+
SQL Syntax
The Unicode escape syntax works only when the server encoding is
UTF8. When other server encodings are used, only code points in
the ASCII range (up to \007F) can be specified.
+ Both the 4-digit and the 6-digit forms can be used to specify
+ UTF-16 surrogate pairs to compose characters with code points
+ larger than \FFFF (although the availability of
+ the 6-digit form technically makes this unnecessary).
UTF8. When other server encodings are used, only code points in
the ASCII range (up to \007F) can be
specified.
+ Both the 4-digit and the 6-digit forms can be used to specify
+ UTF-16 surrogate pairs to compose characters with code points
+ larger than \FFFF (although the availability
+ of the 6-digit form technically makes this unnecessary).
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.157 2009/07/14 20:24:10 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
*
*-------------------------------------------------------------------------
*/
}
}
+/*
+ * is_utf16_surrogate_first
+ *
+ * Returns true when c falls in the range 0xD800..0xDBFF, i.e. when it is
+ * the first (high) half of a UTF-16 surrogate pair.
+ */
+static bool
+is_utf16_surrogate_first(pg_wchar c)
+{
+	/* inside the range exactly when it is neither below nor above it */
+	return !(c < 0xD800 || c > 0xDBFF);
+}
+
+/*
+ * is_utf16_surrogate_second
+ *
+ * Returns true when c falls in the range 0xDC00..0xDFFF, i.e. when it is
+ * the second (low) half of a UTF-16 surrogate pair.
+ */
+static bool
+is_utf16_surrogate_second(pg_wchar c)
+{
+	/* inside the range exactly when it is neither below nor above it */
+	return !(c < 0xDC00 || c > 0xDFFF);
+}
+
+/*
+ * surrogate_pair_to_codepoint
+ *
+ * Combines the two halves of a UTF-16 surrogate pair into one code point.
+ * The low 10 bits of "first" become the upper 10 bits of the offset, the
+ * low 10 bits of "second" become the lower 10 bits, and the offset is
+ * added to 0x10000.  The arguments are not checked here to be surrogates;
+ * only their low 10 bits are used.
+ */
+static pg_wchar
+surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
+{
+	pg_wchar	high_bits = (first & 0x3FF) << 10;
+	pg_wchar	low_bits = second & 0x3FF;
+
+	return 0x10000 + high_bits + low_bits;
+}
+
static char *
litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
{
char *new;
char *litbuf, *in, *out;
+ pg_wchar pair_first = 0;
if (isxdigit(escape)
|| escape == '+'
{
if (in[1] == escape)
{
+ if (pair_first)
+ {
+ ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
+ yyerror("invalid Unicode surrogate pair");
+ }
*out++ = escape;
in += 2;
}
{
pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
check_unicode_value(unicode, in, yyscanner);
- unicode_to_utf8(unicode, (unsigned char *) out);
+ if (pair_first)
+ {
+ if (is_utf16_surrogate_second(unicode))
+ {
+ unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+ pair_first = 0;
+ }
+ else
+ {
+ ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
+ yyerror("invalid Unicode surrogate pair");
+ }
+ }
+ if (is_utf16_surrogate_first(unicode))
+ pair_first = unicode;
+ else
+ {
+ unicode_to_utf8(unicode, (unsigned char *) out);
+ out += pg_mblen(out);
+ }
in += 5;
- out += pg_mblen(out);
}
else if (in[1] == '+'
&& isxdigit(in[2]) && isxdigit(in[3])
pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
+ hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
check_unicode_value(unicode, in, yyscanner);
- unicode_to_utf8(unicode, (unsigned char *) out);
+ if (pair_first)
+ {
+ if (is_utf16_surrogate_second(unicode))
+ {
+ unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+ pair_first = 0;
+ }
+ else
+ {
+ ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
+ yyerror("invalid Unicode surrogate pair");
+ }
+ }
+ if (is_utf16_surrogate_first(unicode))
+ pair_first = unicode;
+ else
+ {
+ unicode_to_utf8(unicode, (unsigned char *) out);
+ out += pg_mblen(out);
+ }
in += 8;
- out += pg_mblen(out);
}
else
{
}
}
else
+ {
+ if (pair_first)
+ {
+ ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
+ yyerror("invalid Unicode surrogate pair");
+ }
*out++ = *in++;
+ }
}
*out = '\0';