diff options
-rw-r--r-- | doc/src/sgml/syntax.sgml | 10 | ||||
-rw-r--r-- | src/backend/parser/scan.l | 77 |
2 files changed, 81 insertions, 6 deletions
diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml index c2dd31b98d3..c805e2e7141 100644 --- a/doc/src/sgml/syntax.sgml +++ b/doc/src/sgml/syntax.sgml @@ -1,4 +1,4 @@ -<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.134 2009/08/27 20:08:02 tgl Exp $ --> +<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.135 2009/09/21 22:22:07 petere Exp $ --> <chapter id="sql-syntax"> <title>SQL Syntax</title> @@ -238,6 +238,10 @@ U&"d!0061t!+000061" UESCAPE '!' The Unicode escape syntax works only when the server encoding is UTF8. When other server encodings are used, only code points in the ASCII range (up to <literal>\007F</literal>) can be specified. + Both the 4-digit and the 6-digit form can be used to specify + UTF-16 surrogate pairs to compose characters with code points + larger than <literal>\FFFF</literal> (although the availability of + the 6-digit form technically makes this unnecessary). </para> <para> @@ -497,6 +501,10 @@ U&'d!0061t!+000061' UESCAPE '!' UTF8. When other server encodings are used, only code points in the ASCII range (up to <literal>\007F</literal>) can be specified. + Both the 4-digit and the 6-digit form can be used to specify + UTF-16 surrogate pairs to compose characters with code points + larger than <literal>\FFFF</literal> (although the availability + of the 6-digit form technically makes this unnecessary). </para> <para> diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index a5ed54792b6..d40bd9dd97e 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -24,7 +24,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.157 2009/07/14 20:24:10 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $ * *------------------------------------------------------------------------- */ @@ -1097,11 +1097,30 @@ check_unicode_value(pg_wchar c, char *loc, base_yyscan_t yyscanner) } } +static bool +is_utf16_surrogate_first(pg_wchar c) +{ + return (c >= 0xD800 && c <= 0xDBFF); +} + +static bool +is_utf16_surrogate_second(pg_wchar c) +{ + return (c >= 0xDC00 && c <= 0xDFFF); +} + +static pg_wchar +surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second) +{ + return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF); +} + static char * litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner) { char *new; char *litbuf, *in, *out; + pg_wchar pair_first = 0; if (isxdigit(escape) || escape == '+' @@ -1131,6 +1150,11 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner) { if (in[1] == escape) { + if (pair_first) + { + ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ + yyerror("invalid Unicode surrogate pair"); + } *out++ = escape; in += 2; } @@ -1138,9 +1162,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner) { pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]); check_unicode_value(unicode, in, yyscanner); - unicode_to_utf8(unicode, (unsigned char *) out); + if (pair_first) + { + if (is_utf16_surrogate_second(unicode)) + { + unicode = surrogate_pair_to_codepoint(pair_first, unicode); + pair_first = 0; + } + else + { + ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ + yyerror("invalid Unicode surrogate pair"); + } + } + if (is_utf16_surrogate_first(unicode)) + pair_first = unicode; + else + { + unicode_to_utf8(unicode, (unsigned char *) out); + out += pg_mblen(out); + } in += 5; - out += pg_mblen(out); } else if (in[1] == '+' && isxdigit(in[2]) && isxdigit(in[3]) @@ -1150,9 +1192,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner) pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16 + hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]); check_unicode_value(unicode, in, yyscanner); - unicode_to_utf8(unicode, (unsigned char *) out); + if (pair_first) + { + if (is_utf16_surrogate_second(unicode)) + { + unicode = surrogate_pair_to_codepoint(pair_first, unicode); + pair_first = 0; + } + else + { + ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ + yyerror("invalid Unicode surrogate pair"); + } + } + if (is_utf16_surrogate_first(unicode)) + pair_first = unicode; + else + { + unicode_to_utf8(unicode, (unsigned char *) out); + out += pg_mblen(out); + } in += 8; - out += pg_mblen(out); } else { @@ -1161,7 +1221,14 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner) } } else + { + if (pair_first) + { + ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */ + yyerror("invalid Unicode surrogate pair"); + } *out++ = *in++; + } } *out = '\0'; |