diff options
Diffstat (limited to 'src/backend')
-rw-r--r-- | src/backend/catalog/sql_features.txt | 4 | ||||
-rw-r--r-- | src/backend/parser/scan.l | 188 | ||||
-rw-r--r-- | src/backend/utils/adt/xml.c | 25 | ||||
-rw-r--r-- | src/backend/utils/mb/wchar.c | 37 |
4 files changed, 220 insertions, 34 deletions
diff --git a/src/backend/catalog/sql_features.txt b/src/backend/catalog/sql_features.txt index b795a70f3cf..707a0710836 100644 --- a/src/backend/catalog/sql_features.txt +++ b/src/backend/catalog/sql_features.txt @@ -238,8 +238,8 @@ F381 Extended schema manipulation 02 ALTER TABLE statement: ADD CONSTRAINT claus F381 Extended schema manipulation 03 ALTER TABLE statement: DROP CONSTRAINT clause YES F382 Alter column data type YES F391 Long identifiers YES -F392 Unicode escapes in identifiers NO -F393 Unicode escapes in literals NO +F392 Unicode escapes in identifiers YES +F393 Unicode escapes in literals YES F394 Optional normal form specification NO F401 Extended joined table YES F401 Extended joined table 01 NATURAL JOIN YES diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index dec0669d8ba..424907e3c53 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -24,7 +24,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.146 2008/09/01 20:42:45 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.147 2008/10/29 08:04:52 petere Exp $ * *------------------------------------------------------------------------- */ @@ -76,6 +76,7 @@ static int literalalloc; /* current allocated buffer size */ static void addlit(char *ytext, int yleng); static void addlitchar(unsigned char ychar); static char *litbufdup(void); +static char *litbuf_udeescape(unsigned char escape); #define lexer_errposition() scanner_errposition(yylloc) @@ -125,6 +126,8 @@ static unsigned char unescape_single_char(unsigned char c); * <xq> standard quoted strings * <xe> extended quoted strings (support backslash escape sequences) * <xdolq> $foo$ quoted strings + * <xui> quoted identifier with Unicode escapes + * <xus> quoted string with Unicode escapes */ %x xb @@ -134,6 +137,8 @@ static unsigned char unescape_single_char(unsigned char c); %x xe %x xq %x xdolq +%x xui +%x xus /* * In order to make the world safe for Windows and Mac clients as well as @@ -244,6 +249,25 @@ xdstop {dquote} xddouble {dquote}{dquote} xdinside [^"]+ +/* Unicode escapes */ +uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} +/* error rule to avoid backup */ +uescapefail ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]) + +/* Quoted identifier with Unicode escapes */ +xuistart [uU]&{dquote} +xuistop1 {dquote}{whitespace}*{uescapefail}? +xuistop2 {dquote}{whitespace}*{uescape} + +/* Quoted string with Unicode escapes */ +xusstart [uU]&{quote} +xusstop1 {quote}{whitespace}*{uescapefail}? +xusstop2 {quote}{whitespace}*{uescape} + +/* error rule to avoid backup */ +xufailed [uU]& + + /* C-style comments * * The "extended comment" syntax closely resembles allowable operator syntax. @@ -444,6 +468,11 @@ other . BEGIN(xe); startlit(); } +{xusstart} { + SET_YYLLOC(); + BEGIN(xus); + startlit(); + } <xq,xe>{quotestop} | <xq,xe>{quotefail} { yyless(1); @@ -456,10 +485,22 @@ other . yylval.str = litbufdup(); return SCONST; } -<xq,xe>{xqdouble} { +<xus>{xusstop1} { + /* throw back all but the quote */ + yyless(1); + BEGIN(INITIAL); + yylval.str = litbuf_udeescape('\\'); + return SCONST; + } +<xus>{xusstop2} { + BEGIN(INITIAL); + yylval.str = litbuf_udeescape(yytext[yyleng-2]); + return SCONST; + } +<xq,xe,xus>{xqdouble} { addlitchar('\''); } -<xq>{xqinside} { +<xq,xus>{xqinside} { addlit(yytext, yyleng); } <xe>{xeinside} { @@ -496,14 +537,14 @@ other . if (IS_HIGHBIT_SET(c)) saw_high_bit = true; } -<xq,xe>{quotecontinue} { +<xq,xe,xus>{quotecontinue} { /* ignore */ } <xe>. { /* This is only needed for \ just before EOF */ addlitchar(yytext[0]); } -<xq,xe><<EOF>> { yyerror("unterminated quoted string"); } +<xq,xe,xus><<EOF>> { yyerror("unterminated quoted string"); } {dolqdelim} { SET_YYLLOC(); @@ -553,6 +594,11 @@ other . BEGIN(xd); startlit(); } +{xuistart} { + SET_YYLLOC(); + BEGIN(xui); + startlit(); + } <xd>{xdstop} { char *ident; @@ -565,13 +611,46 @@ other . yylval.str = ident; return IDENT; } -<xd>{xddouble} { +<xui>{xuistop1} { + char *ident; + + BEGIN(INITIAL); + if (literallen == 0) + yyerror("zero-length delimited identifier"); + ident = litbuf_udeescape('\\'); + if (literallen >= NAMEDATALEN) + truncate_identifier(ident, literallen, true); + yylval.str = ident; + /* throw back all but the quote */ + yyless(1); + return IDENT; + } +<xui>{xuistop2} { + char *ident; + + BEGIN(INITIAL); + if (literallen == 0) + yyerror("zero-length delimited identifier"); + ident = litbuf_udeescape(yytext[yyleng - 2]); + if (literallen >= NAMEDATALEN) + truncate_identifier(ident, literallen, true); + yylval.str = ident; + return IDENT; + } +<xd,xui>{xddouble} { addlitchar('"'); } -<xd>{xdinside} { +<xd,xui>{xdinside} { addlit(yytext, yyleng); } -<xd><<EOF>> { yyerror("unterminated quoted identifier"); } +<xd,xui><<EOF>> { yyerror("unterminated quoted identifier"); } + +{xufailed} { + /* throw back all but the initial u/U */ + yyless(1); + /* and treat it as {other} */ + return yytext[0]; + } {typecast} { SET_YYLLOC(); @@ -908,6 +987,99 @@ litbufdup(void) return new; } +static int +hexval(unsigned char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 0xA; + if (c >= 'A' && c <= 'F') + return c - 'A' + 0xA; + elog(ERROR, "invalid hexadecimal digit"); + return 0; /* not reached */ +} + +static void +check_unicode_value(pg_wchar c, char * loc) +{ + if (GetDatabaseEncoding() == PG_UTF8) + return; + + if (c > 0x7F) + { + yylloc += (char *) loc - literalbuf + 3; /* 3 for U&" */ + yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"); + } +} + +static char * +litbuf_udeescape(unsigned char escape) +{ + char *new; + char *in, *out; + + if (isxdigit(escape) + || escape == '+' + || escape == '\'' + || escape == '"' + || scanner_isspace(escape)) + { + yylloc += literallen + yyleng + 1; + yyerror("invalid Unicode escape character"); + } + + /* + * This relies on the subtle assumption that a UTF-8 expansion + * cannot be longer than its escaped representation. + */ + new = palloc(literallen + 1); + + in = literalbuf; + out = new; + while (*in) + { + if (in[0] == escape) + { + if (in[1] == escape) + { + *out++ = escape; + in += 2; + } + else if (isxdigit(in[1]) && isxdigit(in[2]) && isxdigit(in[3]) && isxdigit(in[4])) + { + pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]); + check_unicode_value(unicode, in); + unicode_to_utf8(unicode, (unsigned char *) out); + in += 5; + out += pg_mblen(out); + } + else if (in[1] == '+' + && isxdigit(in[2]) && isxdigit(in[3]) + && isxdigit(in[4]) && isxdigit(in[5]) + && isxdigit(in[6]) && isxdigit(in[7])) + { + pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16 + + hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]); + check_unicode_value(unicode, in); + unicode_to_utf8(unicode, (unsigned char *) out); + in += 8; + out += pg_mblen(out); + } + else + { + yylloc += in - literalbuf + 3; /* 3 for U&" */ + yyerror("invalid Unicode escape value"); + } + } + else + *out++ = *in++; + } + + *out = '\0'; + pg_verifymbstr(new, out - new, false); + return new; +} static unsigned char unescape_single_char(unsigned char c) diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index e728e1254f5..c346299caa8 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/utils/adt/xml.c,v 1.79 2008/10/14 17:12:33 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/xml.c,v 1.80 2008/10/29 08:04:53 petere Exp $ * *------------------------------------------------------------------------- */ @@ -1497,28 +1497,7 @@ unicode_to_sqlchar(pg_wchar c) { static unsigned char utf8string[5]; /* need trailing zero */ - if (c <= 0x7F) - { - utf8string[0] = c; - } - else if (c <= 0x7FF) - { - utf8string[0] = 0xC0 | ((c >> 6) & 0x1F); - utf8string[1] = 0x80 | (c & 0x3F); - } - else if (c <= 0xFFFF) - { - utf8string[0] = 0xE0 | ((c >> 12) & 0x0F); - utf8string[1] = 0x80 | ((c >> 6) & 0x3F); - utf8string[2] = 0x80 | (c & 0x3F); - } - else - { - utf8string[0] = 0xF0 | ((c >> 18) & 0x07); - utf8string[1] = 0x80 | ((c >> 12) & 0x3F); - utf8string[2] = 0x80 | ((c >> 6) & 0x3F); - utf8string[3] = 0x80 | (c & 0x3F); - } + unicode_to_utf8(c, utf8string); return (char *) pg_do_encoding_conversion(utf8string, pg_mblen((char *) utf8string), diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index 2f11b3aa9b0..2c6c3f3ff1c 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1,7 +1,7 @@ /* * conversion functions between pg_wchar and multibyte streams. * Tatsuo Ishii - * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.67 2008/10/27 19:37:22 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.68 2008/10/29 08:04:53 petere Exp $ * */ /* can be used in either frontend or backend */ @@ -419,6 +419,41 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) return cnt; } + +/* + * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of + * space allocated. + */ +unsigned char * +unicode_to_utf8(pg_wchar c, unsigned char *utf8string) +{ + if (c <= 0x7F) + { + utf8string[0] = c; + } + else if (c <= 0x7FF) + { + utf8string[0] = 0xC0 | ((c >> 6) & 0x1F); + utf8string[1] = 0x80 | (c & 0x3F); + } + else if (c <= 0xFFFF) + { + utf8string[0] = 0xE0 | ((c >> 12) & 0x0F); + utf8string[1] = 0x80 | ((c >> 6) & 0x3F); + utf8string[2] = 0x80 | (c & 0x3F); + } + else + { + utf8string[0] = 0xF0 | ((c >> 18) & 0x07); + utf8string[1] = 0x80 | ((c >> 12) & 0x3F); + utf8string[2] = 0x80 | ((c >> 6) & 0x3F); + utf8string[3] = 0x80 | (c & 0x3F); + } + + return utf8string; +} + + /* * Return the byte length of a UTF8 character pointed to by s * |