about | summary | refs | log | tree | commit | diff
path: root/src/backend
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend')
-rw-r--r-- src/backend/catalog/sql_features.txt 4
-rw-r--r-- src/backend/parser/scan.l 188
-rw-r--r-- src/backend/utils/adt/xml.c 25
-rw-r--r-- src/backend/utils/mb/wchar.c 37
4 files changed, 220 insertions, 34 deletions
diff --git a/src/backend/catalog/sql_features.txt b/src/backend/catalog/sql_features.txt
index b795a70f3cf..707a0710836 100644
--- a/src/backend/catalog/sql_features.txt
+++ b/src/backend/catalog/sql_features.txt
@@ -238,8 +238,8 @@ F381 Extended schema manipulation 02 ALTER TABLE statement: ADD CONSTRAINT claus
F381 Extended schema manipulation 03 ALTER TABLE statement: DROP CONSTRAINT clause YES
F382 Alter column data type YES
F391 Long identifiers YES
-F392 Unicode escapes in identifiers NO
-F393 Unicode escapes in literals NO
+F392 Unicode escapes in identifiers YES
+F393 Unicode escapes in literals YES
F394 Optional normal form specification NO
F401 Extended joined table YES
F401 Extended joined table 01 NATURAL JOIN YES
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index dec0669d8ba..424907e3c53 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -24,7 +24,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.146 2008/09/01 20:42:45 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.147 2008/10/29 08:04:52 petere Exp $
*
*-------------------------------------------------------------------------
*/
@@ -76,6 +76,7 @@ static int literalalloc; /* current allocated buffer size */
static void addlit(char *ytext, int yleng);
static void addlitchar(unsigned char ychar);
static char *litbufdup(void);
+static char *litbuf_udeescape(unsigned char escape);
#define lexer_errposition() scanner_errposition(yylloc)
@@ -125,6 +126,8 @@ static unsigned char unescape_single_char(unsigned char c);
* <xq> standard quoted strings
* <xe> extended quoted strings (support backslash escape sequences)
* <xdolq> $foo$ quoted strings
+ * <xui> quoted identifier with Unicode escapes
+ * <xus> quoted string with Unicode escapes
*/
%x xb
@@ -134,6 +137,8 @@ static unsigned char unescape_single_char(unsigned char c);
%x xe
%x xq
%x xdolq
+%x xui
+%x xus
/*
* In order to make the world safe for Windows and Mac clients as well as
@@ -244,6 +249,25 @@ xdstop {dquote}
xddouble {dquote}{dquote}
xdinside [^"]+
+/*
+ * Unicode escapes
+ *
+ * uescape is a complete UESCAPE clause: the keyword in any letter case,
+ * optional whitespace, then exactly one character that is not a single
+ * quote, enclosed in {quote}s.  That character is the escape character.
+ */
+uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
+/*
+ * error rule to avoid backup: matches the incomplete forms of a UESCAPE
+ * clause (a truncated keyword, or the keyword without a complete quoted
+ * character), plus a "-" either alone or after the keyword.
+ * NOTE(review): the "-" alternatives presumably exist because {whitespace}
+ * can start a "--" comment; confirm against the {whitespace} definition.
+ */
+uescapefail ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU])
+
+/*
+ * Quoted identifier with Unicode escapes
+ *
+ * xuistart is u& or U& immediately followed by a double quote.
+ * xuistop1 is the closing double quote, optionally followed by whitespace
+ * and an incomplete UESCAPE clause; xuistop2 is the closing double quote
+ * followed by a complete UESCAPE clause.
+ */
+xuistart [uU]&{dquote}
+xuistop1 {dquote}{whitespace}*{uescapefail}?
+xuistop2 {dquote}{whitespace}*{uescape}
+
+/*
+ * Quoted string with Unicode escapes
+ *
+ * Same structure as the identifier patterns above, but delimited by
+ * {quote} instead of {dquote}.
+ */
+xusstart [uU]&{quote}
+xusstop1 {quote}{whitespace}*{uescapefail}?
+xusstop2 {quote}{whitespace}*{uescape}
+
+/*
+ * error rule to avoid backup: u& or U& that is not followed by a quote
+ * (the xuistart/xusstart patterns are one character longer).
+ */
+xufailed [uU]&
+
+
/* C-style comments
*
* The "extended comment" syntax closely resembles allowable operator syntax.
@@ -444,6 +468,11 @@ other .
BEGIN(xe);
startlit();
}
+{xusstart} {
+ /*
+  * U&' (or u&') opens a string constant with Unicode escapes:
+  * switch to the <xus> state and start a new literal.
+  */
+ SET_YYLLOC();
+ BEGIN(xus);
+ startlit();
+ }
<xq,xe>{quotestop} |
<xq,xe>{quotefail} {
yyless(1);
@@ -456,10 +485,22 @@ other .
yylval.str = litbufdup();
return SCONST;
}
-<xq,xe>{xqdouble} {
+<xus>{xusstop1} {
+ /*
+  * Closing quote without a complete UESCAPE clause: the
+  * literal is decoded with backslash as the escape character.
+  */
+ /* throw back all but the quote */
+ yyless(1);
+ BEGIN(INITIAL);
+ yylval.str = litbuf_udeescape('\\');
+ return SCONST;
+ }
+<xus>{xusstop2} {
+ /*
+  * Closing quote followed by a complete UESCAPE clause.  The
+  * match ends with quote, escape character, quote, so the
+  * escape character is the next-to-last byte of yytext.
+  */
+ BEGIN(INITIAL);
+ yylval.str = litbuf_udeescape(yytext[yyleng-2]);
+ return SCONST;
+ }
+<xq,xe,xus>{xqdouble} {
addlitchar('\'');
}
-<xq>{xqinside} {
+<xq,xus>{xqinside} {
addlit(yytext, yyleng);
}
<xe>{xeinside} {
@@ -496,14 +537,14 @@ other .
if (IS_HIGHBIT_SET(c))
saw_high_bit = true;
}
-<xq,xe>{quotecontinue} {
+<xq,xe,xus>{quotecontinue} {
/* ignore */
}
<xe>. {
/* This is only needed for \ just before EOF */
addlitchar(yytext[0]);
}
-<xq,xe><<EOF>> { yyerror("unterminated quoted string"); }
+<xq,xe,xus><<EOF>> { yyerror("unterminated quoted string"); }
{dolqdelim} {
SET_YYLLOC();
@@ -553,6 +594,11 @@ other .
BEGIN(xd);
startlit();
}
+{xuistart} {
+ /*
+  * U&" (or u&") opens a delimited identifier with Unicode
+  * escapes: switch to the <xui> state and start a new literal.
+  */
+ SET_YYLLOC();
+ BEGIN(xui);
+ startlit();
+ }
<xd>{xdstop} {
char *ident;
@@ -565,13 +611,46 @@ other .
yylval.str = ident;
return IDENT;
}
-<xd>{xddouble} {
+<xui>{xuistop1}	{
+					/*
+					 * Closing double quote without a complete UESCAPE
+					 * clause: decode the identifier with backslash as
+					 * the escape character.
+					 */
+					char	   *ident;
+					int			identlen;
+
+					BEGIN(INITIAL);
+					if (literallen == 0)
+						yyerror("zero-length delimited identifier");
+					ident = litbuf_udeescape('\\');
+					/*
+					 * Test the length of the de-escaped identifier.
+					 * literallen counts the raw text including the
+					 * escape sequences, so it can reach NAMEDATALEN
+					 * even when the decoded name is short, which would
+					 * produce a spurious truncation notice.
+					 */
+					identlen = strlen(ident);
+					if (identlen >= NAMEDATALEN)
+						truncate_identifier(ident, identlen, true);
+					yylval.str = ident;
+					/* throw back all but the quote */
+					yyless(1);
+					return IDENT;
+				}
+<xui>{xuistop2}	{
+					/*
+					 * Closing double quote followed by a complete
+					 * UESCAPE clause.  The match ends with quote,
+					 * escape character, quote, so the escape character
+					 * is the next-to-last byte of yytext.
+					 */
+					char	   *ident;
+					int			identlen;
+
+					BEGIN(INITIAL);
+					if (literallen == 0)
+						yyerror("zero-length delimited identifier");
+					ident = litbuf_udeescape(yytext[yyleng - 2]);
+					/*
+					 * Test the length of the de-escaped identifier.
+					 * literallen counts the raw text including the
+					 * escape sequences, so it can reach NAMEDATALEN
+					 * even when the decoded name is short, which would
+					 * produce a spurious truncation notice.
+					 */
+					identlen = strlen(ident);
+					if (identlen >= NAMEDATALEN)
+						truncate_identifier(ident, identlen, true);
+					yylval.str = ident;
+					return IDENT;
+				}
+<xd,xui>{xddouble} {
addlitchar('"');
}
-<xd>{xdinside} {
+<xd,xui>{xdinside} {
addlit(yytext, yyleng);
}
-<xd><<EOF>> { yyerror("unterminated quoted identifier"); }
+<xd,xui><<EOF>> { yyerror("unterminated quoted identifier"); }
+
+{xufailed} {
+ /*
+  * u& or U& that did not turn out to start a Unicode-escape
+  * string or identifier (no quote followed).
+  */
+ /* throw back all but the initial u/U */
+ yyless(1);
+ /* and treat it as {other} */
+ return yytext[0];
+ }
{typecast} {
SET_YYLLOC();
@@ -908,6 +987,99 @@ litbufdup(void)
return new;
}
+/*
+ * Return the numeric value (0..15) of the hexadecimal digit c.
+ *
+ * Both letter cases are accepted.  Any other character is reported
+ * with elog(ERROR).
+ */
+static int
+hexval(unsigned char c)
+{
+	int			value;
+
+	if ('0' <= c && c <= '9')
+		value = c - '0';
+	else if ('a' <= c && c <= 'f')
+		value = (c - 'a') + 0xA;
+	else if ('A' <= c && c <= 'F')
+		value = (c - 'A') + 0xA;
+	else
+	{
+		elog(ERROR, "invalid hexadecimal digit");
+		value = 0;				/* not reached */
+	}
+
+	return value;
+}
+
+/*
+ * Check that the code point c, decoded from a Unicode escape, may be used.
+ *
+ * loc points at the start of the escape sequence inside literalbuf; it is
+ * only used to advance yylloc to the offending escape before reporting.
+ *
+ * Errors are reported through yyerror(), which is assumed not to return
+ * (the caller's error branches rely on the same assumption).
+ */
+static void
+check_unicode_value(pg_wchar c, char *loc)
+{
+	/*
+	 * The six-digit escape form can spell values up to FFFFFF, but Unicode
+	 * ends at 10FFFF.  unicode_to_utf8() keeps only the low 21 bits, so
+	 * without this check a larger value can silently turn into a different,
+	 * valid character.
+	 */
+	if (c > 0x10FFFF)
+	{
+		yylloc += loc - literalbuf + 3;		/* 3 for U&" */
+		yyerror("invalid Unicode escape value");
+	}
+
+	/* In a UTF8 database every remaining code point is acceptable. */
+	if (GetDatabaseEncoding() == PG_UTF8)
+		return;
+
+	/* Other server encodings accept only the ASCII range. */
+	if (c > 0x7F)
+	{
+		yylloc += loc - literalbuf + 3;		/* 3 for U&" */
+		yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
+	}
+}
+
+/*
+ * Return a palloc'd copy of the current literal buffer with its Unicode
+ * escapes expanded to UTF-8.
+ *
+ * "escape" is the escape character.  Inside the literal:
+ *		<escape><escape>	stands for one escape character
+ *		<escape>XXXX		is the code point given by four hex digits
+ *		<escape>+XXXXXX		is the code point given by six hex digits
+ * Any other use of the escape character is reported through yyerror(),
+ * as is an escape character that is a hex digit, '+', a single or double
+ * quote, or whitespace.
+ *
+ * The result is passed through pg_verifymbstr() before it is returned.
+ */
+static char *
+litbuf_udeescape(unsigned char escape)
+{
+	char	   *new;
+	char	   *in,
+			   *out;
+
+	if (isxdigit(escape)
+		|| escape == '+'
+		|| escape == '\''
+		|| escape == '"'
+		|| scanner_isspace(escape))
+	{
+		yylloc += literallen + yyleng + 1;
+		yyerror("invalid Unicode escape character");
+	}
+
+	/*
+	 * This relies on the subtle assumption that a UTF-8 expansion
+	 * cannot be longer than its escaped representation: the five-byte
+	 * form yields at most three bytes, the eight-byte form at most four.
+	 */
+	new = palloc(literallen + 1);
+
+	in = literalbuf;
+	out = new;
+	while (*in)
+	{
+		/*
+		 * Compare as unsigned char: "escape" is unsigned, so where plain
+		 * char is signed a high-bit escape character would never match.
+		 */
+		if ((unsigned char) in[0] == escape)
+		{
+			if ((unsigned char) in[1] == escape)
+			{
+				*out++ = escape;
+				in += 2;
+			}
+			/*
+			 * The <ctype.h> functions need an unsigned char value.  The
+			 * && chains stop at the terminating NUL, so nothing past the
+			 * end of the buffer is examined.
+			 */
+			else if (isxdigit((unsigned char) in[1])
+					 && isxdigit((unsigned char) in[2])
+					 && isxdigit((unsigned char) in[3])
+					 && isxdigit((unsigned char) in[4]))
+			{
+				pg_wchar	unicode = hexval(in[1]) * 16*16*16
+					+ hexval(in[2]) * 16*16
+					+ hexval(in[3]) * 16
+					+ hexval(in[4]);
+
+				check_unicode_value(unicode, in);
+				unicode_to_utf8(unicode, (unsigned char *) out);
+				in += 5;
+				out += pg_mblen(out);
+			}
+			else if (in[1] == '+'
+					 && isxdigit((unsigned char) in[2])
+					 && isxdigit((unsigned char) in[3])
+					 && isxdigit((unsigned char) in[4])
+					 && isxdigit((unsigned char) in[5])
+					 && isxdigit((unsigned char) in[6])
+					 && isxdigit((unsigned char) in[7]))
+			{
+				pg_wchar	unicode = hexval(in[2]) * 16*16*16*16*16
+					+ hexval(in[3]) * 16*16*16*16
+					+ hexval(in[4]) * 16*16*16
+					+ hexval(in[5]) * 16*16
+					+ hexval(in[6]) * 16
+					+ hexval(in[7]);
+
+				check_unicode_value(unicode, in);
+				unicode_to_utf8(unicode, (unsigned char *) out);
+				in += 8;
+				out += pg_mblen(out);
+			}
+			else
+			{
+				yylloc += in - literalbuf + 3;	/* 3 for U&" */
+				yyerror("invalid Unicode escape value");
+			}
+		}
+		else
+			*out++ = *in++;
+	}
+
+	*out = '\0';
+	pg_verifymbstr(new, out - new, false);
+	return new;
+}
static unsigned char
unescape_single_char(unsigned char c)
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index e728e1254f5..c346299caa8 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/utils/adt/xml.c,v 1.79 2008/10/14 17:12:33 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/adt/xml.c,v 1.80 2008/10/29 08:04:53 petere Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1497,28 +1497,7 @@ unicode_to_sqlchar(pg_wchar c)
{
static unsigned char utf8string[5]; /* need trailing zero */
- if (c <= 0x7F)
- {
- utf8string[0] = c;
- }
- else if (c <= 0x7FF)
- {
- utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
- utf8string[1] = 0x80 | (c & 0x3F);
- }
- else if (c <= 0xFFFF)
- {
- utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
- utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
- utf8string[2] = 0x80 | (c & 0x3F);
- }
- else
- {
- utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
- utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
- utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
- utf8string[3] = 0x80 | (c & 0x3F);
- }
+ unicode_to_utf8(c, utf8string);
return (char *) pg_do_encoding_conversion(utf8string,
pg_mblen((char *) utf8string),
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c
index 2f11b3aa9b0..2c6c3f3ff1c 100644
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -1,7 +1,7 @@
/*
* conversion functions between pg_wchar and multibyte streams.
* Tatsuo Ishii
- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.67 2008/10/27 19:37:22 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.68 2008/10/29 08:04:53 petere Exp $
*
*/
/* can be used in either frontend or backend */
@@ -419,6 +419,41 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
return cnt;
}
+
+/*
+ * Encode the Unicode code point c as UTF-8 into utf8string and return
+ * utf8string.
+ *
+ * One to four bytes are written and no terminator is added, so the
+ * caller must supply at least 4 bytes of space.  Only the low 21 bits
+ * of c take part in the four-byte form; higher bits are masked off.
+ */
+unsigned char *
+unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
+{
+	unsigned char *dst = utf8string;
+
+	if (c <= 0x7F)
+	{
+		/* 0xxxxxxx */
+		*dst = c;
+		return utf8string;
+	}
+
+	if (c <= 0x7FF)
+	{
+		/* 110xxxxx 10xxxxxx */
+		*dst++ = 0xC0 | ((c >> 6) & 0x1F);
+		*dst = 0x80 | (c & 0x3F);
+		return utf8string;
+	}
+
+	if (c <= 0xFFFF)
+	{
+		/* 1110xxxx 10xxxxxx 10xxxxxx */
+		*dst++ = 0xE0 | ((c >> 12) & 0x0F);
+		*dst++ = 0x80 | ((c >> 6) & 0x3F);
+		*dst = 0x80 | (c & 0x3F);
+		return utf8string;
+	}
+
+	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+	*dst++ = 0xF0 | ((c >> 18) & 0x07);
+	*dst++ = 0x80 | ((c >> 12) & 0x3F);
+	*dst++ = 0x80 | ((c >> 6) & 0x3F);
+	*dst = 0x80 | (c & 0x3F);
+
+	return utf8string;
+}
+
+
/*
* Return the byte length of a UTF8 character pointed to by s
*