about summary refs log tree commit diff
path: root/src/backend
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend')
-rw-r--r-- src/backend/parser/parser.c 74
-rw-r--r-- src/backend/parser/scan.l 131
-rw-r--r-- src/backend/utils/adt/jsonpath_scan.l 45
-rw-r--r-- src/backend/utils/adt/xml.c 24
-rw-r--r-- src/backend/utils/mb/mbutils.c 105
5 files changed, 271 insertions, 108 deletions
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index 1bf1144c4fd..be86eb37fef 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -292,22 +292,14 @@ hexval(unsigned char c)
return 0; /* not reached */
}
-/* is Unicode code point acceptable in database's encoding? */
+/* is Unicode code point acceptable? */
static void
-check_unicode_value(pg_wchar c, int pos, core_yyscan_t yyscanner)
+check_unicode_value(pg_wchar c)
{
- /* See also addunicode() in scan.l */
- if (c == 0 || c > 0x10FFFF)
+ if (!is_valid_unicode_codepoint(c))
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("invalid Unicode escape value"),
- scanner_errposition(pos, yyscanner)));
-
- if (c > 0x7F && GetDatabaseEncoding() != PG_UTF8)
- ereport(ERROR,
- (errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"),
- scanner_errposition(pos, yyscanner)));
+ errmsg("invalid Unicode escape value")));
}
/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
@@ -338,20 +330,39 @@ str_udeescape(const char *str, char escape,
const char *in;
char *new,
*out;
+ size_t new_len;
pg_wchar pair_first = 0;
+ ScannerCallbackState scbstate;
/*
- * This relies on the subtle assumption that a UTF-8 expansion cannot be
- * longer than its escaped representation.
+ * Guesstimate that result will be no longer than input, but allow enough
+ * padding for Unicode conversion.
*/
- new = palloc(strlen(str) + 1);
+ new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
+ new = palloc(new_len);
in = str;
out = new;
while (*in)
{
+ /* Enlarge string if needed */
+ size_t out_dist = out - new;
+
+ if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
+ {
+ new_len *= 2;
+ new = repalloc(new, new_len);
+ out = new + out_dist;
+ }
+
if (in[0] == escape)
{
+ /*
+ * Any errors reported while processing this escape sequence will
+ * have an error cursor pointing at the escape.
+ */
+ setup_scanner_errposition_callback(&scbstate, yyscanner,
+ in - str + position + 3); /* 3 for U&" */
if (in[1] == escape)
{
if (pair_first)
@@ -370,9 +381,7 @@ str_udeescape(const char *str, char escape,
(hexval(in[2]) << 8) +
(hexval(in[3]) << 4) +
hexval(in[4]);
- check_unicode_value(unicode,
- in - str + position + 3, /* 3 for U&" */
- yyscanner);
+ check_unicode_value(unicode);
if (pair_first)
{
if (is_utf16_surrogate_second(unicode))
@@ -390,8 +399,8 @@ str_udeescape(const char *str, char escape,
pair_first = unicode;
else
{
- unicode_to_utf8(unicode, (unsigned char *) out);
- out += pg_mblen(out);
+ pg_unicode_to_server(unicode, (unsigned char *) out);
+ out += strlen(out);
}
in += 5;
}
@@ -411,9 +420,7 @@ str_udeescape(const char *str, char escape,
(hexval(in[5]) << 8) +
(hexval(in[6]) << 4) +
hexval(in[7]);
- check_unicode_value(unicode,
- in - str + position + 3, /* 3 for U&" */
- yyscanner);
+ check_unicode_value(unicode);
if (pair_first)
{
if (is_utf16_surrogate_second(unicode))
@@ -431,17 +438,18 @@ str_udeescape(const char *str, char escape,
pair_first = unicode;
else
{
- unicode_to_utf8(unicode, (unsigned char *) out);
- out += pg_mblen(out);
+ pg_unicode_to_server(unicode, (unsigned char *) out);
+ out += strlen(out);
}
in += 8;
}
else
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("invalid Unicode escape value"),
- scanner_errposition(in - str + position + 3, /* 3 for U&" */
- yyscanner)));
+ errmsg("invalid Unicode escape"),
+ errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
+
+ cancel_scanner_errposition_callback(&scbstate);
}
else
{
@@ -457,15 +465,13 @@ str_udeescape(const char *str, char escape,
goto invalid_pair;
*out = '\0';
+ return new;
/*
- * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
- * codes; but it's probably not worth the trouble, since this isn't likely
- * to be a performance-critical path.
+ * We might get here with the error callback active, or not. Call
+ * scanner_errposition to make sure an error cursor appears; if the
+ * callback is active, this is duplicative but harmless.
*/
- pg_verifymbstr(new, out - new, false);
- return new;
-
invalid_pair:
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index 84c73914a85..b1ea0cb5384 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -106,6 +106,18 @@ const uint16 ScanKeywordTokens[] = {
*/
#define ADVANCE_YYLLOC(delta) ( *(yylloc) += (delta) )
+/*
+ * Sometimes, we do want yylloc to point into the middle of a token; this is
+ * useful for instance to throw an error about an escape sequence within a
+ * string literal. But if we find no error there, we want to revert yylloc
+ * to the token start, so that that's the location reported to the parser.
+ * Use PUSH_YYLLOC/POP_YYLLOC to save/restore yylloc around such code.
+ * (Currently the implied "stack" is just one location, but someday we might
+ * need to nest these.)
+ */
+#define PUSH_YYLLOC() (yyextra->save_yylloc = *(yylloc))
+#define POP_YYLLOC() (*(yylloc) = yyextra->save_yylloc)
+
#define startlit() ( yyextra->literallen = 0 )
static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
@@ -605,8 +617,18 @@ other .
<xe>{xeunicode} {
pg_wchar c = strtoul(yytext + 2, NULL, 16);
+ /*
+ * For consistency with other productions, issue any
+ * escape warning with cursor pointing to start of string.
+ * We might want to change that, someday.
+ */
check_escape_warning(yyscanner);
+ /* Remember start of overall string token ... */
+ PUSH_YYLLOC();
+ /* ... and set the error cursor to point at this esc seq */
+ SET_YYLLOC();
+
if (is_utf16_surrogate_first(c))
{
yyextra->utf16_first_part = c;
@@ -616,10 +638,18 @@ other .
yyerror("invalid Unicode surrogate pair");
else
addunicode(c, yyscanner);
+
+ /* Restore yylloc to be start of string token */
+ POP_YYLLOC();
}
<xeu>{xeunicode} {
pg_wchar c = strtoul(yytext + 2, NULL, 16);
+ /* Remember start of overall string token ... */
+ PUSH_YYLLOC();
+ /* ... and set the error cursor to point at this esc seq */
+ SET_YYLLOC();
+
if (!is_utf16_surrogate_second(c))
yyerror("invalid Unicode surrogate pair");
@@ -627,12 +657,21 @@ other .
addunicode(c, yyscanner);
+ /* Restore yylloc to be start of string token */
+ POP_YYLLOC();
+
BEGIN(xe);
}
-<xeu>. { yyerror("invalid Unicode surrogate pair"); }
-<xeu>\n { yyerror("invalid Unicode surrogate pair"); }
-<xeu><<EOF>> { yyerror("invalid Unicode surrogate pair"); }
+<xeu>. |
+<xeu>\n |
+<xeu><<EOF>> {
+ /* Set the error cursor to point at missing esc seq */
+ SET_YYLLOC();
+ yyerror("invalid Unicode surrogate pair");
+ }
<xe,xeu>{xeunicodefail} {
+ /* Set the error cursor to point at malformed esc seq */
+ SET_YYLLOC();
ereport(ERROR,
(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
errmsg("invalid Unicode escape"),
@@ -1029,12 +1068,13 @@ other .
* scanner_errposition
* Report a lexer or grammar error cursor position, if possible.
*
- * This is expected to be used within an ereport() call. The return value
+ * This is expected to be used within an ereport() call, or via an error
+ * callback such as setup_scanner_errposition_callback(). The return value
* is a dummy (always 0, in fact).
*
* Note that this can only be used for messages emitted during raw parsing
- * (essentially, scan.l and gram.y), since it requires the yyscanner struct
- * to still be available.
+ * (essentially, scan.l, parser.c, and gram.y), since it requires the
+ * yyscanner struct to still be available.
*/
int
scanner_errposition(int location, core_yyscan_t yyscanner)
@@ -1051,6 +1091,62 @@ scanner_errposition(int location, core_yyscan_t yyscanner)
}
/*
+ * Error context callback for inserting scanner error location.
+ *
+ * Note that this will be called for *any* error occurring while the
+ * callback is installed. We avoid inserting an irrelevant error location
+ * if the error is a query cancel --- are there any other important cases?
+ */
+static void
+scb_error_callback(void *arg)
+{
+ ScannerCallbackState *scbstate = (ScannerCallbackState *) arg; /* state filled in by setup_scanner_errposition_callback() */
+
+ if (geterrcode() != ERRCODE_QUERY_CANCELED) /* no error cursor for a query cancel */
+ (void) scanner_errposition(scbstate->location, scbstate->yyscanner); /* result deliberately ignored */
+}
+
+/*
+ * setup_scanner_errposition_callback
+ * Arrange for non-scanner errors to report an error position
+ *
+ * Sometimes the scanner calls functions that aren't part of the scanner
+ * subsystem and can't reasonably be passed the yyscanner pointer; yet
+ * we would like any errors thrown in those functions to be tagged with an
+ * error location. Use this function to set up an error context stack
+ * entry that will accomplish that. Usage pattern:
+ *
+ * declare a local variable "ScannerCallbackState scbstate"
+ * ...
+ * setup_scanner_errposition_callback(&scbstate, yyscanner, location);
+ * call function that might throw error;
+ * cancel_scanner_errposition_callback(&scbstate);
+ */
+void
+setup_scanner_errposition_callback(ScannerCallbackState *scbstate,
+ core_yyscan_t yyscanner,
+ int location)
+{
+ /* Set up error traceback support for ereport() */
+ scbstate->yyscanner = yyscanner; /* read back by scb_error_callback() */
+ scbstate->location = location; /* cursor position to report */
+ scbstate->errcallback.callback = scb_error_callback;
+ scbstate->errcallback.arg = (void *) scbstate; /* callback receives this state as its argument */
+ scbstate->errcallback.previous = error_context_stack; /* save current top so cancel can restore it */
+ error_context_stack = &scbstate->errcallback; /* push our entry */
+}
+
+/*
+ * Cancel a previously-set-up errposition callback.
+ */
+void
+cancel_scanner_errposition_callback(ScannerCallbackState *scbstate)
+{
+ /* Pop the error context stack by restoring the entry saved at setup time */
+ error_context_stack = scbstate->errcallback.previous; /* NOTE(review): does not check that scbstate is the current top */
+}
+
+/*
* scanner_yyerror
* Report a lexer or grammar error.
*
@@ -1226,19 +1322,20 @@ process_integer_literal(const char *token, YYSTYPE *lval)
static void
addunicode(pg_wchar c, core_yyscan_t yyscanner)
{
- char buf[8];
+ ScannerCallbackState scbstate;
+ char buf[MAX_UNICODE_EQUIVALENT_STRING + 1];
- /* See also check_unicode_value() in parser.c */
- if (c == 0 || c > 0x10FFFF)
+ if (!is_valid_unicode_codepoint(c))
yyerror("invalid Unicode escape value");
- if (c > 0x7F)
- {
- if (GetDatabaseEncoding() != PG_UTF8)
- yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
- yyextra->saw_non_ascii = true;
- }
- unicode_to_utf8(c, (unsigned char *) buf);
- addlit(buf, pg_mblen(buf), yyscanner);
+
+ /*
+ * We expect that pg_unicode_to_server() will complain about any
+ * unconvertible code point, so we don't have to set saw_non_ascii.
+ */
+ setup_scanner_errposition_callback(&scbstate, yyscanner, *(yylloc));
+ pg_unicode_to_server(c, (unsigned char *) buf);
+ cancel_scanner_errposition_callback(&scbstate);
+ addlit(buf, strlen(buf), yyscanner);
}
static unsigned char
diff --git a/src/backend/utils/adt/jsonpath_scan.l b/src/backend/utils/adt/jsonpath_scan.l
index 70681b789d3..be0a2cfa2f7 100644
--- a/src/backend/utils/adt/jsonpath_scan.l
+++ b/src/backend/utils/adt/jsonpath_scan.l
@@ -486,13 +486,6 @@ hexval(char c)
static void
addUnicodeChar(int ch)
{
- /*
- * For UTF8, replace the escape sequence by the actual
- * utf8 character in lex->strval. Do this also for other
- * encodings if the escape designates an ASCII character,
- * otherwise raise an error.
- */
-
if (ch == 0)
{
/* We can't allow this, since our TEXT type doesn't */
@@ -501,40 +494,20 @@ addUnicodeChar(int ch)
errmsg("unsupported Unicode escape sequence"),
errdetail("\\u0000 cannot be converted to text.")));
}
- else if (GetDatabaseEncoding() == PG_UTF8)
- {
- char utf8str[5];
- int utf8len;
-
- unicode_to_utf8(ch, (unsigned char *) utf8str);
- utf8len = pg_utf_mblen((unsigned char *) utf8str);
- addstring(false, utf8str, utf8len);
- }
- else if (ch <= 0x007f)
- {
- /*
- * This is the only way to designate things like a
- * form feed character in JSON, so it's useful in all
- * encodings.
- */
- addchar(false, (char) ch);
- }
else
{
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
- errmsg("invalid input syntax for type %s", "jsonpath"),
- errdetail("Unicode escape values cannot be used for code "
- "point values above 007F when the server encoding "
- "is not UTF8.")));
+ char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
+
+ pg_unicode_to_server(ch, (unsigned char *) cbuf);
+ addstring(false, cbuf, strlen(cbuf));
}
}
-/* Add unicode character and process its hi surrogate */
+/* Add unicode character, processing any surrogate pairs */
static void
addUnicode(int ch, int *hi_surrogate)
{
- if (ch >= 0xd800 && ch <= 0xdbff)
+ if (is_utf16_surrogate_first(ch))
{
if (*hi_surrogate != -1)
ereport(ERROR,
@@ -542,10 +515,10 @@ addUnicode(int ch, int *hi_surrogate)
errmsg("invalid input syntax for type %s", "jsonpath"),
errdetail("Unicode high surrogate must not follow "
"a high surrogate.")));
- *hi_surrogate = (ch & 0x3ff) << 10;
+ *hi_surrogate = ch;
return;
}
- else if (ch >= 0xdc00 && ch <= 0xdfff)
+ else if (is_utf16_surrogate_second(ch))
{
if (*hi_surrogate == -1)
ereport(ERROR,
@@ -553,7 +526,7 @@ addUnicode(int ch, int *hi_surrogate)
errmsg("invalid input syntax for type %s", "jsonpath"),
errdetail("Unicode low surrogate must follow a high "
"surrogate.")));
- ch = 0x10000 + *hi_surrogate + (ch & 0x3ff);
+ ch = surrogate_pair_to_codepoint(*hi_surrogate, ch);
*hi_surrogate = -1;
}
else if (*hi_surrogate != -1)
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index c7ae1eded80..4c299057a6f 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -2086,26 +2086,6 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped,
/*
- * Map a Unicode codepoint into the current server encoding.
- */
-static char *
-unicode_to_sqlchar(pg_wchar c)
-{
- char utf8string[8]; /* need room for trailing zero */
- char *result;
-
- memset(utf8string, 0, sizeof(utf8string));
- unicode_to_utf8(c, (unsigned char *) utf8string);
-
- result = pg_any_to_server(utf8string, strlen(utf8string), PG_UTF8);
- /* if pg_any_to_server didn't strdup, we must */
- if (result == utf8string)
- result = pstrdup(result);
- return result;
-}
-
-
-/*
* Map XML name to SQL identifier; see SQL/XML:2008 section 9.3.
*/
char *
@@ -2125,10 +2105,12 @@ map_xml_name_to_sql_identifier(const char *name)
&& isxdigit((unsigned char) *(p + 5))
&& *(p + 6) == '_')
{
+ char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
unsigned int u;
sscanf(p + 2, "%X", &u);
- appendStringInfoString(&buf, unicode_to_sqlchar(u));
+ pg_unicode_to_server(u, (unsigned char *) cbuf);
+ appendStringInfoString(&buf, cbuf);
p += 6;
}
else
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index 86787bcb319..a8e13cacfde 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -68,6 +68,13 @@ static FmgrInfo *ToServerConvProc = NULL;
static FmgrInfo *ToClientConvProc = NULL;
/*
+ * This variable stores the conversion function to convert from UTF-8
+ * to the server encoding. It's NULL if the server encoding *is* UTF-8,
+ * or if we lack a conversion function for this.
+ */
+static FmgrInfo *Utf8ToServerConvProc = NULL;
+
+/*
* These variables track the currently-selected encodings.
*/
static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
@@ -273,6 +280,8 @@ SetClientEncoding(int encoding)
void
InitializeClientEncoding(void)
{
+ int current_server_encoding;
+
Assert(!backend_startup_complete);
backend_startup_complete = true;
@@ -289,6 +298,35 @@ InitializeClientEncoding(void)
pg_enc2name_tbl[pending_client_encoding].name,
GetDatabaseEncodingName())));
}
+
+ /*
+ * Also look up the UTF8-to-server conversion function if needed. Since
+ * the server encoding is fixed within any one backend process, we don't
+ * have to do this more than once.
+ */
+ current_server_encoding = GetDatabaseEncoding();
+ if (current_server_encoding != PG_UTF8 &&
+ current_server_encoding != PG_SQL_ASCII)
+ {
+ Oid utf8_to_server_proc;
+
+ Assert(IsTransactionState());
+ utf8_to_server_proc =
+ FindDefaultConversionProc(PG_UTF8,
+ current_server_encoding);
+ /* If there's no such conversion, just leave the pointer as NULL */
+ if (OidIsValid(utf8_to_server_proc))
+ {
+ FmgrInfo *finfo;
+
+ finfo = (FmgrInfo *) MemoryContextAlloc(TopMemoryContext,
+ sizeof(FmgrInfo));
+ fmgr_info_cxt(utf8_to_server_proc, finfo,
+ TopMemoryContext);
+ /* Set Utf8ToServerConvProc only after data is fully valid */
+ Utf8ToServerConvProc = finfo;
+ }
+ }
}
/*
@@ -752,6 +790,73 @@ perform_default_encoding_conversion(const char *src, int len,
return result;
}
+/*
+ * Convert a single Unicode code point into a string in the server encoding.
+ *
+ * The code point given by "c" is converted and stored at *s, which must
+ * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
+ * The output will have a trailing '\0'. Throws error if the conversion
+ * cannot be performed.
+ *
+ * Note that this relies on having previously looked up any required
+ * conversion function. That's partly for speed but mostly because the parser
+ * may call this outside any transaction, or in an aborted transaction.
+ */
+void
+pg_unicode_to_server(pg_wchar c, unsigned char *s)
+{
+ unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1]; /* +1 for the '\0' stored below */
+ int c_as_utf8_len; /* length of the UTF-8 form, not counting the '\0' */
+ int server_encoding;
+
+ /*
+ * Complain if invalid Unicode code point. The choice of errcode here is
+ * debatable, but really our caller should have checked this anyway.
+ */
+ if (!is_valid_unicode_codepoint(c))
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("invalid Unicode code point")));
+
+ /* Otherwise, if it's in ASCII range, conversion is trivial */
+ if (c <= 0x7F)
+ {
+ s[0] = (unsigned char) c;
+ s[1] = '\0';
+ return;
+ }
+
+ /* If the server encoding is UTF-8, we just need to reformat the code */
+ server_encoding = GetDatabaseEncoding();
+ if (server_encoding == PG_UTF8)
+ {
+ unicode_to_utf8(c, s);
+ s[pg_utf_mblen(s)] = '\0'; /* terminate right after the one encoded character */
+ return;
+ }
+
+ /* For all other cases, we must have a conversion function available */
+ if (Utf8ToServerConvProc == NULL) /* still NULL if the startup lookup found nothing or was skipped */
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("conversion between %s and %s is not supported",
+ pg_enc2name_tbl[PG_UTF8].name,
+ GetDatabaseEncodingName())));
+
+ /* Construct UTF-8 source string */
+ unicode_to_utf8(c, c_as_utf8);
+ c_as_utf8_len = pg_utf_mblen(c_as_utf8);
+ c_as_utf8[c_as_utf8_len] = '\0';
+
+ /* Convert into *s, or throw error if we can't; the returned datum is unused */
+ FunctionCall5(Utf8ToServerConvProc,
+ Int32GetDatum(PG_UTF8), /* source encoding */
+ Int32GetDatum(server_encoding), /* destination encoding */
+ CStringGetDatum(c_as_utf8), /* source string */
+ CStringGetDatum(s), /* destination buffer */
+ Int32GetDatum(c_as_utf8_len)); /* source length */
+}
+
/* convert a multibyte string to a wchar */
int