2 files changed, 154 insertions, 51 deletions
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index 1bf1144c4fd..be86eb37fef 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -292,22 +292,14 @@ hexval(unsigned char c)
 	return 0;					/* not reached */
 }
 
-/* is Unicode code point acceptable in database's encoding? */
+/* throw a syntax error unless c is a valid Unicode code point */
 static void
-check_unicode_value(pg_wchar c, int pos, core_yyscan_t yyscanner)
+check_unicode_value(pg_wchar c)
 {
-	/* See also addunicode() in scan.l */
-	if (c == 0 || c > 0x10FFFF)
+	if (!is_valid_unicode_codepoint(c))
 		ereport(ERROR,
 				(errcode(ERRCODE_SYNTAX_ERROR),
-				 errmsg("invalid Unicode escape value"),
-				 scanner_errposition(pos, yyscanner)));
-
-	if (c > 0x7F && GetDatabaseEncoding() != PG_UTF8)
-		ereport(ERROR,
-				(errcode(ERRCODE_SYNTAX_ERROR),
-				 errmsg("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"),
-				 scanner_errposition(pos, yyscanner)));
+				 errmsg("invalid Unicode escape value")));
 }
 
 /* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
@@ -338,20 +330,39 @@ str_udeescape(const char *str, char escape,
 	const char *in;
 	char	   *new,
 			   *out;
+	size_t		new_len;
 	pg_wchar	pair_first = 0;
+	ScannerCallbackState scbstate;
 
 	/*
-	 * This relies on the subtle assumption that a UTF-8 expansion cannot be
-	 * longer than its escaped representation.
+	 * Guesstimate that result will be no longer than input, but allow enough
+	 * padding for Unicode conversion.
 	 */
-	new = palloc(strlen(str) + 1);
+	new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
+	new = palloc(new_len);
 
 	in = str;
 	out = new;
 	while (*in)
 	{
+		/* Enlarge string if needed */
+		size_t		out_dist = out - new;
+
+		if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
+		{
+			new_len *= 2;
+			new = repalloc(new, new_len);
+			out = new + out_dist;
+		}
+
 		if (in[0] == escape)
 		{
+			/*
+			 * Any errors reported while processing this escape sequence will
+			 * have an error cursor pointing at the escape.
+			 */
+			setup_scanner_errposition_callback(&scbstate, yyscanner,
+											   in - str + position + 3);	/* 3 for U&" */
 			if (in[1] == escape)
 			{
 				if (pair_first)
@@ -370,9 +381,7 @@ str_udeescape(const char *str, char escape,
 					(hexval(in[2]) << 8) +
 					(hexval(in[3]) << 4) +
 					hexval(in[4]);
-				check_unicode_value(unicode,
-									in - str + position + 3,	/* 3 for U&" */
-									yyscanner);
+				check_unicode_value(unicode);
 				if (pair_first)
 				{
 					if (is_utf16_surrogate_second(unicode))
@@ -390,8 +399,8 @@ str_udeescape(const char *str, char escape,
 					pair_first = unicode;
 				else
 				{
-					unicode_to_utf8(unicode, (unsigned char *) out);
-					out += pg_mblen(out);
+					pg_unicode_to_server(unicode, (unsigned char *) out);
+					out += strlen(out);
 				}
 				in += 5;
 			}
@@ -411,9 +420,7 @@ str_udeescape(const char *str, char escape,
 					(hexval(in[5]) << 8) +
 					(hexval(in[6]) << 4) +
 					hexval(in[7]);
-				check_unicode_value(unicode,
-									in - str + position + 3,	/* 3 for U&" */
-									yyscanner);
+				check_unicode_value(unicode);
 				if (pair_first)
 				{
 					if (is_utf16_surrogate_second(unicode))
@@ -431,17 +438,18 @@ str_udeescape(const char *str, char escape,
 					pair_first = unicode;
 				else
 				{
-					unicode_to_utf8(unicode, (unsigned char *) out);
-					out += pg_mblen(out);
+					pg_unicode_to_server(unicode, (unsigned char *) out);
+					out += strlen(out);
 				}
 				in += 8;
 			}
 			else
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
-						 errmsg("invalid Unicode escape value"),
-						 scanner_errposition(in - str + position + 3,	/* 3 for U&" */
-											 yyscanner)));
+						 errmsg("invalid Unicode escape"),
+						 errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
+
+			cancel_scanner_errposition_callback(&scbstate);
 		}
 		else
 		{
@@ -457,15 +465,13 @@ str_udeescape(const char *str, char escape,
 		goto invalid_pair;
 
 	*out = '\0';
+	return new;
 
 	/*
-	 * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
-	 * codes; but it's probably not worth the trouble, since this isn't likely
-	 * to be a performance-critical path.
+	 * We might get here with the error callback active, or not.  Call
+	 * scanner_errposition to make sure an error cursor appears; if the
+	 * callback is active, this is duplicative but harmless.
 	 */
-	pg_verifymbstr(new, out - new, false);
-	return new;
-
 invalid_pair:
 	ereport(ERROR,
 			(errcode(ERRCODE_SYNTAX_ERROR),
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index 84c73914a85..b1ea0cb5384 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -106,6 +106,18 @@ const uint16 ScanKeywordTokens[] = {
  */
 #define ADVANCE_YYLLOC(delta)  ( *(yylloc) += (delta) )
 
+/*
+ * Sometimes, we do want yylloc to point into the middle of a token; this is
+ * useful for instance to throw an error about an escape sequence within a
+ * string literal.  But if we find no error there, we want to revert yylloc
+ * to the token start, so that that's the location reported to the parser.
+ * Use PUSH_YYLLOC/POP_YYLLOC to save/restore yylloc around such code.
+ * (Currently the implied "stack" is the single slot yyextra->save_yylloc, so
+ * pairs must not be nested; someday we might need to lift that restriction.)
+ */
+#define PUSH_YYLLOC()	(yyextra->save_yylloc = *(yylloc))
+#define POP_YYLLOC()	(*(yylloc) = yyextra->save_yylloc)
+
 #define startlit()	( yyextra->literallen = 0 )
 static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
 static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
@@ -605,8 +617,18 @@ other			.
 <xe>{xeunicode} {
 					pg_wchar	c = strtoul(yytext + 2, NULL, 16);
 
+					/*
+					 * For consistency with other productions, issue any
+					 * escape warning with cursor pointing to start of string.
+					 * We might want to change that, someday.
+					 */
 					check_escape_warning(yyscanner);
 
+					/* Remember start of overall string token ... */
+					PUSH_YYLLOC();
+					/* ... and set the error cursor to point at this esc seq */
+					SET_YYLLOC();
+
 					if (is_utf16_surrogate_first(c))
 					{
 						yyextra->utf16_first_part = c;
@@ -616,10 +638,18 @@ other			.
 						yyerror("invalid Unicode surrogate pair");
 					else
 						addunicode(c, yyscanner);
+
+					/* Restore yylloc to be start of string token */
+					POP_YYLLOC();
 				}
 <xeu>{xeunicode} {
 					pg_wchar	c = strtoul(yytext + 2, NULL, 16);
 
+					/* Remember start of overall string token ... */
+					PUSH_YYLLOC();
+					/* ... and set the error cursor to point at this esc seq */
+					SET_YYLLOC();
+
 					if (!is_utf16_surrogate_second(c))
 						yyerror("invalid Unicode surrogate pair");
 
@@ -627,12 +657,21 @@ other			.
 
 					addunicode(c, yyscanner);
 
+					/* Restore yylloc to be start of string token */
+					POP_YYLLOC();
+
 					BEGIN(xe);
 				}
-<xeu>.			{ yyerror("invalid Unicode surrogate pair"); }
-<xeu>\n			{ yyerror("invalid Unicode surrogate pair"); }
-<xeu><<EOF>>	{ yyerror("invalid Unicode surrogate pair"); }
+<xeu>. |
+<xeu>\n |
+<xeu><<EOF>>	{
+					/* No escape follows; set error cursor at missing esc seq */
+					SET_YYLLOC();
+					yyerror("invalid Unicode surrogate pair");
+				}
 <xe,xeu>{xeunicodefail}	{
+					/* Set the error cursor to point at malformed esc seq */
+					SET_YYLLOC();
 					ereport(ERROR,
 							(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
 							 errmsg("invalid Unicode escape"),
@@ -1029,12 +1068,13 @@ other			.
  * scanner_errposition
  *		Report a lexer or grammar error cursor position, if possible.
  *
- * This is expected to be used within an ereport() call.  The return value
+ * This is expected to be used within an ereport() call, or via an error
+ * callback such as setup_scanner_errposition_callback().  The return value
  * is a dummy (always 0, in fact).
  *
  * Note that this can only be used for messages emitted during raw parsing
- * (essentially, scan.l and gram.y), since it requires the yyscanner struct
- * to still be available.
+ * (essentially, scan.l, parser.c, and gram.y), since it requires the
+ * yyscanner struct to still be available.
  */
 int
 scanner_errposition(int location, core_yyscan_t yyscanner)
@@ -1051,6 +1091,62 @@ scanner_errposition(int location, core_yyscan_t yyscanner)
 }
 
 /*
+ * Error context callback for inserting scanner error location; "arg" is
+ * the ScannerCallbackState set by setup_scanner_errposition_callback().
+ * Note that this will be called for *any* error occurring while the
+ * callback is installed.  We avoid inserting an irrelevant error location
+ * if the error is a query cancel --- are there any other important cases?
+ */
+static void
+scb_error_callback(void *arg)
+{
+	ScannerCallbackState *scbstate = (ScannerCallbackState *) arg;
+
+	if (geterrcode() != ERRCODE_QUERY_CANCELED)
+		(void) scanner_errposition(scbstate->location, scbstate->yyscanner);
+}
+
+/*
+ * setup_scanner_errposition_callback
+ *		Arrange for non-scanner errors to report an error position
+ *
+ * Sometimes the scanner calls functions that aren't part of the scanner
+ * subsystem and can't reasonably be passed the yyscanner pointer; yet
+ * we would like any errors thrown in those functions to be tagged with an
+ * error location.  Use this function to set up an error context stack
+ * entry that will accomplish that.  Usage pattern:
+ *
+ *		declare a local variable "ScannerCallbackState scbstate"
+ *		...
+ *		setup_scanner_errposition_callback(&scbstate, yyscanner, location);
+ *		call function that might throw error;
+ *		cancel_scanner_errposition_callback(&scbstate);
+ */
+void
+setup_scanner_errposition_callback(ScannerCallbackState *scbstate,
+								   core_yyscan_t yyscanner,
+								   int location)
+{
+	/* Fill in *scbstate and push it; it must stay valid until canceled */
+	scbstate->yyscanner = yyscanner;
+	scbstate->location = location;
+	scbstate->errcallback.callback = scb_error_callback;
+	scbstate->errcallback.arg = (void *) scbstate;
+	scbstate->errcallback.previous = error_context_stack;
+	error_context_stack = &scbstate->errcallback;
+}
+
+/*
+ * Cancel a previously-set-up errposition callback.
+ */
+void
+cancel_scanner_errposition_callback(ScannerCallbackState *scbstate)
+{
+	/* Restore the stack top saved at setup (drops any entries pushed since) */
+	error_context_stack = scbstate->errcallback.previous;
+}
+
+/*
  * scanner_yyerror
  *		Report a lexer or grammar error.
  *
@@ -1226,19 +1322,20 @@ process_integer_literal(const char *token, YYSTYPE *lval)
 static void
 addunicode(pg_wchar c, core_yyscan_t yyscanner)
 {
-	char		buf[8];
+	ScannerCallbackState scbstate;
+	char		buf[MAX_UNICODE_EQUIVALENT_STRING + 1];
 
-	/* See also check_unicode_value() in parser.c */
-	if (c == 0 || c > 0x10FFFF)
+	if (!is_valid_unicode_codepoint(c))
 		yyerror("invalid Unicode escape value");
-	if (c > 0x7F)
-	{
-		if (GetDatabaseEncoding() != PG_UTF8)
-			yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
-		yyextra->saw_non_ascii = true;
-	}
-	unicode_to_utf8(c, (unsigned char *) buf);
-	addlit(buf, pg_mblen(buf), yyscanner);
+
+	/*
+	 * We expect pg_unicode_to_server() to complain about any unconvertible
+	 * code point, so no need to set saw_non_ascii; error cursor is *yylloc.
+	 */
+	setup_scanner_errposition_callback(&scbstate, yyscanner, *(yylloc));
+	pg_unicode_to_server(c, (unsigned char *) buf);
+	cancel_scanner_errposition_callback(&scbstate);
+	addlit(buf, strlen(buf), yyscanner);
 }
 
 static unsigned char