3 files changed, 57 insertions, 32 deletions
diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c
index 26d293709aa..40d1dff39c9 100644
--- a/src/backend/utils/adt/json.c
+++ b/src/backend/utils/adt/json.c
@@ -750,6 +750,13 @@ json_lex(JsonLexContext *lex)
 
 /*
  * The next token in the input stream is known to be a string; lex it.
+ *
+ * If lex->strval isn't NULL, fill it with the decoded string.
+ * Set lex->token_terminator to the end of the scanned input, and in
+ * success cases, transfer its previous value to lex->prev_token_terminator.
+ *
+ * Note: be careful that all error cases advance lex->token_terminator
+ * to the point after the character we detected the error on.
  */
 static inline void
 json_lex_string(JsonLexContext *lex)
@@ -837,33 +844,42 @@ json_lex_string(JsonLexContext *lex)
 					if (ch >= 0xd800 && ch <= 0xdbff)
 					{
 						if (hi_surrogate != -1)
+						{
+							lex->token_terminator = s + pg_mblen(s);
 							ereport(ERROR,
 									(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 									 errmsg("invalid input syntax for type %s",
 											"json"),
 									 errdetail("Unicode high surrogate must not follow a high surrogate."),
 									 report_json_context(lex)));
+						}
 						hi_surrogate = (ch & 0x3ff) << 10;
 						continue;
 					}
 					else if (ch >= 0xdc00 && ch <= 0xdfff)
 					{
 						if (hi_surrogate == -1)
+						{
+							lex->token_terminator = s + pg_mblen(s);
 							ereport(ERROR,
 									(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 									 errmsg("invalid input syntax for type %s", "json"),
 									 errdetail("Unicode low surrogate must follow a high surrogate."),
 									 report_json_context(lex)));
+						}
 						ch = 0x10000 + hi_surrogate + (ch & 0x3ff);
 						hi_surrogate = -1;
 					}
 
 					if (hi_surrogate != -1)
+					{
+						lex->token_terminator = s + pg_mblen(s);
 						ereport(ERROR,
 								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 								 errmsg("invalid input syntax for type %s", "json"),
 								 errdetail("Unicode low surrogate must follow a high surrogate."),
 								 report_json_context(lex)));
+					}
 
 					/*
 					 * For UTF8, replace the escape sequence by the actual
@@ -875,6 +891,7 @@ json_lex_string(JsonLexContext *lex)
 					if (ch == 0)
 					{
 						/* We can't allow this, since our TEXT type doesn't */
+						lex->token_terminator = s + pg_mblen(s);
 						ereport(ERROR,
 								(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
 								 errmsg("unsupported Unicode escape sequence"),
@@ -898,24 +915,27 @@ json_lex_string(JsonLexContext *lex)
 					}
 					else
 					{
+						lex->token_terminator = s + pg_mblen(s);
 						ereport(ERROR,
 								(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
 								 errmsg("unsupported Unicode escape sequence"),
 								 errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8."),
 								 report_json_context(lex)));
 					}
-
 				}
 			}
 			else if (lex->strval != NULL)
 			{
 				if (hi_surrogate != -1)
+				{
+					lex->token_terminator = s + pg_mblen(s);
 					ereport(ERROR,
 							(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 							 errmsg("invalid input syntax for type %s",
 									"json"),
 							 errdetail("Unicode low surrogate must follow a high surrogate."),
 							 report_json_context(lex)));
+				}
 
 				switch (*s)
 				{
@@ -968,16 +988,18 @@ json_lex_string(JsonLexContext *lex)
 								   extract_mb_char(s)),
 						 report_json_context(lex)));
 			}
-
 		}
 		else if (lex->strval != NULL)
 		{
 			if (hi_surrogate != -1)
+			{
+				lex->token_terminator = s + pg_mblen(s);
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 						 errmsg("invalid input syntax for type %s", "json"),
 						 errdetail("Unicode low surrogate must follow a high surrogate."),
 						 report_json_context(lex)));
+			}
 
 			appendStringInfoChar(lex->strval, *s);
 		}
@@ -985,11 +1007,14 @@ json_lex_string(JsonLexContext *lex)
 	}
 
 	if (hi_surrogate != -1)
+	{
+		lex->token_terminator = s + pg_mblen(s);
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 				 errmsg("invalid input syntax for type %s", "json"),
 				 errdetail("Unicode low surrogate must follow a high surrogate."),
 				 report_json_context(lex)));
+	}
 
 	/* Hooray, we found the end of the string! */
 	lex->prev_token_terminator = lex->token_terminator;
diff --git a/src/test/regress/expected/json_encoding.out b/src/test/regress/expected/json_encoding.out
index d8d34f4ff6a..3156c63c6ff 100644
--- a/src/test/regress/expected/json_encoding.out
+++ b/src/test/regress/expected/json_encoding.out
@@ -41,19 +41,19 @@ select json '{ "a":  "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8;
 select json '{ "a":  "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
 ERROR:  invalid input syntax for type json
 DETAIL:  Unicode high surrogate must not follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ud83d...
 select json '{ "a":  "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
 ERROR:  invalid input syntax for type json
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
 select json '{ "a":  "\ud83dX" }' -> 'a'; -- orphan high surrogate
 ERROR:  invalid input syntax for type json
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83dX...
 select json '{ "a":  "\ude04X" }' -> 'a'; -- orphan low surrogate
 ERROR:  invalid input syntax for type json
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
 --handling of simple unicode escapes
 select json '{ "a":  "the Copyright \u00a9 sign" }' as correct_in_utf8;
             correct_in_utf8            
@@ -106,7 +106,7 @@ select json '{ "a":  "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
 select json '{ "a":  "null \u0000 escape" }' ->> 'a' as fails;
 ERROR:  unsupported Unicode escape sequence
 DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
 select json '{ "a":  "null \\u0000 escape" }' ->> 'a' as not_an_escape;
    not_an_escape    
 --------------------
@@ -144,7 +144,7 @@ ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT '"\u0000"'::jsonb;
                ^
 DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: ...
+CONTEXT:  JSON data, line 1: "\u0000...
 -- use octet_length here so we don't get an odd unicode char in the
 -- output
 SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK
@@ -165,25 +165,25 @@ ERROR:  invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a":  "\ud83d\ud83d" }' -> 'a';
                      ^
 DETAIL:  Unicode high surrogate must not follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ud83d...
 SELECT jsonb '{ "a":  "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
 ERROR:  invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a":  "\ude04\ud83d" }' -> 'a';
                      ^
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
 SELECT jsonb '{ "a":  "\ud83dX" }' -> 'a'; -- orphan high surrogate
 ERROR:  invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a":  "\ud83dX" }' -> 'a';
                      ^
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83dX...
 SELECT jsonb '{ "a":  "\ude04X" }' -> 'a'; -- orphan low surrogate
 ERROR:  invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a":  "\ude04X" }' -> 'a';
                      ^
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
 -- handling of simple unicode escapes
 SELECT jsonb '{ "a":  "the Copyright \u00a9 sign" }' as correct_in_utf8;
         correct_in_utf8        
@@ -208,7 +208,7 @@ ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT jsonb '{ "a":  "null \u0000 escape" }' as fails;
                      ^
 DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
 SELECT jsonb '{ "a":  "null \\u0000 escape" }' as not_an_escape;
         not_an_escape         
 ------------------------------
@@ -238,7 +238,7 @@ ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT jsonb '{ "a":  "null \u0000 escape" }' ->> 'a' as fai...
                      ^
 DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
 SELECT jsonb '{ "a":  "null \\u0000 escape" }' ->> 'a' as not_an_escape;
    not_an_escape    
 --------------------
diff --git a/src/test/regress/expected/json_encoding_1.out b/src/test/regress/expected/json_encoding_1.out
index 79ed78e1c5f..d320bfa8118 100644
--- a/src/test/regress/expected/json_encoding_1.out
+++ b/src/test/regress/expected/json_encoding_1.out
@@ -35,23 +35,23 @@ SELECT '"\uaBcD"'::json;		-- OK, uppercase and lower case both OK
 select json '{ "a":  "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct_in_utf8;
 ERROR:  unsupported Unicode escape sequence
 DETAIL:  Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ude04...
 select json '{ "a":  "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
 ERROR:  invalid input syntax for type json
 DETAIL:  Unicode high surrogate must not follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ud83d...
 select json '{ "a":  "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
 ERROR:  invalid input syntax for type json
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
 select json '{ "a":  "\ud83dX" }' -> 'a'; -- orphan high surrogate
 ERROR:  invalid input syntax for type json
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83dX...
 select json '{ "a":  "\ude04X" }' -> 'a'; -- orphan low surrogate
 ERROR:  invalid input syntax for type json
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
 --handling of simple unicode escapes
 select json '{ "a":  "the Copyright \u00a9 sign" }' as correct_in_utf8;
             correct_in_utf8            
@@ -86,7 +86,7 @@ select json '{ "a":  "null \\u0000 escape" }' as not_an_escape;
 select json '{ "a":  "the Copyright \u00a9 sign" }' ->> 'a' as correct_in_utf8;
 ERROR:  unsupported Unicode escape sequence
 DETAIL:  Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "the Copyright \u00a9...
 select json '{ "a":  "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
  correct_everywhere 
 --------------------
@@ -102,7 +102,7 @@ select json '{ "a":  "dollar \\u0024 character" }' ->> 'a' as not_an_escape;
 select json '{ "a":  "null \u0000 escape" }' ->> 'a' as fails;
 ERROR:  unsupported Unicode escape sequence
 DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
 select json '{ "a":  "null \\u0000 escape" }' ->> 'a' as not_an_escape;
    not_an_escape    
 --------------------
@@ -140,7 +140,7 @@ ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT '"\u0000"'::jsonb;
                ^
 DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: ...
+CONTEXT:  JSON data, line 1: "\u0000...
 -- use octet_length here so we don't get an odd unicode char in the
 -- output
 SELECT octet_length('"\uaBcD"'::jsonb::text); -- OK, uppercase and lower case both OK
@@ -148,45 +148,45 @@ ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT octet_length('"\uaBcD"'::jsonb::text);
                             ^
 DETAIL:  Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
-CONTEXT:  JSON data, line 1: ...
+CONTEXT:  JSON data, line 1: "\uaBcD...
 -- handling of unicode surrogate pairs
 SELECT octet_length((jsonb '{ "a":  "\ud83d\ude04\ud83d\udc36" }' -> 'a')::text) AS correct_in_utf8;
 ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT octet_length((jsonb '{ "a":  "\ud83d\ude04\ud83d\udc3...
                                    ^
 DETAIL:  Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ude04...
 SELECT jsonb '{ "a":  "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
 ERROR:  invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a":  "\ud83d\ud83d" }' -> 'a';
                      ^
 DETAIL:  Unicode high surrogate must not follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83d\ud83d...
 SELECT jsonb '{ "a":  "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
 ERROR:  invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a":  "\ude04\ud83d" }' -> 'a';
                      ^
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
 SELECT jsonb '{ "a":  "\ud83dX" }' -> 'a'; -- orphan high surrogate
 ERROR:  invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a":  "\ud83dX" }' -> 'a';
                      ^
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ud83dX...
 SELECT jsonb '{ "a":  "\ude04X" }' -> 'a'; -- orphan low surrogate
 ERROR:  invalid input syntax for type json
 LINE 1: SELECT jsonb '{ "a":  "\ude04X" }' -> 'a';
                      ^
 DETAIL:  Unicode low surrogate must follow a high surrogate.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "\ude04...
 -- handling of simple unicode escapes
 SELECT jsonb '{ "a":  "the Copyright \u00a9 sign" }' as correct_in_utf8;
 ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT jsonb '{ "a":  "the Copyright \u00a9 sign" }' as corr...
                      ^
 DETAIL:  Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "the Copyright \u00a9...
 SELECT jsonb '{ "a":  "dollar \u0024 character" }' as correct_everywhere;
      correct_everywhere      
 -----------------------------
@@ -204,7 +204,7 @@ ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT jsonb '{ "a":  "null \u0000 escape" }' as fails;
                      ^
 DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
 SELECT jsonb '{ "a":  "null \\u0000 escape" }' as not_an_escape;
         not_an_escape         
 ------------------------------
@@ -216,7 +216,7 @@ ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT jsonb '{ "a":  "the Copyright \u00a9 sign" }' ->> 'a'...
                      ^
 DETAIL:  Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "the Copyright \u00a9...
 SELECT jsonb '{ "a":  "dollar \u0024 character" }' ->> 'a' as correct_everywhere;
  correct_everywhere 
 --------------------
@@ -234,7 +234,7 @@ ERROR:  unsupported Unicode escape sequence
 LINE 1: SELECT jsonb '{ "a":  "null \u0000 escape" }' ->> 'a' as fai...
                      ^
 DETAIL:  \u0000 cannot be converted to text.
-CONTEXT:  JSON data, line 1: { "a":...
+CONTEXT:  JSON data, line 1: { "a":  "null \u0000...
 SELECT jsonb '{ "a":  "null \\u0000 escape" }' ->> 'a' as not_an_escape;
    not_an_escape    
 --------------------