Fix JSON error reporting for many cases of erroneous string values.

The majority of error exit cases in json_lex_string() failed to set lex->token_terminator, causing problems for the error context reporting code: it would see token_terminator less than token_start and behave unpredictably. In v14 and up the end result could be as bad as a crash in report_json_context(). Older versions accidentally avoided that fate; but all versions produce error context lines that are far less useful than intended, because the context would stop at the end of the prior token instead of continuing to where the actually-bad input is. To fix, invent some macros that make it less notationally painful to do the right thing. Also add documentation about what the function is actually required to do; and in >= v14, add an assertion in report_json_context() about token_terminator being sufficiently far advanced. Per report from Nikolay Shaplov. Back-patch to all supported versions. Discussion: https://postgr.es/m/7332649.x5DLKWyVIX@thinkpad-pgpro
author: Tom Lane <tgl@sss.pgh.pa.us> 2023-03-13 15:19:00 -0400
committer: Tom Lane <tgl@sss.pgh.pa.us> 2023-03-13 15:19:00 -0400
commit: c25a929a6c8869a148b3ee064eb03ab1d3cb127d (patch)
tree: 003d2325f6b916252612506418247e71963dd8be /src/backend/utils/adt/json.c
parent: 62a91a1b092606e55d8a9807d249ceda58feebb0 (diff)
download: postgresql-c25a929a6c8869a148b3ee064eb03ab1d3cb127d.tar.gz
postgresql-c25a929a6c8869a148b3ee064eb03ab1d3cb127d.zip
1 file changed, 27 insertions, 2 deletions
diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c
index 26d293709aa..40d1dff39c9 100644
--- a/src/backend/utils/adt/json.c
+++ b/src/backend/utils/adt/json.c
@@ -750,6 +750,13 @@ json_lex(JsonLexContext *lex)
 
 /*
  * The next token in the input stream is known to be a string; lex it.
+ *
+ * If lex->strval isn't NULL, fill it with the decoded string.
+ * Set lex->token_terminator to the end of the decoded input, and in
+ * success cases, transfer its previous value to lex->prev_token_terminator.
+ *
+ * Note: be careful that all error cases advance lex->token_terminator
+ * to the point after the character we detected the error on.
  */
 static inline void
 json_lex_string(JsonLexContext *lex)
@@ -837,33 +844,42 @@ json_lex_string(JsonLexContext *lex)
 					if (ch >= 0xd800 && ch <= 0xdbff)
 					{
 						if (hi_surrogate != -1)
+						{
+							lex->token_terminator = s + pg_mblen(s);
 							ereport(ERROR,
 									(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 									 errmsg("invalid input syntax for type %s",
 											"json"),
 									 errdetail("Unicode high surrogate must not follow a high surrogate."),
 									 report_json_context(lex)));
+						}
 						hi_surrogate = (ch & 0x3ff) << 10;
 						continue;
 					}
 					else if (ch >= 0xdc00 && ch <= 0xdfff)
 					{
 						if (hi_surrogate == -1)
+						{
+							lex->token_terminator = s + pg_mblen(s);
 							ereport(ERROR,
 									(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 									 errmsg("invalid input syntax for type %s", "json"),
 									 errdetail("Unicode low surrogate must follow a high surrogate."),
 									 report_json_context(lex)));
+						}
 						ch = 0x10000 + hi_surrogate + (ch & 0x3ff);
 						hi_surrogate = -1;
 					}
 
 					if (hi_surrogate != -1)
+					{
+						lex->token_terminator = s + pg_mblen(s);
 						ereport(ERROR,
 								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 								 errmsg("invalid input syntax for type %s", "json"),
 								 errdetail("Unicode low surrogate must follow a high surrogate."),
 								 report_json_context(lex)));
+					}
 
 					/*
 					 * For UTF8, replace the escape sequence by the actual
@@ -875,6 +891,7 @@ json_lex_string(JsonLexContext *lex)
 					if (ch == 0)
 					{
 						/* We can't allow this, since our TEXT type doesn't */
+						lex->token_terminator = s + pg_mblen(s);
 						ereport(ERROR,
 								(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
 								 errmsg("unsupported Unicode escape sequence"),
@@ -898,24 +915,27 @@ json_lex_string(JsonLexContext *lex)
 					}
 					else
 					{
+						lex->token_terminator = s + pg_mblen(s);
 						ereport(ERROR,
 								(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
 								 errmsg("unsupported Unicode escape sequence"),
 								 errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8."),
 								 report_json_context(lex)));
 					}
-
 				}
 			}
 			else if (lex->strval != NULL)
 			{
 				if (hi_surrogate != -1)
+				{
+					lex->token_terminator = s + pg_mblen(s);
 					ereport(ERROR,
 							(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 							 errmsg("invalid input syntax for type %s",
 									"json"),
 							 errdetail("Unicode low surrogate must follow a high surrogate."),
 							 report_json_context(lex)));
+				}
 
 				switch (*s)
 				{
@@ -968,16 +988,18 @@ json_lex_string(JsonLexContext *lex)
 								   extract_mb_char(s)),
 						 report_json_context(lex)));
 			}
-
 		}
 		else if (lex->strval != NULL)
 		{
 			if (hi_surrogate != -1)
+			{
+				lex->token_terminator = s + pg_mblen(s);
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 						 errmsg("invalid input syntax for type %s", "json"),
 						 errdetail("Unicode low surrogate must follow a high surrogate."),
 						 report_json_context(lex)));
+			}
 
 			appendStringInfoChar(lex->strval, *s);
 		}
@@ -985,11 +1007,14 @@ json_lex_string(JsonLexContext *lex)
 	}
 
 	if (hi_surrogate != -1)
+	{
+		lex->token_terminator = s + pg_mblen(s);
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 				 errmsg("invalid input syntax for type %s", "json"),
 				 errdetail("Unicode low surrogate must follow a high surrogate."),
 				 report_json_context(lex)));
+	}
 
 	/* Hooray, we found the end of the string! */
 	lex->prev_token_terminator = lex->token_terminator;
author	Tom Lane <tgl@sss.pgh.pa.us>	2023-03-13 15:19:00 -0400
committer	Tom Lane <tgl@sss.pgh.pa.us>	2023-03-13 15:19:00 -0400
commit	c25a929a6c8869a148b3ee064eb03ab1d3cb127d (patch)
tree	003d2325f6b916252612506418247e71963dd8be /src/backend/utils/adt/json.c
parent	62a91a1b092606e55d8a9807d249ceda58feebb0 (diff)
download	postgresql-c25a929a6c8869a148b3ee064eb03ab1d3cb127d.tar.gz postgresql-c25a929a6c8869a148b3ee064eb03ab1d3cb127d.zip