Handle Unicode surrogate pairs correctly when processing JSON.

In 9.2, Unicode escape sequences are not analysed at all, other than to make sure that they are in the form \uXXXX. But in 9.3, many of the new operators and functions try to turn JSON text values into text in the server encoding, and this includes de-escaping Unicode escape sequences. This processing had not taken into account the possibility that the text might contain a surrogate pair designating a character outside the BMP (Basic Multilingual Plane). That is now handled correctly. This also enforces correct use of surrogate pairs, something that is not done by the type's input routines. This fact is noted in the docs.
author: Andrew Dunstan <andrew@dunslane.net> 2013-06-08 09:12:48 -0400
committer: Andrew Dunstan <andrew@dunslane.net> 2013-06-08 09:12:48 -0400
commit: 94e3311b97448324d67ba9a527854271373329d9 (patch)
tree: f54ab210d201b70735affadcd018c00c8db737c4 /src
parent: c99d5d1bcc137c15058458bbdcdd2789b56e4c66 (diff)
download: postgresql-94e3311b97448324d67ba9a527854271373329d9.tar.gz postgresql-94e3311b97448324d67ba9a527854271373329d9.zip
3 files changed, 83 insertions, 0 deletions
diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c
index aaf99bddf27..d8046c5b54d 100644
--- a/src/backend/utils/adt/json.c
+++ b/src/backend/utils/adt/json.c
@@ -646,6 +646,7 @@ json_lex_string(JsonLexContext *lex)
 {
 	char	   *s;
 	int			len;
+	int			hi_surrogate = -1;
 
 	if (lex->strval != NULL)
 		resetStringInfo(lex->strval);
@@ -718,6 +719,36 @@ json_lex_string(JsonLexContext *lex)
 					int			utf8len;
 					char	   *converted;
 
+					if (ch >= 0xd800 && ch <= 0xdbff)
+					{
+						if (hi_surrogate != -1)
+							ereport(ERROR,
+							   (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+								errmsg("invalid input syntax for type json"),
+								errdetail("high order surrogate must not follow a high order surrogate."),
+								report_json_context(lex)));
+						hi_surrogate = (ch & 0x3ff) << 10;
+						continue;
+					}
+					else if (ch >= 0xdc00 && ch <= 0xdfff)
+					{
+						if (hi_surrogate == -1)
+							ereport(ERROR,
+							   (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+								errmsg("invalid input syntax for type json"),
+								errdetail("low order surrogate must follow a high order surrogate."),
+								report_json_context(lex)));
+						ch = 0x10000 + hi_surrogate + (ch & 0x3ff);
+						hi_surrogate = -1;
+					}
+
+					if (hi_surrogate != -1)
+						ereport(ERROR,
+								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+								 errmsg("invalid input syntax for type json"),
+								 errdetail("low order surrogate must follow a high order surrogate."),
+								 report_json_context(lex)));
+
 					unicode_to_utf8(ch, (unsigned char *) utf8str);
 					utf8len = pg_utf_mblen((unsigned char *) utf8str);
 					utf8str[utf8len] = '\0';
@@ -730,6 +761,13 @@ json_lex_string(JsonLexContext *lex)
 			}
 			else if (lex->strval != NULL)
 			{
+				if (hi_surrogate != -1)
+					ereport(ERROR,
+							(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+							 errmsg("invalid input syntax for type json"),
+							 errdetail("low order surrogate must follow a high order surrogate."),
+							 report_json_context(lex)));
+
 				switch (*s)
 				{
 					case '"':
@@ -784,11 +822,25 @@ json_lex_string(JsonLexContext *lex)
 		}
 		else if (lex->strval != NULL)
 		{
+			if (hi_surrogate != -1)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+						 errmsg("invalid input syntax for type json"),
+						 errdetail("low order surrogate must follow a high order surrogate."),
+						 report_json_context(lex)));
+
 			appendStringInfoChar(lex->strval, *s);
 		}
 
 	}
 
+	if (hi_surrogate != -1)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("invalid input syntax for type json"),
+		errdetail("low order surrogate must follow a high order surrogate."),
+				 report_json_context(lex)));
+
 	/* Hooray, we found the end of the string! */
 	lex->prev_token_terminator = lex->token_terminator;
 	lex->token_terminator = s + 1;
diff --git a/src/test/regress/expected/json.out b/src/test/regress/expected/json.out
index 1d7cf5ff2f3..293c7429627 100644
--- a/src/test/regress/expected/json.out
+++ b/src/test/regress/expected/json.out
@@ -920,3 +920,26 @@ select * from json_populate_recordset(row('def',99,null)::jpop,'[{"a":[100,200,3
 ERROR:  cannot call json_populate_recordset on a nested object
 select * from json_populate_recordset(row('def',99,null)::jpop,'[{"c":[100,200,300],"x":43.2},{"a":{"z":true},"b":3,"c":"2012-01-20 10:42:53"}]') q;
 ERROR:  cannot call json_populate_recordset on a nested object
+-- handling of unicode surrogate pairs
+select json '{ "a":  "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct;
+          correct           
+----------------------------
+ "\ud83d\ude04\ud83d\udc36"
+(1 row)
+
+select json '{ "a":  "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
+ERROR:  invalid input syntax for type json
+DETAIL:  high order surrogate must not follow a high order surrogate.
+CONTEXT:  JSON data, line 1: { "a":...
+select json '{ "a":  "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
+ERROR:  invalid input syntax for type json
+DETAIL:  low order surrogate must follow a high order surrogate.
+CONTEXT:  JSON data, line 1: { "a":...
+select json '{ "a":  "\ud83dX" }' -> 'a'; -- orphan high surrogate
+ERROR:  invalid input syntax for type json
+DETAIL:  low order surrogate must follow a high order surrogate.
+CONTEXT:  JSON data, line 1: { "a":...
+select json '{ "a":  "\ude04X" }' -> 'a'; -- orphan low surrogate
+ERROR:  invalid input syntax for type json
+DETAIL:  low order surrogate must follow a high order surrogate.
+CONTEXT:  JSON data, line 1: { "a":...
diff --git a/src/test/regress/sql/json.sql b/src/test/regress/sql/json.sql
index 8a136d7a273..5b6bc36517e 100644
--- a/src/test/regress/sql/json.sql
+++ b/src/test/regress/sql/json.sql
@@ -296,3 +296,11 @@ select * from json_populate_recordset(null::jpop,'[{"a":"blurfl","x":43.2},{"b":
 select * from json_populate_recordset(row('def',99,null)::jpop,'[{"a":"blurfl","x":43.2},{"b":3,"c":"2012-01-20 10:42:53"}]') q;
 select * from json_populate_recordset(row('def',99,null)::jpop,'[{"a":[100,200,300],"x":43.2},{"a":{"z":true},"b":3,"c":"2012-01-20 10:42:53"}]') q;
 select * from json_populate_recordset(row('def',99,null)::jpop,'[{"c":[100,200,300],"x":43.2},{"a":{"z":true},"b":3,"c":"2012-01-20 10:42:53"}]') q;
+
+-- handling of unicode surrogate pairs
+
+select json '{ "a":  "\ud83d\ude04\ud83d\udc36" }' -> 'a' as correct;
+select json '{ "a":  "\ud83d\ud83d" }' -> 'a'; -- 2 high surrogates in a row
+select json '{ "a":  "\ude04\ud83d" }' -> 'a'; -- surrogates in wrong order
+select json '{ "a":  "\ud83dX" }' -> 'a'; -- orphan high surrogate
+select json '{ "a":  "\ude04X" }' -> 'a'; -- orphan low surrogate
author	Andrew Dunstan <andrew@dunslane.net>	2013-06-08 09:12:48 -0400
committer	Andrew Dunstan <andrew@dunslane.net>	2013-06-08 09:12:48 -0400
commit	94e3311b97448324d67ba9a527854271373329d9 (patch)
tree	f54ab210d201b70735affadcd018c00c8db737c4 /src
parent	c99d5d1bcc137c15058458bbdcdd2789b56e4c66 (diff)
download	postgresql-94e3311b97448324d67ba9a527854271373329d9.tar.gz postgresql-94e3311b97448324d67ba9a527854271373329d9.zip