With GB18030, prevent SIGSEGV from reading past end of allocation.

With GB18030 as source encoding, applications could crash the server via SQL functions convert() or convert_from(). Applications themselves could crash after passing unterminated GB18030 input to libpq functions PQescapeLiteral(), PQescapeIdentifier(), PQescapeStringConn(), or PQescapeString(). Extension code could crash by passing unterminated GB18030 input to jsonapi.h functions. All those functions have been intended to handle untrusted, unterminated input safely. A crash required allocating the input such that the last byte of the allocation was the last byte of a virtual memory page. Some malloc() implementations take measures against that, making the SIGSEGV hard to reach. Back-patch to v13 (all supported versions). Author: Noah Misch <noah@leadboat.com> Author: Andres Freund <andres@anarazel.de> Reviewed-by: Masahiko Sawada <sawada.mshk@gmail.com> Backpatch-through: 13 Security: CVE-2025-4207
author: Noah Misch <noah@leadboat.com> 2025-05-05 04:52:04 -0700
committer: Noah Misch <noah@leadboat.com> 2025-05-05 04:52:08 -0700
commit: 3f2ab73934ab1e27151ecd14fd7d8ef602555093 (patch)
tree: 104f0bf916d41f0a07d46bd1c75a230399b7c0b2
parent: 258cde839d902ad9dd41cbd7ab58ebf4ba086874 (diff)
download: postgresql-3f2ab73934ab1e27151ecd14fd7d8ef602555093.tar.gz
postgresql-3f2ab73934ab1e27151ecd14fd7d8ef602555093.zip
9 files changed, 188 insertions, 30 deletions
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index a13c398f4ac..1abe90d4aa5 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -1030,7 +1030,7 @@ pg_mbcliplen(const char *mbstr, int len, int limit)
 }
 
 /*
- * pg_mbcliplen with specified encoding
+ * pg_mbcliplen with specified encoding; string must be valid in encoding
  */
 int
 pg_encoding_mbcliplen(int encoding, const char *mbstr,
@@ -1641,12 +1641,12 @@ check_encoding_conversion_args(int src_encoding,
  * report_invalid_encoding: complain about invalid multibyte character
  *
  * note: len is remaining length of string, not length of character;
- * len must be greater than zero, as we always examine the first byte.
+ * len must be greater than zero (or we'd neglect initializing "buf").
  */
 void
 report_invalid_encoding(int encoding, const char *mbstr, int len)
 {
-	int			l = pg_encoding_mblen(encoding, mbstr);
+	int			l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len);
 	char		buf[8 * 5 + 1];
 	char	   *p = buf;
 	int			j,
@@ -1673,18 +1673,26 @@ report_invalid_encoding(int encoding, const char *mbstr, int len)
  * report_untranslatable_char: complain about untranslatable character
  *
  * note: len is remaining length of string, not length of character;
- * len must be greater than zero, as we always examine the first byte.
+ * len must be greater than zero (or we'd neglect initializing "buf").
  */
 void
 report_untranslatable_char(int src_encoding, int dest_encoding,
 						   const char *mbstr, int len)
 {
-	int			l = pg_encoding_mblen(src_encoding, mbstr);
+	int			l;
 	char		buf[8 * 5 + 1];
 	char	   *p = buf;
 	int			j,
 				jlimit;
 
+	/*
+	 * We probably could use plain pg_encoding_mblen(), because
+	 * gb18030_to_utf8() verifies before it converts.  All conversions should.
+	 * For src_encoding!=GB18030, len>0 meets pg_encoding_mblen() needs.  Even
+	 * so, be defensive, since a buggy conversion might pass invalid data.
+	 * This is not a performance-critical path.
+	 */
+	l = pg_encoding_mblen_or_incomplete(src_encoding, mbstr, len);
 	jlimit = Min(l, len);
 	jlimit = Min(jlimit, 8);	/* prevent buffer overrun */
 
diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c
index 8da2bdb271b..573375618d7 100644
--- a/src/common/jsonapi.c
+++ b/src/common/jsonapi.c
@@ -700,8 +700,11 @@ json_lex_string(JsonLexContext *lex)
 	} while (0)
 #define FAIL_AT_CHAR_END(code) \
 	do { \
-		char	   *term = s + pg_encoding_mblen(lex->input_encoding, s); \
-		lex->token_terminator = (term <= end) ? term : end; \
+		ptrdiff_t	remaining = end - s; \
+		int			charlen; \
+		charlen = pg_encoding_mblen_or_incomplete(lex->input_encoding, \
+												  s, remaining); \
+		lex->token_terminator = (charlen <= remaining) ? s + charlen : end; \
 		return code; \
 	} while (0)
 
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 35885fb6de7..71f564abfad 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -12,6 +12,8 @@
  */
 #include "c.h"
 
+#include <limits.h>
+
 #include "mb/pg_wchar.h"
 
 
@@ -1959,10 +1961,27 @@ const pg_wchar_tbl pg_wchar_table[] = {
 /*
  * Returns the byte length of a multibyte character.
  *
- * Caution: when dealing with text that is not certainly valid in the
- * specified encoding, the result may exceed the actual remaining
- * string length.  Callers that are not prepared to deal with that
- * should use pg_encoding_mblen_bounded() instead.
+ * Choose "mblen" functions based on the input string characteristics.
+ * pg_encoding_mblen() can be used when ANY of these conditions are met:
+ *
+ * - The input string is zero-terminated
+ *
+ * - The input string is known to be valid in the encoding (e.g., string
+ *   converted from database encoding)
+ *
+ * - The encoding is not GB18030 (e.g., when only database encodings are
+ *   passed to 'encoding' parameter)
+ *
+ * encoding==GB18030 requires examining up to two bytes to determine character
+ * length.  Therefore, callers satisfying none of those conditions must use
+ * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
+ * guaranteed to be within allocation bounds.
+ *
+ * When dealing with text that is not certainly valid in the specified
+ * encoding, the result may exceed the actual remaining string length.
+ * Callers that are not prepared to deal with that should use Min(remaining,
+ * pg_encoding_mblen_or_incomplete()).  For zero-terminated strings, that and
+ * pg_encoding_mblen_bounded() are interchangeable.
  */
 int
 pg_encoding_mblen(int encoding, const char *mbstr)
@@ -1973,8 +1992,28 @@ pg_encoding_mblen(int encoding, const char *mbstr)
 }
 
 /*
- * Returns the byte length of a multibyte character; but not more than
- * the distance to end of string.
+ * Returns the byte length of a multibyte character (possibly not
+ * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
+ */
+int
+pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
+								size_t remaining)
+{
+	/*
+	 * Define zero remaining as too few, even for single-byte encodings.
+	 * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
+	 * zero; others read one.
+	 */
+	if (remaining < 1 ||
+		(encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))
+		return INT_MAX;
+	return pg_encoding_mblen(encoding, mbstr);
+}
+
+/*
+ * Returns the byte length of a multibyte character; but not more than the
+ * distance to the terminating zero byte.  For input that might lack a
+ * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
  */
 int
 pg_encoding_mblen_bounded(int encoding, const char *mbstr)
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index abd65eb9f7d..a44ea1acbc0 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -575,6 +575,8 @@ extern int	pg_valid_server_encoding_id(int encoding);
  */
 extern void pg_encoding_set_invalid(int encoding, char *dst);
 extern int	pg_encoding_mblen(int encoding, const char *mbstr);
+extern int	pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
+											size_t remaining);
 extern int	pg_encoding_mblen_bounded(int encoding, const char *mbstr);
 extern int	pg_encoding_dsplen(int encoding, const char *mbstr);
 extern int	pg_encoding_verifymbchar(int encoding, const char *mbstr, int len);
diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c
index 526e51abf63..4abb05516f3 100644
--- a/src/interfaces/libpq/fe-exec.c
+++ b/src/interfaces/libpq/fe-exec.c
@@ -3980,7 +3980,8 @@ PQescapeStringInternal(PGconn *conn,
 		}
 
 		/* Slow path for possible multibyte characters */
-		charlen = pg_encoding_mblen(encoding, source);
+		charlen = pg_encoding_mblen_or_incomplete(encoding,
+												  source, remaining);
 
 		if (remaining < charlen ||
 			pg_encoding_verifymbchar(encoding, source, charlen) == -1)
@@ -4124,7 +4125,8 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
 			int			charlen;
 
 			/* Slow path for possible multibyte characters */
-			charlen = pg_encoding_mblen(conn->client_encoding, s);
+			charlen = pg_encoding_mblen_or_incomplete(conn->client_encoding,
+													  s, remaining);
 
 			if (charlen > remaining)
 			{
diff --git a/src/interfaces/libpq/fe-misc.c b/src/interfaces/libpq/fe-misc.c
index 55021fa5bf3..f92d7c0c7d1 100644
--- a/src/interfaces/libpq/fe-misc.c
+++ b/src/interfaces/libpq/fe-misc.c
@@ -1180,13 +1180,9 @@ pqSocketPoll(int sock, int forRead, int forWrite, time_t end_time)
  */
 
 /*
- * Returns the byte length of the character beginning at s, using the
- * specified encoding.
- *
- * Caution: when dealing with text that is not certainly valid in the
- * specified encoding, the result may exceed the actual remaining
- * string length.  Callers that are not prepared to deal with that
- * should use PQmblenBounded() instead.
+ * Like pg_encoding_mblen().  Use this in callers that want the
+ * dynamically-linked libpq's stance on encodings, even if that means
+ * different behavior in different startups of the executable.
  */
 int
 PQmblen(const char *s, int encoding)
@@ -1195,8 +1191,9 @@ PQmblen(const char *s, int encoding)
 }
 
 /*
- * Returns the byte length of the character beginning at s, using the
- * specified encoding; but not more than the distance to end of string.
+ * Like pg_encoding_mblen_bounded().  Use this in callers that want the
+ * dynamically-linked libpq's stance on encodings, even if that means
+ * different behavior in different startups of the executable.
  */
 int
 PQmblenBounded(const char *s, int encoding)
diff --git a/src/test/modules/test_escape/test_escape.c b/src/test/modules/test_escape/test_escape.c
index 454eb557c55..dddb6d6d623 100644
--- a/src/test/modules/test_escape/test_escape.c
+++ b/src/test/modules/test_escape/test_escape.c
@@ -12,6 +12,7 @@
 #include <string.h>
 #include <stdio.h>
 
+#include "common/jsonapi.h"
 #include "fe_utils/psqlscan.h"
 #include "fe_utils/string_utils.h"
 #include "getopt_long.h"
@@ -164,6 +165,91 @@ encoding_conflicts_ascii(int encoding)
 }
 
 
+/*
+ * Confirm escaping doesn't read past the end of an allocation.  Consider the
+ * result of malloc(4096), in the absence of freelist entries satisfying the
+ * allocation.  On OpenBSD, reading one byte past the end of that object
+ * yields SIGSEGV.
+ *
+ * Run this test before the program's other tests, so freelists are minimal.
+ * len=4096 didn't SIGSEGV, likely due to free() calls in libpq.  len=8192
+ * did.  Use 128 KiB, to somewhat insulate the outcome from distant new free()
+ * calls and libc changes.
+ */
+static void
+test_gb18030_page_multiple(pe_test_config *tc)
+{
+	PQExpBuffer testname;
+	size_t		input_len = 0x20000;
+	char	   *input;
+
+	/* prepare input */
+	input = pg_malloc(input_len);
+	memset(input, '-', input_len - 1);
+	input[input_len - 1] = 0xfe;
+
+	/* name to describe the test */
+	testname = createPQExpBuffer();
+	appendPQExpBuffer(testname, ">repeat(%c, %zu)", input[0], input_len - 1);
+	escapify(testname, input + input_len - 1, 1);
+	appendPQExpBuffer(testname, "< - GB18030 - PQescapeLiteral");
+
+	/* test itself */
+	PQsetClientEncoding(tc->conn, "GB18030");
+	report_result(tc, PQescapeLiteral(tc->conn, input, input_len) == NULL,
+				  testname->data, "",
+				  "input validity vs escape success", "ok");
+
+	destroyPQExpBuffer(testname);
+	pg_free(input);
+}
+
+/*
+ * Confirm json parsing doesn't read past the end of an allocation.  This
+ * exercises wchar.c infrastructure like the true "escape" tests do, but this
+ * isn't an "escape" test.
+ */
+static void
+test_gb18030_json(pe_test_config *tc)
+{
+	PQExpBuffer raw_buf;
+	PQExpBuffer testname;
+	const char	input[] = "{\"\\u\xFE";
+	size_t		input_len = sizeof(input) - 1;
+	JsonLexContext *lex;
+	JsonSemAction sem = {0};	/* no callbacks */
+	JsonParseErrorType json_error;
+	char	   *error_str;
+
+	/* prepare input like test_one_vector_escape() does */
+	raw_buf = createPQExpBuffer();
+	appendBinaryPQExpBuffer(raw_buf, input, input_len);
+	appendPQExpBufferStr(raw_buf, NEVER_ACCESS_STR);
+	VALGRIND_MAKE_MEM_NOACCESS(&raw_buf->data[input_len],
+							   raw_buf->len - input_len);
+
+	/* name to describe the test */
+	testname = createPQExpBuffer();
+	appendPQExpBuffer(testname, ">");
+	escapify(testname, input, input_len);
+	appendPQExpBuffer(testname, "< - GB18030 - pg_parse_json");
+
+	/* test itself */
+	lex = makeJsonLexContextCstringLen(raw_buf->data, input_len,
+									   PG_GB18030, false);
+	json_error = pg_parse_json(lex, &sem);
+	error_str = psprintf("JsonParseErrorType %d", json_error);
+	report_result(tc, json_error == JSON_UNICODE_ESCAPE_FORMAT,
+				  testname->data, "",
+				  "diagnosed", error_str);
+
+	pfree(error_str);
+	pfree(lex);
+	destroyPQExpBuffer(testname);
+	destroyPQExpBuffer(raw_buf);
+}
+
+
 static bool
 escape_literal(PGconn *conn, PQExpBuffer target,
 			   const char *unescaped, size_t unescaped_len,
@@ -454,8 +540,18 @@ static pe_test_vector pe_test_vectors[] =
 	 * Testcases that are not null terminated for the specified input length.
 	 * That's interesting to verify that escape functions don't read beyond
 	 * the intended input length.
+	 *
+	 * One interesting special case is GB18030, which has the odd behaviour
+	 * needing to read beyond the first byte to determine the length of a
+	 * multi-byte character.
 	 */
 	TV_LEN("gbk", "\x80", 1),
+	TV_LEN("GB18030", "\x80", 1),
+	TV_LEN("GB18030", "\x80\0", 2),
+	TV_LEN("GB18030", "\x80\x30", 2),
+	TV_LEN("GB18030", "\x80\x30\0", 3),
+	TV_LEN("GB18030", "\x80\x30\x30", 3),
+	TV_LEN("GB18030", "\x80\x30\x30\0", 4),
 	TV_LEN("UTF-8", "\xC3\xb6  ", 1),
 	TV_LEN("UTF-8", "\xC3\xb6  ", 2),
 };
@@ -864,6 +960,9 @@ main(int argc, char *argv[])
 		exit(1);
 	}
 
+	test_gb18030_page_multiple(&tc);
+	test_gb18030_json(&tc);
+
 	for (int i = 0; i < lengthof(pe_test_vectors); i++)
 	{
 		test_one_vector(&tc, &pe_test_vectors[i]);
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 772814732af..f21eeaf9088 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -329,10 +329,13 @@ insert into gb18030_inputs  values
   ('\x666f6f84309c38',	'valid, translates to UTF-8 by mapping function'),
   ('\x666f6f84309c',	'incomplete char '),
   ('\x666f6f84309c0a',	'incomplete char, followed by newline '),
+  ('\x666f6f84',		'incomplete char at end'),
   ('\x666f6f84309c3800', 'invalid, NUL byte'),
   ('\x666f6f84309c0038', 'invalid, NUL byte');
--- Test GB18030 verification
-select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from gb18030_inputs;
+-- Test GB18030 verification.  Round-trip through text so the backing of the
+-- bytea values is palloc, not shared_buffers.  This lets Valgrind detect
+-- reads past the end.
+select description, inbytes, (test_conv(inbytes::text::bytea, 'gb18030', 'gb18030')).* from gb18030_inputs;
                   description                   |      inbytes       |      result      |   errorat    |                               error                               
 ------------------------------------------------+--------------------+------------------+--------------+-------------------------------------------------------------------
  valid, pure ASCII                              | \x666f6f           | \x666f6f         |              | 
@@ -341,9 +344,10 @@ select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from g
  valid, translates to UTF-8 by mapping function | \x666f6f84309c38   | \x666f6f84309c38 |              | 
  incomplete char                                | \x666f6f84309c     | \x666f6f         | \x84309c     | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c
  incomplete char, followed by newline           | \x666f6f84309c0a   | \x666f6f         | \x84309c0a   | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x0a
+ incomplete char at end                         | \x666f6f84         | \x666f6f         | \x84         | invalid byte sequence for encoding "GB18030": 0x84
  invalid, NUL byte                              | \x666f6f84309c3800 | \x666f6f84309c38 | \x00         | invalid byte sequence for encoding "GB18030": 0x00
  invalid, NUL byte                              | \x666f6f84309c0038 | \x666f6f         | \x84309c0038 | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x00
-(8 rows)
+(9 rows)
 
 -- Test conversions from GB18030
 select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18030_inputs;
@@ -355,9 +359,10 @@ select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18
  valid, translates to UTF-8 by mapping function | \x666f6f84309c38   | \x666f6fefa8aa |              | 
  incomplete char                                | \x666f6f84309c     | \x666f6f       | \x84309c     | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c
  incomplete char, followed by newline           | \x666f6f84309c0a   | \x666f6f       | \x84309c0a   | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x0a
+ incomplete char at end                         | \x666f6f84         | \x666f6f       | \x84         | invalid byte sequence for encoding "GB18030": 0x84
  invalid, NUL byte                              | \x666f6f84309c3800 | \x666f6fefa8aa | \x00         | invalid byte sequence for encoding "GB18030": 0x00
  invalid, NUL byte                              | \x666f6f84309c0038 | \x666f6f       | \x84309c0038 | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x00
-(8 rows)
+(9 rows)
 
 --
 -- ISO-8859-5
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index d22b065885f..a9e6fb6a10f 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -154,11 +154,14 @@ insert into gb18030_inputs  values
   ('\x666f6f84309c38',	'valid, translates to UTF-8 by mapping function'),
   ('\x666f6f84309c',	'incomplete char '),
   ('\x666f6f84309c0a',	'incomplete char, followed by newline '),
+  ('\x666f6f84',		'incomplete char at end'),
   ('\x666f6f84309c3800', 'invalid, NUL byte'),
   ('\x666f6f84309c0038', 'invalid, NUL byte');
 
--- Test GB18030 verification
-select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from gb18030_inputs;
+-- Test GB18030 verification.  Round-trip through text so the backing of the
+-- bytea values is palloc, not shared_buffers.  This lets Valgrind detect
+-- reads past the end.
+select description, inbytes, (test_conv(inbytes::text::bytea, 'gb18030', 'gb18030')).* from gb18030_inputs;
 -- Test conversions from GB18030
 select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18030_inputs;
author	Noah Misch <noah@leadboat.com>	2025-05-05 04:52:04 -0700
committer	Noah Misch <noah@leadboat.com>	2025-05-05 04:52:08 -0700
commit	3f2ab73934ab1e27151ecd14fd7d8ef602555093 (patch)
tree	104f0bf916d41f0a07d46bd1c75a230399b7c0b2
parent	258cde839d902ad9dd41cbd7ab58ebf4ba086874 (diff)
download	postgresql-3f2ab73934ab1e27151ecd14fd7d8ef602555093.tar.gz postgresql-3f2ab73934ab1e27151ecd14fd7d8ef602555093.zip