5 files changed, 117 insertions, 1 deletions
diff --git a/src/common/wchar.c b/src/common/wchar.c
index ea46192e364..4f3621fef12 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -17,6 +17,25 @@
 
 
 /*
+ * In today's multibyte encodings other than UTF8, this two-byte sequence
+ * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
+ *
+ * For historical reasons, several verifychar implementations opt to reject
+ * this pair specifically.  Byte pair range constraints, in encoding
+ * originator documentation, always excluded this pair.  No core conversion
+ * could translate it.  However, longstanding verifychar implementations
+ * accepted any non-NUL byte.  big5_to_euc_tw and big5_to_mic even translate
+ * pairs not valid per encoding originator documentation.  To avoid tightening
+ * core or non-core conversions in a security patch, we sought this one pair.
+ *
+ * PQescapeString() historically used spaces for BYTE1; many other values
+ * could suffice for BYTE1.
+ */
+#define NONUTF8_INVALID_BYTE0 (0x8d)
+#define NONUTF8_INVALID_BYTE1 (' ')
+
+
+/*
  * Operations on multi-byte encodings are driven by a table of helper
  * functions.
  *
@@ -1526,6 +1545,11 @@ pg_big5_verifychar(const unsigned char *s, int len)
 	if (len < l)
 		return -1;
 
+	if (l == 2 &&
+		s[0] == NONUTF8_INVALID_BYTE0 &&
+		s[1] == NONUTF8_INVALID_BYTE1)
+		return -1;
+
 	while (--l > 0)
 	{
 		if (*++s == '\0')
@@ -1575,6 +1599,11 @@ pg_gbk_verifychar(const unsigned char *s, int len)
 	if (len < l)
 		return -1;
 
+	if (l == 2 &&
+		s[0] == NONUTF8_INVALID_BYTE0 &&
+		s[1] == NONUTF8_INVALID_BYTE1)
+		return -1;
+
 	while (--l > 0)
 	{
 		if (*++s == '\0')
@@ -1624,6 +1653,11 @@ pg_uhc_verifychar(const unsigned char *s, int len)
 	if (len < l)
 		return -1;
 
+	if (l == 2 &&
+		s[0] == NONUTF8_INVALID_BYTE0 &&
+		s[1] == NONUTF8_INVALID_BYTE1)
+		return -1;
+
 	while (--l > 0)
 	{
 		if (*++s == '\0')
@@ -2068,6 +2102,19 @@ pg_utf8_islegal(const unsigned char *source, int length)
 
 
 /*
+ * Fills the provided buffer with two bytes such that:
+ *   pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
+ */
+void
+pg_encoding_set_invalid(int encoding, char *dst)
+{
+	Assert(pg_encoding_max_length(encoding) > 1);
+
+	dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
+	dst[1] = NONUTF8_INVALID_BYTE1;
+}
+
+/*
  *-------------------------------------------------------------------
  * encoding info table
  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
@@ -2189,5 +2236,11 @@ pg_encoding_max_length(int encoding)
 {
 	Assert(PG_VALID_ENCODING(encoding));
 
-	return pg_wchar_table[encoding].maxmblen;
+	/*
+	 * Check for the encoding despite the assert, due to some mingw versions
+	 * otherwise issuing bogus warnings.
+	 */
+	return PG_VALID_ENCODING(encoding) ?
+		pg_wchar_table[encoding].maxmblen :
+		pg_wchar_table[PG_SQL_ASCII].maxmblen;
 }
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 4a0e6bfbccd..0e9e412a82e 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -573,6 +573,7 @@ extern int	pg_valid_server_encoding_id(int encoding);
  * (in addition to the ones just above).  The constant tables declared
  * earlier in this file are also available from libpgcommon.
  */
+extern void pg_encoding_set_invalid(int encoding, char *dst);
 extern int	pg_encoding_mblen(int encoding, const char *mbstr);
 extern int	pg_encoding_mblen_bounded(int encoding, const char *mbstr);
 extern int	pg_encoding_dsplen(int encoding, const char *mbstr);
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 442e7aff2b2..d785f92561e 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -5,6 +5,13 @@
 \getenv libdir PG_LIBDIR
 \getenv dlsuffix PG_DLSUFFIX
 \set regresslib :libdir '/regress' :dlsuffix
+CREATE FUNCTION test_enc_setup() RETURNS void
+    AS :'regresslib', 'test_enc_setup'
+    LANGUAGE C STRICT;
+SELECT FROM test_enc_setup();
+--
+(1 row)
+
 CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea)
     AS :'regresslib', 'test_enc_conversion'
     LANGUAGE C STRICT;
diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c
index 6764589418e..5c0ecbace42 100644
--- a/src/test/regress/regress.c
+++ b/src/test/regress/regress.c
@@ -1099,6 +1099,56 @@ test_opclass_options_func(PG_FUNCTION_ARGS)
 	PG_RETURN_NULL();
 }
 
+/* one-time tests for encoding infrastructure */
+PG_FUNCTION_INFO_V1(test_enc_setup);
+Datum
+test_enc_setup(PG_FUNCTION_ARGS)
+{
+	/* Test pg_encoding_set_invalid() */
+	for (int i = 0; i < _PG_LAST_ENCODING_; i++)
+	{
+		char		buf[2],
+					bigbuf[16];
+		int			len,
+					mblen,
+					valid;
+
+		if (pg_encoding_max_length(i) == 1)
+			continue;
+		pg_encoding_set_invalid(i, buf);
+		len = strnlen(buf, 2);
+		if (len != 2)
+			elog(WARNING,
+				 "official invalid string for encoding \"%s\" has length %d",
+				 pg_enc2name_tbl[i].name, len);
+		mblen = pg_encoding_mblen(i, buf);
+		if (mblen != 2)
+			elog(WARNING,
+				 "official invalid string for encoding \"%s\" has mblen %d",
+				 pg_enc2name_tbl[i].name, mblen);
+		valid = pg_encoding_verifymbstr(i, buf, len);
+		if (valid != 0)
+			elog(WARNING,
+				 "official invalid string for encoding \"%s\" has valid prefix of length %d",
+				 pg_enc2name_tbl[i].name, valid);
+		valid = pg_encoding_verifymbstr(i, buf, 1);
+		if (valid != 0)
+			elog(WARNING,
+				 "first byte of official invalid string for encoding \"%s\" has valid prefix of length %d",
+				 pg_enc2name_tbl[i].name, valid);
+		memset(bigbuf, ' ', sizeof(bigbuf));
+		bigbuf[0] = buf[0];
+		bigbuf[1] = buf[1];
+		valid = pg_encoding_verifymbstr(i, bigbuf, sizeof(bigbuf));
+		if (valid != 0)
+			elog(WARNING,
+				 "trailing data changed official invalid string for encoding \"%s\" to have valid prefix of length %d",
+				 pg_enc2name_tbl[i].name, valid);
+	}
+
+	PG_RETURN_VOID();
+}
+
 /*
  * Call an encoding conversion or verification function.
  *
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index 9a65fca91fb..b567a1a5721 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -8,6 +8,11 @@
 
 \set regresslib :libdir '/regress' :dlsuffix
 
+CREATE FUNCTION test_enc_setup() RETURNS void
+    AS :'regresslib', 'test_enc_setup'
+    LANGUAGE C STRICT;
+SELECT FROM test_enc_setup();
+
 CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea)
     AS :'regresslib', 'test_enc_conversion'
     LANGUAGE C STRICT;