1 files changed, 554 insertions, 94 deletions
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c
index 3adf4481e8d..96a26886959 100644
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -1,7 +1,7 @@
 /*
  * conversion functions between pg_wchar and multibyte streams.
  * Tatsuo Ishii
- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.40.4.1 2005/12/24 10:11:32 ishii Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.40.4.2 2006/05/21 20:06:16 tgl Exp $
  *
  * WIN1250 client encoding updated by Pavel Behal
  *
@@ -96,7 +96,7 @@ static int	pg_euc2wchar_with_len
 	return (cnt);
 }
 
-static int
+static inline int
 pg_euc_mblen(const unsigned char *s)
 {
 	int			len;
@@ -112,7 +112,7 @@ pg_euc_mblen(const unsigned char *s)
 	return (len);
 }
 
-static int
+static inline int
 pg_euc_dsplen(const unsigned char *s)
 {
 	int			len;
@@ -714,53 +714,433 @@ pg_gb18030_dsplen(const unsigned char *s)
 	return (len);
 }
 
+/*
+ *-------------------------------------------------------------------
+ * multibyte sequence validators
+ *
+ * These functions accept "s", a pointer to the first byte of a string,
+ * and "len", the remaining length of the string.  If there is a validly
+ * encoded character beginning at *s, return its length in bytes; else
+ * return -1.
+ *
+ * The functions can assume that len > 0 and that *s != '\0', but they must
+ * test for and reject zeroes in any additional bytes of a multibyte character.
+ *
+ * Note that this definition allows the function for a single-byte
+ * encoding to be just "return 1".
+ *-------------------------------------------------------------------
+ */
+
+static int
+pg_ascii_verifier(const unsigned char *s, int len)
+{
+	return 1;
+}
+
+#define IS_EUC_RANGE_VALID(c)	((c) >= 0xa1 && (c) <= 0xfe)
+
+static int
+pg_eucjp_verifier(const unsigned char *s, int len)
+{
+	int			l;
+	unsigned char c1, c2;
+
+	c1 = *s++;
+
+	switch (c1)
+	{
+		case SS2:		/* JIS X 0201 */
+			l = 2;
+			if (l > len)
+				return -1;
+			c2 = *s++;
+			if (c2 < 0xa1 || c2 > 0xdf)
+				return -1;
+			break;
+
+		case SS3:		/* JIS X 0212 */
+			l = 3;
+			if (l > len)
+				return -1;
+			c2 = *s++;
+			if (!IS_EUC_RANGE_VALID(c2))
+				return -1;
+			c2 = *s++;
+			if (!IS_EUC_RANGE_VALID(c2))
+				return -1;
+			break;
+
+		default:
+			if (IS_HIGHBIT_SET(c1))		/* JIS X 0208? */
+			{
+				l = 2;
+				if (l > len)
+					return -1;
+				if (!IS_EUC_RANGE_VALID(c1))
+					return -1;
+				c2 = *s++;
+				if (!IS_EUC_RANGE_VALID(c2))
+					return -1;
+			}
+			else		/* must be ASCII */
+			{
+				l = 1;
+			}
+			break;
+	}
+
+	return l;
+}
+
+static int
+pg_euckr_verifier(const unsigned char *s, int len)
+{
+	int			l;
+	unsigned char c1, c2;
+
+	c1 = *s++;
+
+	if (IS_HIGHBIT_SET(c1))
+	{
+		l = 2;
+		if (l > len)
+			return -1;
+		if (!IS_EUC_RANGE_VALID(c1))
+			return -1;
+		c2 = *s++;
+		if (!IS_EUC_RANGE_VALID(c2))
+			return -1;
+	}
+	else		/* must be ASCII */
+	{
+		l = 1;
+	}
+
+	return l;
+}
+
+/* EUC-CN byte sequences are exactly same as EUC-KR */
+#define pg_euccn_verifier	pg_euckr_verifier
+
+static int
+pg_euctw_verifier(const unsigned char *s, int len)
+{
+	int			l;
+	unsigned char c1, c2;
+
+	c1 = *s++;
+
+	switch (c1)
+	{
+		case SS2:		/* CNS 11643 Plane 1-7 */
+			l = 4;
+			if (l > len)
+				return -1;
+			c2 = *s++;
+			if (c2 < 0xa1 || c2 > 0xa7)
+				return -1;
+			c2 = *s++;
+			if (!IS_EUC_RANGE_VALID(c2))
+				return -1;
+			c2 = *s++;
+			if (!IS_EUC_RANGE_VALID(c2))
+				return -1;
+			break;
+
+		case SS3:		/* unused */
+			return -1;
+
+		default:
+			if (IS_HIGHBIT_SET(c1))		/* CNS 11643 Plane 1 */
+			{
+				l = 2;
+				if (l > len)
+					return -1;
+				/* no further range check on c1? */
+				c2 = *s++;
+				if (!IS_EUC_RANGE_VALID(c2))
+					return -1;
+			}
+			else		/* must be ASCII */
+			{
+				l = 1;
+			}
+			break;
+	}
+	return l;
+}
+
+static int
+pg_johab_verifier(const unsigned char *s, int len)
+{
+	int l, mbl;
+	unsigned char c;
+
+	l = mbl = pg_johab_mblen(s);
+
+	if (len < l)
+		return -1;
+
+	if (!IS_HIGHBIT_SET(*s))
+		return mbl;
+
+	while (--l > 0)
+	{
+		c = *++s;
+		if (!IS_EUC_RANGE_VALID(c))
+			return -1;
+	}
+	return mbl;
+}
+
+static int
+pg_mule_verifier(const unsigned char *s, int len)
+{
+	int l, mbl;
+	unsigned char c;
+
+	l = mbl = pg_mule_mblen(s);
+
+	if (len < l)
+		return -1;
+
+	while (--l > 0)
+	{
+		c = *++s;
+		if (!IS_HIGHBIT_SET(c))
+			return -1;
+	}
+	return mbl;
+}
+
+static int
+pg_latin1_verifier(const unsigned char *s, int len)
+{
+	return 1;
+}
+
+static int
+pg_sjis_verifier(const unsigned char *s, int len)
+{
+	int l, mbl;
+	unsigned char c1, c2;
+
+	l = mbl = pg_sjis_mblen(s);
+
+	if (len < l)
+		return -1;
+
+	if (l == 1)					/* pg_sjis_mblen already verified it */
+		return mbl;
+
+	c1 = *s++;
+	c2 = *s;
+	if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
+		return -1;
+	return mbl;
+}
+
+static int
+pg_big5_verifier(const unsigned char *s, int len)
+{
+	int l, mbl;
+
+	l = mbl = pg_big5_mblen(s);
+
+	if (len < l)
+		return -1;
+
+	while (--l > 0)
+	{
+		if (*++s == '\0')
+			return -1;
+	}
+
+	return mbl;
+}
+
+static int
+pg_gbk_verifier(const unsigned char *s, int len)
+{
+	int l, mbl;
+
+	l = mbl = pg_gbk_mblen(s);
+
+	if (len < l)
+		return -1;
+
+	while (--l > 0)
+	{
+		if (*++s == '\0')
+			return -1;
+	}
+
+	return mbl;
+}
+
+static int
+pg_uhc_verifier(const unsigned char *s, int len)
+{
+	int l, mbl;
+
+	l = mbl = pg_uhc_mblen(s);
+
+	if (len < l)
+		return -1;
+
+	while (--l > 0)
+	{
+		if (*++s == '\0')
+			return -1;
+	}
+
+	return mbl;
+}
+
+static int
+pg_gb18030_verifier(const unsigned char *s, int len)
+{
+	int l, mbl;
+
+	l = mbl = pg_gb18030_mblen(s);
+
+	if (len < l)
+		return -1;
+
+	while (--l > 0)
+	{
+		if (*++s == '\0')
+			return -1;
+	}
+
+	return mbl;
+}
+
+static int
+pg_utf8_verifier(const unsigned char *s, int len)
+{
+	int l = pg_utf_mblen(s);
+
+	if (len < l)
+		return -1;
+
+	if (!pg_utf8_islegal(s, l))
+		return -1;
 
+	return l;
+}
+
+/*
+ * Check for validity of a single UTF-8 encoded character
+ *
+ * This directly implements the rules in RFC3629, modified to restrict
+ * us to 16-bit Unicode code points (hence, at most 3 bytes in UTF8).
+ * The bizarre-looking
+ * restrictions on the second byte are meant to ensure that there isn't
+ * more than one encoding of a given Unicode character point; that is,
+ * you may not use a longer-than-necessary byte sequence with high order
+ * zero bits to represent a character that would fit in fewer bytes.
+ * To do otherwise is to create security hazards (eg, create an apparent
+ * non-ASCII character that decodes to plain ASCII).
+ *
+ * length is assumed to have been obtained by pg_utf_mblen(), and the
+ * caller must have checked that that many bytes are present in the buffer.
+ */
+bool
+pg_utf8_islegal(const unsigned char *source, int length)
+{
+	unsigned char a;
+
+	switch (length)
+	{
+		default:
+			/* reject lengths 4, 5 and 6 for now */
+			return false;
+		case 3:
+			a = source[2];
+			if (a < 0x80 || a > 0xBF)
+				return false;
+			/* FALL THRU */
+		case 2:
+			a = source[1];
+			switch (*source)
+			{
+				case 0xE0:
+					if (a < 0xA0 || a > 0xBF)
+						return false;
+					break;
+				case 0xED:
+					if (a < 0x80 || a > 0x9F)
+						return false;
+					break;
+				default:
+					if (a < 0x80 || a > 0xBF)
+						return false;
+					break;
+			}
+			/* FALL THRU */
+		case 1:
+			a = *source;
+			if (a >= 0x80 && a < 0xC2)
+				return false;
+			if (a > 0xEF)
+				return false;
+			break;
+	}
+	return true;
+}
+
+/*
+ *-------------------------------------------------------------------
+ * encoding info table
+ *-------------------------------------------------------------------
+ */
 pg_wchar_tbl pg_wchar_table[] = {
-	{pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, 1},		/* 0; PG_SQL_ASCII	*/
-	{pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, 3},		/* 1; PG_EUC_JP */
-	{pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, 3},		/* 2; PG_EUC_CN */
-	{pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3},		/* 3; PG_EUC_KR */
-	{pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3},		/* 4; PG_EUC_TW */
-	{pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3},		/* 5; PG_JOHAB */
-	{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3},	/* 6; PG_UNICODE */
-	{pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 8; PG_LATIN1 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 9; PG_LATIN2 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 10; PG_LATIN3 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 11; PG_LATIN4 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 12; PG_LATIN5 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 13; PG_LATIN6 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 14; PG_LATIN7 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 15; PG_LATIN8 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 16; PG_LATIN9 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 17; PG_LATIN10 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 18; PG_WIN1256 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 19; PG_TCVN */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 20; PG_WIN874 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 21; PG_KOI8 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 22; PG_WIN1251 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 23; PG_ALT */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 24; ISO-8859-5 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 25; ISO-8859-6 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 26; ISO-8859-7 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 27; ISO-8859-8 */
-	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 28; PG_WIN1250 */
-	{0, pg_sjis_mblen, pg_sjis_dsplen, 2},		/* 29; PG_SJIS */
-	{0, pg_big5_mblen, pg_big5_dsplen, 2},		/* 30; PG_BIG5 */
-	{0, pg_gbk_mblen, pg_gbk_dsplen, 2},		/* 31; PG_GBK */
-	{0, pg_uhc_mblen, pg_uhc_dsplen, 2},		/* 32; PG_UHC */
-	{0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */
+	{pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1},		/* 0; PG_SQL_ASCII	*/
+	{pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},		/* 1; PG_EUC_JP */
+	{pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 3},		/* 2; PG_EUC_CN */
+	{pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3},		/* 3; PG_EUC_KR */
+	{pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 3},		/* 4; PG_EUC_TW */
+	{pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3},		/* 5; PG_JOHAB */
+	{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 3},	/* 6; PG_UNICODE */
+	{pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 3}, /* 7; PG_MULE_INTERNAL */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 8; PG_LATIN1 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 9; PG_LATIN2 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 10; PG_LATIN3 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 11; PG_LATIN4 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 12; PG_LATIN5 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 13; PG_LATIN6 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 14; PG_LATIN7 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 15; PG_LATIN8 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 16; PG_LATIN9 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 17; PG_LATIN10 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 18; PG_WIN1256 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 19; PG_TCVN */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 20; PG_WIN874 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 21; PG_KOI8 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 22; PG_WIN1251 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 23; PG_ALT */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 24; ISO-8859-5 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 25; ISO-8859-6 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 26; ISO-8859-7 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 27; ISO-8859-8 */
+	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},	/* 28; PG_WIN1250 */
+	{0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2},		/* 29; PG_SJIS */
+	{0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2},		/* 30; PG_BIG5 */
+	{0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2},		/* 31; PG_GBK */
+	{0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2},		/* 32; PG_UHC */
+	{0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 2} /* 33; PG_GB18030 */
 };
 
 /* returns the byte length of a word for mule internal code */
 int
 pg_mic_mblen(const unsigned char *mbstr)
 {
-	return (pg_mule_mblen(mbstr));
+	return pg_mule_mblen(mbstr);
 }
 
 /*
- * Returns the byte length of a multibyte word.
+ * Returns the byte length of a multibyte character.
  */
 int
 pg_encoding_mblen(int encoding, const unsigned char *mbstr)
@@ -769,12 +1149,12 @@ pg_encoding_mblen(int encoding, const unsigned char *mbstr)
 
 	return ((encoding >= 0 &&
 			 encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
-			((*pg_wchar_table[encoding].mblen) (mbstr)) :
-			((*pg_wchar_table[PG_SQL_ASCII].mblen) (mbstr)));
+		((*pg_wchar_table[encoding].mblen) ((const unsigned char *) mbstr)) :
+	((*pg_wchar_table[PG_SQL_ASCII].mblen) ((const unsigned char *) mbstr)));
 }
 
 /*
- * Returns the display length of a multibyte word.
+ * Returns the display length of a multibyte character.
  */
 int
 pg_encoding_dsplen(int encoding, const unsigned char *mbstr)
@@ -783,12 +1163,28 @@ pg_encoding_dsplen(int encoding, const unsigned char *mbstr)
 
 	return ((encoding >= 0 &&
 			 encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
-			((*pg_wchar_table[encoding].dsplen) (mbstr)) :
-			((*pg_wchar_table[PG_SQL_ASCII].dsplen) (mbstr)));
+	   ((*pg_wchar_table[encoding].dsplen) ((const unsigned char *) mbstr)) :
+	((*pg_wchar_table[PG_SQL_ASCII].dsplen) ((const unsigned char *) mbstr)));
+}
+
+/*
+ * Verify the first multibyte character of the given string.
+ * Return its byte length if good, -1 if bad.  (See comments above for
+ * full details of the mbverify API.)
+ */
+int
+pg_encoding_verifymb(int encoding, const char *mbstr, int len)
+{
+	Assert(PG_VALID_ENCODING(encoding));
+
+	return ((encoding >= 0 &&
+			 encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
+		((*pg_wchar_table[encoding].mbverify) ((const unsigned char *) mbstr, len)) :
+	((*pg_wchar_table[PG_SQL_ASCII].mbverify) ((const unsigned char *) mbstr, len)));
 }
 
 /*
- * fetch maximum length of a char encoding
+ * fetch maximum length of a given encoding
  */
 int
 pg_encoding_max_length(int encoding)
@@ -801,83 +1197,147 @@ pg_encoding_max_length(int encoding)
 #ifndef FRONTEND
 
 /*
- * Verify mbstr to make sure that it has a valid character sequence.
- * mbstr is not necessarily NULL terminated; length of mbstr is
+ * fetch maximum length of the encoding for the current database
+ */
+int
+pg_database_encoding_max_length(void)
+{
+	return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
+}
+
+/*
+ * Verify mbstr to make sure that it is validly encoded in the current
+ * database encoding.  Otherwise same as pg_verify_mbstr().
+ */
+bool
+pg_verifymbstr(const char *mbstr, int len, bool noError)
+{
+	return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError);
+}
+
+/*
+ * Verify mbstr to make sure that it is validly encoded in the specified
+ * encoding.
+ *
+ * mbstr is not necessarily zero terminated; length of mbstr is
  * specified by len.
  *
  * If OK, return TRUE.	If a problem is found, return FALSE when noError is
  * true; when noError is false, ereport() a descriptive message.
  */
 bool
-pg_verifymbstr(const unsigned char *mbstr, int len, bool noError)
+pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
 {
-	int			l;
-	int			i;
-	int			encoding;
+	mbverifier	mbverify;
+
+	Assert(PG_VALID_ENCODING(encoding));
 
-	/* we do not need any check in single-byte encodings */
-	if (pg_database_encoding_max_length() <= 1)
-		return true;
+	/*
+	 * In single-byte encodings, we need only reject nulls (\0).
+	 */
+	if (pg_encoding_max_length(encoding) <= 1)
+	{
+		const char *nullpos = memchr(mbstr, 0, len);
 
-	encoding = GetDatabaseEncoding();
+		if (nullpos == NULL)
+			return true;
+		if (noError)
+			return false;
+		report_invalid_encoding(encoding, nullpos, 1);
+	}
 
-	while (len > 0 && *mbstr)
+	/* fetch function pointer just once */
+	mbverify = pg_wchar_table[encoding].mbverify;
+
+	while (len > 0)
 	{
-		/* special UTF-8 check */
-		if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0)
+		int			l;
+
+		/* fast path for ASCII-subset characters */
+		if (!IS_HIGHBIT_SET(*mbstr))
 		{
+			if (*mbstr != '\0')
+			{
+				mbstr++;
+				len--;
+				continue;
+			}
 			if (noError)
 				return false;
-			ereport(ERROR,
-					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-					 errmsg("Unicode characters greater than or equal to 0x10000 are not supported")));
+			report_invalid_encoding(encoding, mbstr, len);
 		}
 
-		l = pg_mblen(mbstr);
+		l = (*mbverify) ((const unsigned char *) mbstr, len);
 
-		for (i = 1; i < l; i++)
+		if (l < 0)
 		{
-			/*
-			 * we expect that every multibyte char consists of bytes
-			 * having the 8th bit set
-			 */
-			if (i >= len || (mbstr[i] & 0x80) == 0)
-			{
-				char		buf[8 * 2 + 1];
-				char	   *p = buf;
-				int			j,
-							jlimit;
-
-				if (noError)
-					return false;
-
-				jlimit = Min(l, len);
-				jlimit = Min(jlimit, 8);		/* prevent buffer overrun */
-
-				for (j = 0; j < jlimit; j++)
-					p += sprintf(p, "%02x", mbstr[j]);
-
-				ereport(ERROR,
-						(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-				errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
-					   GetDatabaseEncodingName(), buf)));
-			}
+			if (noError)
+				return false;
+			report_invalid_encoding(encoding, mbstr, len);
 		}
 
-		len -= l;
 		mbstr += l;
+		len -= l;
 	}
-
 	return true;
 }
 
 /*
- * fetch maximum length of a char encoding for the current database
+ * report_invalid_encoding: complain about invalid multibyte character
+ *
+ * note: len is remaining length of string, not length of character;
+ * len must be greater than zero, as we always examine the first byte.
  */
-int
-pg_database_encoding_max_length(void)
+void
+report_invalid_encoding(int encoding, const char *mbstr, int len)
 {
-	return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
+	int			l = pg_encoding_mblen(encoding, mbstr);
+	char		buf[8 * 2 + 1];
+	char	   *p = buf;
+	int			j,
+				jlimit;
+
+	jlimit = Min(l, len);
+	jlimit = Min(jlimit, 8);	/* prevent buffer overrun */
+
+	for (j = 0; j < jlimit; j++)
+		p += sprintf(p, "%02x", (unsigned char) mbstr[j]);
+
+	ereport(ERROR,
+			(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+			 errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
+					pg_enc2name_tbl[encoding].name,
+					buf)));
+}
+
+/*
+ * report_untranslatable_char: complain about untranslatable character
+ *
+ * note: len is remaining length of string, not length of character;
+ * len must be greater than zero, as we always examine the first byte.
+ */
+void
+report_untranslatable_char(int src_encoding, int dest_encoding,
+						   const char *mbstr, int len)
+{
+	int			l = pg_encoding_mblen(src_encoding, mbstr);
+	char		buf[8 * 2 + 1];
+	char	   *p = buf;
+	int			j,
+				jlimit;
+
+	jlimit = Min(l, len);
+	jlimit = Min(jlimit, 8);	/* prevent buffer overrun */
+
+	for (j = 0; j < jlimit; j++)
+		p += sprintf(p, "%02x", (unsigned char) mbstr[j]);
+
+	ereport(ERROR,
+			(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
+			 errmsg("character 0x%s of encoding \"%s\" has no equivalent in \"%s\"",
+					buf,
+					pg_enc2name_tbl[src_encoding].name,
+					pg_enc2name_tbl[dest_encoding].name)));
 }
 
 #endif