2 files changed, 236 insertions, 68 deletions
diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c
index 3000da913da..e4561bafea6 100644
--- a/src/fe_utils/string_utils.c
+++ b/src/fe_utils/string_utils.c
@@ -104,6 +104,7 @@ fmtIdEnc(const char *rawid, int encoding)
 
 	const char *cp;
 	bool		need_quotes = false;
+	size_t		remaining = strlen(rawid);
 
 	/*
 	 * These checks need to match the identifier production in scan.l. Don't
@@ -117,7 +118,8 @@ fmtIdEnc(const char *rawid, int encoding)
 	else
 	{
 		/* otherwise check the entire string */
-		for (cp = rawid; *cp; cp++)
+		cp = rawid;
+		for (size_t i = 0; i < remaining; i++, cp++)
 		{
 			if (!((*cp >= 'a' && *cp <= 'z')
 				  || (*cp >= '0' && *cp <= '9')
@@ -153,17 +155,90 @@ fmtIdEnc(const char *rawid, int encoding)
 	else
 	{
 		appendPQExpBufferChar(id_return, '"');
-		for (cp = rawid; *cp; cp++)
+
+		cp = &rawid[0];
+		while (remaining > 0)
 		{
-			/*
-			 * Did we find a double-quote in the string? Then make this a
-			 * double double-quote per SQL99. Before, we put in a
-			 * backslash/double-quote pair. - thomas 2000-08-05
-			 */
-			if (*cp == '"')
-				appendPQExpBufferChar(id_return, '"');
-			appendPQExpBufferChar(id_return, *cp);
+			int			charlen;
+
+			/* Fast path for plain ASCII */
+			if (!IS_HIGHBIT_SET(*cp))
+			{
+				/*
+				 * Did we find a double-quote in the string? Then make this a
+				 * double double-quote per SQL99. Before, we put in a
+				 * backslash/double-quote pair. - thomas 2000-08-05
+				 */
+				if (*cp == '"')
+					appendPQExpBufferChar(id_return, '"');
+				appendPQExpBufferChar(id_return, *cp);
+				remaining--;
+				cp++;
+				continue;
+			}
+
+			/* Slow path for possible multibyte characters */
+			charlen = pg_encoding_mblen(encoding, cp);
+
+			if (remaining < charlen)
+			{
+				/*
+				 * If the character is longer than the available input,
+				 * replace the string with an invalid sequence. The invalid
+				 * sequence ensures that the escaped string will trigger an
+				 * error on the server-side, even if we can't directly report
+				 * an error here.
+				 */
+				enlargePQExpBuffer(id_return, 2);
+				pg_encoding_set_invalid(encoding,
+										id_return->data + id_return->len);
+				id_return->len += 2;
+				id_return->data[id_return->len] = '\0';
+
+				/* there's no more input data, so we can stop */
+				break;
+			}
+			else if (pg_encoding_verifymbchar(encoding, cp, charlen) == -1)
+			{
+				/*
+				 * Multibyte character is invalid.  It's important to verify
+				 * that as invalid multi-byte characters could e.g. be used to
+				 * "skip" over quote characters, e.g. when parsing
+				 * character-by-character.
+				 *
+				 * Replace the bytes corresponding to the invalid character
+				 * with an invalid sequence, for the same reason as above.
+				 *
+				 * It would be a bit faster to verify the whole string the
+				 * first time we encounter a set highbit, but this way we can
+				 * replace just the invalid characters, which probably makes
+				 * it easier for users to find the invalidly encoded portion
+				 * of a larger string.
+				 */
+				enlargePQExpBuffer(id_return, 2);
+				pg_encoding_set_invalid(encoding,
+										id_return->data + id_return->len);
+				id_return->len += 2;
+				id_return->data[id_return->len] = '\0';
+
+				/*
+				 * Copy the rest of the string after the invalid multi-byte
+				 * character.
+				 */
+				remaining -= charlen;
+				cp += charlen;
+			}
+			else
+			{
+				for (int i = 0; i < charlen; i++)
+				{
+					appendPQExpBufferChar(id_return, *cp);
+					remaining--;
+					cp++;
+				}
+			}
 		}
+
 		appendPQExpBufferChar(id_return, '"');
 	}
 
@@ -290,6 +365,7 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
 	size_t		length = strlen(str);
 	const char *source = str;
 	char	   *target;
+	size_t		remaining = length;
 
 	if (!enlargePQExpBuffer(buf, 2 * length + 2))
 		return;
@@ -297,10 +373,10 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
 	target = buf->data + buf->len;
 	*target++ = '\'';
 
-	while (*source != '\0')
+	while (remaining > 0)
 	{
 		char		c = *source;
-		int			len;
+		int			charlen;
 		int			i;
 
 		/* Fast path for plain ASCII */
@@ -312,39 +388,65 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
 			/* Copy the character */
 			*target++ = c;
 			source++;
+			remaining--;
 			continue;
 		}
 
 		/* Slow path for possible multibyte characters */
-		len = PQmblen(source, encoding);
+		charlen = PQmblen(source, encoding);
 
-		/* Copy the character */
-		for (i = 0; i < len; i++)
+		if (remaining < charlen)
 		{
-			if (*source == '\0')
-				break;
-			*target++ = *source++;
-		}
+			/*
+			 * If the character is longer than the available input, replace
+			 * the string with an invalid sequence. The invalid sequence
+			 * ensures that the escaped string will trigger an error on the
+			 * server-side, even if we can't directly report an error here.
+			 *
+			 * We know there's enough space for the invalid sequence because
+			 * the "target" buffer is 2 * length + 2 long, and at worst we're
+			 * replacing a single input byte with two invalid bytes.
+			 */
+			pg_encoding_set_invalid(encoding, target);
+			target += 2;
 
-		/*
-		 * If we hit premature end of string (ie, incomplete multibyte
-		 * character), try to pad out to the correct length with spaces. We
-		 * may not be able to pad completely, but we will always be able to
-		 * insert at least one pad space (since we'd not have quoted a
-		 * multibyte character).  This should be enough to make a string that
-		 * the server will error out on.
-		 */
-		if (i < len)
+			/* there's no more valid input data, so we can stop */
+			break;
+		}
+		else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1)
 		{
-			char	   *stop = buf->data + buf->maxlen - 2;
+			/*
+			 * Multibyte character is invalid.  It's important to verify that
+			 * as invalid multi-byte characters could e.g. be used to "skip"
+			 * over quote characters, e.g. when parsing
+			 * character-by-character.
+			 *
+			 * Replace the bytes corresponding to the invalid character with
+			 * an invalid sequence, for the same reason as above.
+			 *
+			 * It would be a bit faster to verify the whole string the first
+			 * time we encounter a set highbit, but this way we can replace
+			 * just the invalid characters, which probably makes it easier for
+			 * users to find the invalidly encoded portion of a larger string.
+			 */
+			pg_encoding_set_invalid(encoding, target);
+			target += 2;
+			remaining -= charlen;
 
-			for (; i < len; i++)
+			/*
+			 * Copy the rest of the string after the invalid multi-byte
+			 * character.
+			 */
+			source += charlen;
+		}
+		else
+		{
+			/* Copy the character */
+			for (i = 0; i < charlen; i++)
 			{
-				if (target >= stop)
-					break;
-				*target++ = ' ';
+				*target++ = *source++;
+				remaining--;
 			}
-			break;
 		}
 	}
 
diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c
index 0d224a8524e..14fc5c77793 100644
--- a/src/interfaces/libpq/fe-exec.c
+++ b/src/interfaces/libpq/fe-exec.c
@@ -4075,15 +4075,15 @@ PQescapeStringInternal(PGconn *conn,
 {
 	const char *source = from;
 	char	   *target = to;
-	size_t		remaining = length;
+	size_t		remaining = strnlen(from, length);
 
 	if (error)
 		*error = 0;
 
-	while (remaining > 0 && *source != '\0')
+	while (remaining > 0)
 	{
 		char		c = *source;
-		int			len;
+		int			charlen;
 		int			i;
 
 		/* Fast path for plain ASCII */
@@ -4100,38 +4100,76 @@ PQescapeStringInternal(PGconn *conn,
 		}
 
 		/* Slow path for possible multibyte characters */
-		len = pg_encoding_mblen(encoding, source);
+		charlen = pg_encoding_mblen(encoding, source);
 
-		/* Copy the character */
-		for (i = 0; i < len; i++)
+		if (remaining < charlen)
 		{
-			if (remaining == 0 || *source == '\0')
-				break;
-			*target++ = *source++;
-			remaining--;
-		}
+			/*
+			 * If the character is longer than the available input, report an
+			 * error if possible, and replace the string with an invalid
+			 * sequence. The invalid sequence ensures that the escaped string
+			 * will trigger an error on the server-side, even if we can't
+			 * directly report an error here.
+			 *
+			 * This isn't *that* crucial when we can report an error to the
+			 * caller, but if we can't, the caller will use this string
+			 * unmodified and it needs to be safe for parsing.
+			 *
+			 * We know there's enough space for the invalid sequence because
+			 * the "to" buffer needs to be at least 2 * length + 1 long, and
+			 * at worst we're replacing a single input byte with two invalid
+			 * bytes.
+			 */
+			if (error)
+				*error = 1;
+			if (conn)
+				libpq_append_conn_error(conn, "incomplete multibyte character");
 
-		/*
-		 * If we hit premature end of string (ie, incomplete multibyte
-		 * character), try to pad out to the correct length with spaces. We
-		 * may not be able to pad completely, but we will always be able to
-		 * insert at least one pad space (since we'd not have quoted a
-		 * multibyte character).  This should be enough to make a string that
-		 * the server will error out on.
-		 */
-		if (i < len)
+			pg_encoding_set_invalid(encoding, target);
+			target += 2;
+
+			/* there's no more input data, so we can stop */
+			break;
+		}
+		else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1)
 		{
+			/*
+			 * Multibyte character is invalid.  It's important to verify that
+			 * as invalid multi-byte characters could e.g. be used to "skip"
+			 * over quote characters, e.g. when parsing
+			 * character-by-character.
+			 *
+			 * Replace the bytes corresponding to the invalid character with
+			 * an invalid sequence, for the same reason as above.
+			 *
+			 * It would be a bit faster to verify the whole string the first
+			 * time we encounter a set highbit, but this way we can replace
+			 * just the invalid characters, which probably makes it easier for
+			 * users to find the invalidly encoded portion of a larger string.
+			 */
 			if (error)
 				*error = 1;
 			if (conn)
-				libpq_append_conn_error(conn, "incomplete multibyte character");
-			for (; i < len; i++)
+				libpq_append_conn_error(conn, "invalid multibyte character");
+
+			pg_encoding_set_invalid(encoding, target);
+			target += 2;
+			remaining -= charlen;
+
+			/*
+			 * Copy the rest of the string after the invalid multi-byte
+			 * character.
+			 */
+			source += charlen;
+		}
+		else
+		{
+			/* Copy the character */
+			for (i = 0; i < charlen; i++)
 			{
-				if (((size_t) (target - to)) / 2 >= length)
-					break;
-				*target++ = ' ';
+				*target++ = *source++;
+				remaining--;
 			}
-			break;
 		}
 	}
 
@@ -4186,9 +4224,10 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
 	char	   *rp;
 	int			num_quotes = 0; /* single or double, depending on as_ident */
 	int			num_backslashes = 0;
-	int			input_len;
-	int			result_size;
+	size_t		input_len = strlen(str);
+	size_t		result_size;
 	char		quote_char = as_ident ? '"' : '\'';
+	bool		validated_mb = false;
 
 	/* We must have a connection, else fail immediately. */
 	if (!conn)
@@ -4197,8 +4236,12 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
 	if (conn->cmd_queue_head == NULL)
 		pqClearConnErrorState(conn);
 
-	/* Scan the string for characters that must be escaped. */
-	for (s = str; (s - str) < len && *s != '\0'; ++s)
+	/*
+	 * Scan the string for characters that must be escaped and for invalidly
+	 * encoded data.
+	 */
+	s = str;
+	for (size_t remaining = input_len; remaining > 0; remaining--, s++)
 	{
 		if (*s == quote_char)
 			++num_quotes;
@@ -4211,20 +4254,41 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
 			/* Slow path for possible multibyte characters */
 			charlen = pg_encoding_mblen(conn->client_encoding, s);
 
-			/* Multibyte character overruns allowable length. */
-			if ((s - str) + charlen > len || memchr(s, 0, charlen) != NULL)
+			if (charlen > remaining)
 			{
+				/* Multibyte character overruns allowable length. */
 				libpq_append_conn_error(conn, "incomplete multibyte character");
 				return NULL;
 			}
 
+			/*
+			 * If we haven't already, check that multibyte characters are
+			 * valid. It's important to verify that as invalid multi-byte
+			 * characters could e.g. be used to "skip" over quote characters,
+			 * e.g. when parsing character-by-character.
+			 *
+			 * We check validity once, for the whole remainder of the string,
+			 * when we first encounter any multi-byte character. Some
+			 * encodings have optimized implementations for longer strings.
+			 */
+			if (!validated_mb)
+			{
+				if (pg_encoding_verifymbstr(conn->client_encoding, s, remaining)
+					!= strlen(s))
+				{
+					libpq_append_conn_error(conn, "invalid multibyte character");
+					return NULL;
+				}
+				validated_mb = true;
+			}
+
 			/* Adjust s, bearing in mind that for loop will increment it. */
 			s += charlen - 1;
+			remaining -= charlen - 1;
 		}
 	}
 
 	/* Allocate output buffer. */
-	input_len = s - str;
 	result_size = input_len + num_quotes + 3;	/* two quotes, plus a NUL */
 	if (!as_ident && num_backslashes > 0)
 		result_size += num_backslashes + 2;
@@ -4269,7 +4333,8 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
 	}
 	else
 	{
-		for (s = str; s - str < input_len; ++s)
+		s = str;
+		for (size_t remaining = input_len; remaining > 0; remaining--, s++)
 		{
 			if (*s == quote_char || (!as_ident && *s == '\\'))
 			{
@@ -4287,6 +4352,7 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
 					*rp++ = *s;
 					if (--i == 0)
 						break;
+					remaining--;
 					++s;		/* for loop will provide the final increment */
 				}
 			}