diff options
-rw-r--r-- | src/fe_utils/string_utils.c | 170 | ||||
-rw-r--r-- | src/interfaces/libpq/fe-exec.c | 134 |
2 files changed, 236 insertions, 68 deletions
diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c index 3000da913da..e4561bafea6 100644 --- a/src/fe_utils/string_utils.c +++ b/src/fe_utils/string_utils.c @@ -104,6 +104,7 @@ fmtIdEnc(const char *rawid, int encoding) const char *cp; bool need_quotes = false; + size_t remaining = strlen(rawid); /* * These checks need to match the identifier production in scan.l. Don't @@ -117,7 +118,8 @@ fmtIdEnc(const char *rawid, int encoding) else { /* otherwise check the entire string */ - for (cp = rawid; *cp; cp++) + cp = rawid; + for (size_t i = 0; i < remaining; i++, cp++) { if (!((*cp >= 'a' && *cp <= 'z') || (*cp >= '0' && *cp <= '9') @@ -153,17 +155,90 @@ fmtIdEnc(const char *rawid, int encoding) else { appendPQExpBufferChar(id_return, '"'); - for (cp = rawid; *cp; cp++) + + cp = &rawid[0]; + while (remaining > 0) { - /* - * Did we find a double-quote in the string? Then make this a - * double double-quote per SQL99. Before, we put in a - * backslash/double-quote pair. - thomas 2000-08-05 - */ - if (*cp == '"') - appendPQExpBufferChar(id_return, '"'); - appendPQExpBufferChar(id_return, *cp); + int charlen; + + /* Fast path for plain ASCII */ + if (!IS_HIGHBIT_SET(*cp)) + { + /* + * Did we find a double-quote in the string? Then make this a + * double double-quote per SQL99. Before, we put in a + * backslash/double-quote pair. - thomas 2000-08-05 + */ + if (*cp == '"') + appendPQExpBufferChar(id_return, '"'); + appendPQExpBufferChar(id_return, *cp); + remaining--; + cp++; + continue; + } + + /* Slow path for possible multibyte characters */ + charlen = pg_encoding_mblen(encoding, cp); + + if (remaining < charlen) + { + /* + * If the character is longer than the available input, + * replace the string with an invalid sequence. The invalid + * sequence ensures that the escaped string will trigger an + * error on the server-side, even if we can't directly report + * an error here. + */ + enlargePQExpBuffer(id_return, 2); + pg_encoding_set_invalid(encoding, + id_return->data + id_return->len); + id_return->len += 2; + id_return->data[id_return->len] = '\0'; + + /* there's no more input data, so we can stop */ + break; + } + else if (pg_encoding_verifymbchar(encoding, cp, charlen) == -1) + { + /* + * Multibyte character is invalid. It's important to verify + * that as invalid multi-byte characters could e.g. be used to + * "skip" over quote characters, e.g. when parsing + * character-by-character. + * + * Replace the bytes corresponding to the invalid character + * with an invalid sequence, for the same reason as above. + * + * It would be a bit faster to verify the whole string the + * first time we encounter a set highbit, but this way we can + * replace just the invalid characters, which probably makes + * it easier for users to find the invalidly encoded portion + * of a larger string. + */ + enlargePQExpBuffer(id_return, 2); + pg_encoding_set_invalid(encoding, + id_return->data + id_return->len); + id_return->len += 2; + id_return->data[id_return->len] = '\0'; + + /* + * Copy the rest of the string after the invalid multi-byte + * character. + */ + remaining -= charlen; + cp += charlen; + } + else + { + for (int i = 0; i < charlen; i++) + { + appendPQExpBufferChar(id_return, *cp); + remaining--; + cp++; + } + } } + appendPQExpBufferChar(id_return, '"'); } @@ -290,6 +365,7 @@ appendStringLiteral(PQExpBuffer buf, const char *str, size_t length = strlen(str); const char *source = str; char *target; + size_t remaining = length; if (!enlargePQExpBuffer(buf, 2 * length + 2)) return; @@ -297,10 +373,10 @@ appendStringLiteral(PQExpBuffer buf, const char *str, target = buf->data + buf->len; *target++ = '\''; - while (*source != '\0') + while (remaining > 0) { char c = *source; - int len; + int charlen; int i; /* Fast path for plain ASCII */ @@ -312,39 +388,65 @@ appendStringLiteral(PQExpBuffer buf, const char *str, /* Copy the character */ *target++ = c; source++; + remaining--; continue; } /* Slow path for possible multibyte characters */ - len = PQmblen(source, encoding); + charlen = PQmblen(source, encoding); - /* Copy the character */ - for (i = 0; i < len; i++) + if (remaining < charlen) { - if (*source == '\0') - break; - *target++ = *source++; - } + /* + * If the character is longer than the available input, replace + * the string with an invalid sequence. The invalid sequence + * ensures that the escaped string will trigger an error on the + * server-side, even if we can't directly report an error here. + * + * We know there's enough space for the invalid sequence because + * the "target" buffer is 2 * length + 2 long, and at worst we're + * replacing a single input byte with two invalid bytes. + */ + pg_encoding_set_invalid(encoding, target); + target += 2; - /* - * If we hit premature end of string (ie, incomplete multibyte - * character), try to pad out to the correct length with spaces. We - * may not be able to pad completely, but we will always be able to - * insert at least one pad space (since we'd not have quoted a - * multibyte character). This should be enough to make a string that - * the server will error out on. - */ - if (i < len) + /* there's no more valid input data, so we can stop */ + break; + } + else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1) { - char *stop = buf->data + buf->maxlen - 2; + /* + * Multibyte character is invalid. It's important to verify that + * as invalid multi-byte characters could e.g. be used to "skip" + * over quote characters, e.g. when parsing + * character-by-character. + * + * Replace the bytes corresponding to the invalid character with + * an invalid sequence, for the same reason as above. + * + * It would be a bit faster to verify the whole string the first + * time we encounter a set highbit, but this way we can replace + * just the invalid characters, which probably makes it easier for + * users to find the invalidly encoded portion of a larger string. + */ + pg_encoding_set_invalid(encoding, target); + target += 2; + remaining -= charlen; - for (; i < len; i++) + /* + * Copy the rest of the string after the invalid multi-byte + * character. + */ + source += charlen; + } + else + { + /* Copy the character */ + for (i = 0; i < charlen; i++) { - if (target >= stop) - break; - *target++ = ' '; + *target++ = *source++; + remaining--; } - break; } } diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c index 0d224a8524e..14fc5c77793 100644 --- a/src/interfaces/libpq/fe-exec.c +++ b/src/interfaces/libpq/fe-exec.c @@ -4075,15 +4075,15 @@ PQescapeStringInternal(PGconn *conn, { const char *source = from; char *target = to; - size_t remaining = length; + size_t remaining = strnlen(from, length); if (error) *error = 0; - while (remaining > 0 && *source != '\0') + while (remaining > 0) { char c = *source; - int len; + int charlen; int i; /* Fast path for plain ASCII */ @@ -4100,38 +4100,76 @@ PQescapeStringInternal(PGconn *conn, } /* Slow path for possible multibyte characters */ - len = pg_encoding_mblen(encoding, source); + charlen = pg_encoding_mblen(encoding, source); - /* Copy the character */ - for (i = 0; i < len; i++) + if (remaining < charlen) { - if (remaining == 0 || *source == '\0') - break; - *target++ = *source++; - remaining--; - } + /* + * If the character is longer than the available input, report an + * error if possible, and replace the string with an invalid + * sequence. The invalid sequence ensures that the escaped string + * will trigger an error on the server-side, even if we can't + * directly report an error here. + * + * This isn't *that* crucial when we can report an error to the + * caller, but if we can't, the caller will use this string + * unmodified and it needs to be safe for parsing. + * + * We know there's enough space for the invalid sequence because + * the "to" buffer needs to be at least 2 * length + 1 long, and + * at worst we're replacing a single input byte with two invalid + * bytes. + */ + if (error) + *error = 1; + if (conn) + libpq_append_conn_error(conn, "incomplete multibyte character"); - /* - * If we hit premature end of string (ie, incomplete multibyte - * character), try to pad out to the correct length with spaces. We - * may not be able to pad completely, but we will always be able to - * insert at least one pad space (since we'd not have quoted a - * multibyte character). This should be enough to make a string that - * the server will error out on. - */ - if (i < len) + pg_encoding_set_invalid(encoding, target); + target += 2; + + /* there's no more input data, so we can stop */ + break; + } + else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1) { + /* + * Multibyte character is invalid. It's important to verify that + * as invalid multi-byte characters could e.g. be used to "skip" + * over quote characters, e.g. when parsing + * character-by-character. + * + * Replace the bytes corresponding to the invalid character with + * an invalid sequence, for the same reason as above. + * + * It would be a bit faster to verify the whole string the first + * time we encounter a set highbit, but this way we can replace + * just the invalid characters, which probably makes it easier for + * users to find the invalidly encoded portion of a larger string. + */ if (error) *error = 1; if (conn) - libpq_append_conn_error(conn, "incomplete multibyte character"); - for (; i < len; i++) + libpq_append_conn_error(conn, "invalid multibyte character"); + + pg_encoding_set_invalid(encoding, target); + target += 2; + remaining -= charlen; + + /* + * Copy the rest of the string after the invalid multi-byte + * character. + */ + source += charlen; + } + else + { + /* Copy the character */ + for (i = 0; i < charlen; i++) { - if (((size_t) (target - to)) / 2 >= length) - break; - *target++ = ' '; + *target++ = *source++; + remaining--; } - break; } } @@ -4186,9 +4224,10 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident) char *rp; int num_quotes = 0; /* single or double, depending on as_ident */ int num_backslashes = 0; - int input_len; - int result_size; + size_t input_len = strlen(str); + size_t result_size; char quote_char = as_ident ? '"' : '\''; + bool validated_mb = false; /* We must have a connection, else fail immediately. */ if (!conn) @@ -4197,8 +4236,12 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident) if (conn->cmd_queue_head == NULL) pqClearConnErrorState(conn); - /* Scan the string for characters that must be escaped. */ - for (s = str; (s - str) < len && *s != '\0'; ++s) + /* + * Scan the string for characters that must be escaped and for invalidly + * encoded data. + */ + s = str; + for (size_t remaining = input_len; remaining > 0; remaining--, s++) { if (*s == quote_char) ++num_quotes; @@ -4211,20 +4254,41 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident) /* Slow path for possible multibyte characters */ charlen = pg_encoding_mblen(conn->client_encoding, s); - /* Multibyte character overruns allowable length. */ - if ((s - str) + charlen > len || memchr(s, 0, charlen) != NULL) + if (charlen > remaining) { + /* Multibyte character overruns allowable length. */ libpq_append_conn_error(conn, "incomplete multibyte character"); return NULL; } + /* + * If we haven't already, check that multibyte characters are + * valid. It's important to verify that as invalid multi-byte + * characters could e.g. be used to "skip" over quote characters, + * e.g. when parsing character-by-character. + * + * We check validity once, for the whole remainder of the string, + * when we first encounter any multi-byte character. Some + * encodings have optimized implementations for longer strings. + */ + if (!validated_mb) + { + if (pg_encoding_verifymbstr(conn->client_encoding, s, remaining) + != strlen(s)) + { + libpq_append_conn_error(conn, "invalid multibyte character"); + return NULL; + } + validated_mb = true; + } + /* Adjust s, bearing in mind that for loop will increment it. */ s += charlen - 1; + remaining -= charlen - 1; } } /* Allocate output buffer. */ - input_len = s - str; result_size = input_len + num_quotes + 3; /* two quotes, plus a NUL */ if (!as_ident && num_backslashes > 0) result_size += num_backslashes + 2; @@ -4269,7 +4333,8 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident) } else { - for (s = str; s - str < input_len; ++s) + s = str; + for (size_t remaining = input_len; remaining > 0; remaining--, s++) { if (*s == quote_char || (!as_ident && *s == '\\')) { @@ -4287,6 +4352,7 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident) *rp++ = *s; if (--i == 0) break; + remaining--; ++s; /* for loop will provide the final increment */ } } |