aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/regexp.c
diff options
context:
space:
mode:
author Jeff Davis <jdavis@postgresql.org> 2014-08-27 21:07:36 -0700
committer Jeff Davis <jdavis@postgresql.org> 2014-08-27 21:07:36 -0700
commit 8167a3883a3c49f2f77785f8e5f638920c9f14ef (patch)
tree 1359c3af2d74e1dbbebc7f34ea509de7ac1745e7 /src/backend/utils/adt/regexp.c
parent 1c9701cfe58267cf5d79543a42ee4f0967cc73ab (diff)
download postgresql-8167a3883a3c49f2f77785f8e5f638920c9f14ef.tar.gz
postgresql-8167a3883a3c49f2f77785f8e5f638920c9f14ef.zip
Allow multibyte characters as escape in SIMILAR TO and SUBSTRING.
Previously, only a single-byte character was allowed as an escape. This patch allows it to be a multi-byte character, though it still must be a single character. Reviewed by Heikki Linnakangas and Tom Lane.
Diffstat (limited to 'src/backend/utils/adt/regexp.c')
-rw-r--r-- src/backend/utils/adt/regexp.c 63
1 files changed, 58 insertions, 5 deletions
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index caf45ef85f9..50b33f6b364 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -688,11 +688,16 @@ similar_escape(PG_FUNCTION_ARGS)
elen = VARSIZE_ANY_EXHDR(esc_text);
if (elen == 0)
e = NULL; /* no escape character */
- else if (elen != 1)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
- errmsg("invalid escape string"),
- errhint("Escape string must be empty or one character.")));
+ else
+ {
+ int escape_mblen = pg_mbstrlen_with_len(e, elen);
+
+ if (escape_mblen > 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+ errmsg("invalid escape string"),
+ errhint("Escape string must be empty or one character.")));
+ }
}
/*----------
@@ -724,6 +729,54 @@ similar_escape(PG_FUNCTION_ARGS)
{
char pchar = *p;
+ /*
+ * If both the escape character and the current character from the
+ * pattern are multi-byte, we need to take the slow path.
+ *
+ * But if one of them is single-byte, we can process the pattern one
+ * byte at a time, ignoring multi-byte characters. (This works
+ * because all server-encodings have the property that a valid
+ * multi-byte character representation cannot contain the
+ * representation of a valid single-byte character.)
+ */
+
+ if (elen > 1)
+ {
+ int mblen = pg_mblen(p);
+ if (mblen > 1)
+ {
+ /* slow, multi-byte path */
+ if (afterescape)
+ {
+ *r++ = '\\';
+ memcpy(r, p, mblen);
+ r += mblen;
+ afterescape = false;
+ }
+ else if (e && elen == mblen && memcmp(e, p, mblen) == 0)
+ {
+ /* SQL99 escape character; do not send to output */
+ afterescape = true;
+ }
+ else
+ {
+ /*
+ * We know it's a multi-byte character, so we don't need
+ * to do all the comparisons to single-byte characters
+ * that we do below.
+ */
+ memcpy(r, p, mblen);
+ r += mblen;
+ }
+
+ p += mblen;
+ plen -= mblen;
+
+ continue;
+ }
+ }
+
+ /* fast path */
if (afterescape)
{
if (pchar == '"' && !incharclass) /* for SUBSTRING patterns */