aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2008-03-19 02:41:15 +0000
committerTom Lane <tgl@sss.pgh.pa.us>2008-03-19 02:41:15 +0000
commita1453f121bd44aa1b37ac213e2716640260a75eb (patch)
tree7b4084fedd474ea6dc214b307f1b6e92c15c6070 /src
parent0d52d7a71efd0f090b62f3bfa77bd0060aa560d5 (diff)
downloadpostgresql-a1453f121bd44aa1b37ac213e2716640260a75eb.tar.gz
postgresql-a1453f121bd44aa1b37ac213e2716640260a75eb.zip
Fix regexp substring matching (substring(string from pattern)) for the corner
case where there is a match to the pattern overall but the user has specified a parenthesized subexpression and that subexpression hasn't got a match. An example is substring('foo' from 'foo(bar)?'). This should return NULL, since (bar) isn't matched, but it was mistakenly returning the whole-pattern match instead (ie, 'foo'). Per bug #4044 from Rui Martins. This has been broken since the beginning; patch in all supported versions. The old behavior was sufficiently inconsistent that it's impossible to believe anyone is depending on it.
Diffstat (limited to 'src')
-rw-r--r--src/backend/utils/adt/regexp.c207
1 files changed, 119 insertions, 88 deletions
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 37a82eed4fd..11dff457d6f 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.49.4.4 2007/01/03 22:39:56 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.49.4.5 2008/03/19 02:41:15 tgl Exp $
*
* Alistair Crooks added the code for the regex caching
* agc - cached the regular expressions used - there's a good chance
@@ -80,42 +80,31 @@ static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */
/*
- * RE_compile_and_execute - compile and execute a RE, caching if possible
+ * RE_compile_and_cache - compile a RE, caching if possible
*
- * Returns TRUE on match, FALSE on no match
+ * Returns regex_t *
*
* text_re --- the pattern, expressed as an *untoasted* TEXT object
- * dat --- the data to match against (need not be null-terminated)
- * dat_len --- the length of the data string
* cflags --- compile options for the pattern
- * nmatch, pmatch --- optional return area for match details
*
- * Both pattern and data are given in the database encoding. We internally
- * convert to array of pg_wchar which is what Spencer's regex package wants.
+ * Pattern is given in the database encoding. We internally convert to
+ * array of pg_wchar which is what Spencer's regex package wants.
*/
-static bool
-RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
- int cflags, int nmatch, regmatch_t *pmatch)
+static regex_t *
+RE_compile_and_cache(text *text_re, int cflags)
{
int text_re_len = VARSIZE(text_re);
- pg_wchar *data;
- size_t data_len;
pg_wchar *pattern;
size_t pattern_len;
int i;
int regcomp_result;
- int regexec_result;
cached_re_str re_temp;
char errMsg[100];
- /* Convert data string to wide characters */
- data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
- data_len = pg_mb2wchar_with_len(dat, data, dat_len);
-
/*
* Look for a match among previously compiled REs. Since the data
- * structure is self-organizing with most-used entries at the front,
- * our search strategy can just be to scan from the front.
+ * structure is self-organizing with most-used entries at the front, our
+ * search strategy can just be to scan from the front.
*/
for (i = 0; i < num_res; i++)
{
@@ -133,28 +122,7 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
re_array[0] = re_temp;
}
- /* Perform RE match and return result */
- regexec_result = pg_regexec(&re_array[0].cre_re,
- data,
- data_len,
- NULL, /* no details */
- nmatch,
- pmatch,
- 0);
-
- pfree(data);
-
- if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
- {
- /* re failed??? */
- pg_regerror(regexec_result, &re_array[0].cre_re,
- errMsg, sizeof(errMsg));
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
- errmsg("regular expression failed: %s", errMsg)));
- }
-
- return (regexec_result == REG_OKAY);
+ return &re_array[0].cre_re;
}
}
@@ -165,7 +133,7 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
/* Convert pattern string to wide characters */
pattern = (pg_wchar *) palloc((text_re_len - VARHDRSZ + 1) * sizeof(pg_wchar));
- pattern_len = pg_mb2wchar_with_len((unsigned char *) VARDATA(text_re),
+ pattern_len = pg_mb2wchar_with_len(VARDATA(text_re),
pattern,
text_re_len - VARHDRSZ);
@@ -202,8 +170,8 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
re_temp.cre_flags = cflags;
/*
- * Okay, we have a valid new item in re_temp; insert it into the
- * storage array. Discard last entry if needed.
+ * Okay, we have a valid new item in re_temp; insert it into the storage
+ * array. Discard last entry if needed.
*/
if (num_res >= MAX_CACHED_RES)
{
@@ -219,8 +187,37 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
re_array[0] = re_temp;
num_res++;
+ return &re_array[0].cre_re;
+}
+
+/*
+ * RE_execute - execute a RE
+ *
+ * Returns TRUE on match, FALSE on no match
+ *
+ * re --- the compiled pattern as returned by RE_compile_and_cache
+ * dat --- the data to match against (need not be null-terminated)
+ * dat_len --- the length of the data string
+ * nmatch, pmatch --- optional return area for match details
+ *
+ * Data is given in the database encoding. We internally
+ * convert to array of pg_wchar which is what Spencer's regex package wants.
+ */
+static bool
+RE_execute(regex_t *re, char *dat, int dat_len,
+ int nmatch, regmatch_t *pmatch)
+{
+ pg_wchar *data;
+ size_t data_len;
+ int regexec_result;
+ char errMsg[100];
+
+ /* Convert data string to wide characters */
+ data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
+ data_len = pg_mb2wchar_with_len(dat, data, dat_len);
+
/* Perform RE match and return result */
- regexec_result = pg_regexec(&re_array[0].cre_re,
+ regexec_result = pg_regexec(re,
data,
data_len,
NULL, /* no details */
@@ -233,8 +230,7 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
{
/* re failed??? */
- pg_regerror(regexec_result, &re_array[0].cre_re,
- errMsg, sizeof(errMsg));
+ pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
ereport(ERROR,
(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
errmsg("regular expression failed: %s", errMsg)));
@@ -243,6 +239,33 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
return (regexec_result == REG_OKAY);
}
+/*
+ * RE_compile_and_execute - compile and execute a RE
+ *
+ * Returns TRUE on match, FALSE on no match
+ *
+ * text_re --- the pattern, expressed as an *untoasted* TEXT object
+ * dat --- the data to match against (need not be null-terminated)
+ * dat_len --- the length of the data string
+ * cflags --- compile options for the pattern
+ * nmatch, pmatch --- optional return area for match details
+ *
+ * Both pattern and data are given in the database encoding. We internally
+ * convert to array of pg_wchar which is what Spencer's regex package wants.
+ */
+static bool
+RE_compile_and_execute(text *text_re, char *dat, int dat_len,
+ int cflags, int nmatch, regmatch_t *pmatch)
+{
+ regex_t *re;
+
+ /* Compile RE */
+ re = RE_compile_and_cache(text_re, cflags);
+
+ /* Perform RE match and return result */
+ return RE_execute(re, dat, dat_len, nmatch, pmatch);
+}
+
/*
* assign_regex_flavor - GUC hook to validate and set REGEX_FLAVOR
@@ -283,7 +306,7 @@ nameregexeq(PG_FUNCTION_ARGS)
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(RE_compile_and_execute(p,
- (unsigned char *) NameStr(*n),
+ NameStr(*n),
strlen(NameStr(*n)),
regex_flavor,
0, NULL));
@@ -296,7 +319,7 @@ nameregexne(PG_FUNCTION_ARGS)
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(!RE_compile_and_execute(p,
- (unsigned char *) NameStr(*n),
+ NameStr(*n),
strlen(NameStr(*n)),
regex_flavor,
0, NULL));
@@ -309,7 +332,7 @@ textregexeq(PG_FUNCTION_ARGS)
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(RE_compile_and_execute(p,
- (unsigned char *) VARDATA(s),
+ VARDATA(s),
VARSIZE(s) - VARHDRSZ,
regex_flavor,
0, NULL));
@@ -322,7 +345,7 @@ textregexne(PG_FUNCTION_ARGS)
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(!RE_compile_and_execute(p,
- (unsigned char *) VARDATA(s),
+ VARDATA(s),
VARSIZE(s) - VARHDRSZ,
regex_flavor,
0, NULL));
@@ -342,7 +365,7 @@ nameicregexeq(PG_FUNCTION_ARGS)
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(RE_compile_and_execute(p,
- (unsigned char *) NameStr(*n),
+ NameStr(*n),
strlen(NameStr(*n)),
regex_flavor | REG_ICASE,
0, NULL));
@@ -355,7 +378,7 @@ nameicregexne(PG_FUNCTION_ARGS)
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(!RE_compile_and_execute(p,
- (unsigned char *) NameStr(*n),
+ NameStr(*n),
strlen(NameStr(*n)),
regex_flavor | REG_ICASE,
0, NULL));
@@ -368,7 +391,7 @@ texticregexeq(PG_FUNCTION_ARGS)
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(RE_compile_and_execute(p,
- (unsigned char *) VARDATA(s),
+ VARDATA(s),
VARSIZE(s) - VARHDRSZ,
regex_flavor | REG_ICASE,
0, NULL));
@@ -381,7 +404,7 @@ texticregexne(PG_FUNCTION_ARGS)
text *p = PG_GETARG_TEXT_P(1);
PG_RETURN_BOOL(!RE_compile_and_execute(p,
- (unsigned char *) VARDATA(s),
+ VARDATA(s),
VARSIZE(s) - VARHDRSZ,
regex_flavor | REG_ICASE,
0, NULL));
@@ -397,43 +420,51 @@ textregexsubstr(PG_FUNCTION_ARGS)
{
text *s = PG_GETARG_TEXT_P(0);
text *p = PG_GETARG_TEXT_P(1);
- bool match;
+ regex_t *re;
regmatch_t pmatch[2];
+ int so,
+ eo;
+
+ /* Compile RE */
+ re = RE_compile_and_cache(p, regex_flavor);
/*
- * We pass two regmatch_t structs to get info about the overall match
- * and the match for the first parenthesized subexpression (if any).
- * If there is a parenthesized subexpression, we return what it
- * matched; else return what the whole regexp matched.
+ * We pass two regmatch_t structs to get info about the overall match and
+ * the match for the first parenthesized subexpression (if any). If there
+ * is a parenthesized subexpression, we return what it matched; else
+ * return what the whole regexp matched.
*/
- match = RE_compile_and_execute(p,
- (unsigned char *) VARDATA(s),
- VARSIZE(s) - VARHDRSZ,
- regex_flavor,
- 2, pmatch);
-
- /* match? then return the substring matching the pattern */
- if (match)
- {
- int so,
- eo;
+ if (!RE_execute(re,
+ VARDATA(s), VARSIZE(s) - VARHDRSZ,
+ 2, pmatch))
+ PG_RETURN_NULL(); /* definitely no match */
+ if (re->re_nsub > 0)
+ {
+ /* has parenthesized subexpressions, use the first one */
so = pmatch[1].rm_so;
eo = pmatch[1].rm_eo;
- if (so < 0 || eo < 0)
- {
- /* no parenthesized subexpression */
- so = pmatch[0].rm_so;
- eo = pmatch[0].rm_eo;
- }
-
- return (DirectFunctionCall3(text_substr,
- PointerGetDatum(s),
- Int32GetDatum(so + 1),
- Int32GetDatum(eo - so)));
}
+ else
+ {
+ /* no parenthesized subexpression, use whole match */
+ so = pmatch[0].rm_so;
+ eo = pmatch[0].rm_eo;
+ }
+
+ /*
+ * It is possible to have a match to the whole pattern but no match
+ * for a subexpression; for example 'foo(bar)?' is considered to match
+ * 'foo' but there is no subexpression match. So this extra test for
+ * match failure is not redundant.
+ */
+ if (so < 0 || eo < 0)
+ PG_RETURN_NULL();
- PG_RETURN_NULL();
+ return DirectFunctionCall3(text_substr,
+ PointerGetDatum(s),
+ Int32GetDatum(so + 1),
+ Int32GetDatum(eo - so));
}
/* similar_escape()
@@ -446,7 +477,7 @@ similar_escape(PG_FUNCTION_ARGS)
text *pat_text;
text *esc_text;
text *result;
- unsigned char *p,
+ char *p,
*e,
*r;
int plen,
@@ -477,7 +508,7 @@ similar_escape(PG_FUNCTION_ARGS)
ereport(ERROR,
(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
errmsg("invalid escape string"),
- errhint("Escape string must be empty or one character.")));
+ errhint("Escape string must be empty or one character.")));
}
/*----------
@@ -513,7 +544,7 @@ similar_escape(PG_FUNCTION_ARGS)
while (plen > 0)
{
- unsigned char pchar = *p;
+ char pchar = *p;
if (afterescape)
{
@@ -552,7 +583,7 @@ similar_escape(PG_FUNCTION_ARGS)
*r++ = ')';
*r++ = '$';
- VARATT_SIZEP(result) = r - ((unsigned char *) result);
+ VARATT_SIZEP(result) = r - ((char *) result);
PG_RETURN_TEXT_P(result);
}