diff options
Diffstat (limited to 'src/backend')
-rw-r--r-- | src/backend/regex/regexec.c | 10 | ||||
-rw-r--r-- | src/backend/utils/adt/regexp.c | 165 | ||||
-rw-r--r-- | src/backend/utils/adt/varlena.c | 222 |
3 files changed, 349 insertions, 48 deletions
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index 6df4aff05e9..e3bc41fa5e0 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -27,7 +27,7 @@ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $PostgreSQL: pgsql/src/backend/regex/regexec.c,v 1.24 2003/11/29 19:51:55 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/regex/regexec.c,v 1.25 2005/07/10 04:54:30 momjian Exp $ * */ @@ -110,6 +110,7 @@ struct vars regmatch_t *pmatch; rm_detail_t *details; chr *start; /* start of string */ + chr *search_start; /* search start of string */ chr *stop; /* just past end of string */ int err; /* error code if any (0 none) */ regoff_t *mem; /* memory vector for backtracking */ @@ -168,6 +169,7 @@ int pg_regexec(regex_t *re, const chr *string, size_t len, + size_t search_start, rm_detail_t *details, size_t nmatch, regmatch_t pmatch[], @@ -219,6 +221,7 @@ pg_regexec(regex_t *re, v->pmatch = pmatch; v->details = details; v->start = (chr *) string; + v->search_start = (chr *) string + search_start; v->stop = (chr *) string + len; v->err = 0; if (backref) @@ -288,7 +291,8 @@ find(struct vars * v, NOERR(); MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); cold = NULL; - close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *) NULL); + close = shortest(v, s, v->search_start, v->search_start, v->stop, + &cold, (int *) NULL); freedfa(s); NOERR(); if (v->g->cflags & REG_EXPECT) @@ -415,7 +419,7 @@ cfindloop(struct vars * v, assert(d != NULL && s != NULL); cold = NULL; - close = v->start; + close = v->search_start; do { MDEBUG(("\ncsearch at %ld\n", LOFF(close))); diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index d8b038a09a8..b8112a2cb95 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.56 2004/12/31 
22:01:22 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.57 2005/07/10 04:54:30 momjian Exp $ * * Alistair Crooks added the code for the regex caching * agc - cached the regular expressions used - there's a good chance @@ -81,38 +81,27 @@ static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */ /* - * RE_compile_and_execute - compile and execute a RE, caching if possible + * RE_compile_and_cache - compile a RE, caching if possible * - * Returns TRUE on match, FALSE on no match + * Returns regex_t * - * text_re --- the pattern, expressed as an *untoasted* TEXT object - * dat --- the data to match against (need not be null-terminated) - * dat_len --- the length of the data string - * cflags --- compile options for the pattern - * nmatch, pmatch --- optional return area for match details + * text_re --- the pattern, expressed as an *untoasted* TEXT object + * cflags --- compile options for the pattern * - * Both pattern and data are given in the database encoding. We internally - * convert to array of pg_wchar which is what Spencer's regex package wants. + * Pattern is given in the database encoding. We internally convert to + * array of pg_wchar which is what Spencer's regex package wants. */ -static bool -RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len, - int cflags, int nmatch, regmatch_t *pmatch) +static regex_t +RE_compile_and_cache(text *text_re, int cflags) { int text_re_len = VARSIZE(text_re); - pg_wchar *data; - size_t data_len; pg_wchar *pattern; size_t pattern_len; int i; int regcomp_result; - int regexec_result; cached_re_str re_temp; char errMsg[100]; - /* Convert data string to wide characters */ - data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); - data_len = pg_mb2wchar_with_len(dat, data, dat_len); - /* * Look for a match among previously compiled REs. 
Since the data * structure is self-organizing with most-used entries at the front, @@ -134,28 +123,7 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len, re_array[0] = re_temp; } - /* Perform RE match and return result */ - regexec_result = pg_regexec(&re_array[0].cre_re, - data, - data_len, - NULL, /* no details */ - nmatch, - pmatch, - 0); - - pfree(data); - - if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH) - { - /* re failed??? */ - pg_regerror(regexec_result, &re_array[0].cre_re, - errMsg, sizeof(errMsg)); - ereport(ERROR, - (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), - errmsg("regular expression failed: %s", errMsg))); - } - - return (regexec_result == REG_OKAY); + return re_array[0].cre_re; } } @@ -220,10 +188,45 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len, re_array[0] = re_temp; num_res++; + return re_array[0].cre_re; +} + +/* + * RE_compile_and_execute - compile and execute a RE + * + * Returns TRUE on match, FALSE on no match + * + * text_re --- the pattern, expressed as an *untoasted* TEXT object + * dat --- the data to match against (need not be null-terminated) + * dat_len --- the length of the data string + * cflags --- compile options for the pattern + * nmatch, pmatch --- optional return area for match details + * + * Both pattern and data are given in the database encoding. We internally + * convert to array of pg_wchar which is what Spencer's regex package wants. 
+ */
+static bool
+RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
+					   int cflags, int nmatch, regmatch_t *pmatch)
+{
+	pg_wchar   *data;
+	size_t		data_len;
+	int			regexec_result;
+	regex_t		re;
+	char		errMsg[100];
+
+	/* Convert data string to wide characters */
+	data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
+	data_len = pg_mb2wchar_with_len(dat, data, dat_len);
+
+	/* Compile RE */
+	re = RE_compile_and_cache(text_re, cflags);
+
 	/* Perform RE match and return result */
 	regexec_result = pg_regexec(&re_array[0].cre_re,
 								data,
 								data_len,
+								0,
 								NULL,	/* no details */
 								nmatch,
 								pmatch,
@@ -428,15 +431,89 @@ textregexsubstr(PG_FUNCTION_ARGS)
 		eo = pmatch[0].rm_eo;
 	}
 
-	return (DirectFunctionCall3(text_substr,
+	return DirectFunctionCall3(text_substr,
 								PointerGetDatum(s),
 								Int32GetDatum(so + 1),
-								Int32GetDatum(eo - so)));
+								Int32GetDatum(eo - so));
 }
 
 	PG_RETURN_NULL();
 }
 
+/*
+ * textregexreplace_noopt()
+ *	  Replace the first match of a regular expression in a text value.
+ *
+ *	  Arguments are the source text, the pattern and the replacement
+ *	  text.  This is the variant of textregexreplace() that takes no
+ *	  options argument: the pattern is compiled with regex_flavor only
+ *	  (REG_ICASE is not added, so matching is case sensitive) and
+ *	  replace_text_regexp is called with "false" as its fourth argument,
+ *	  so only the first instance is replaced.
+ */
+Datum
+textregexreplace_noopt(PG_FUNCTION_ARGS)
+{
+	text	   *s = PG_GETARG_TEXT_P(0);
+	text	   *p = PG_GETARG_TEXT_P(1);
+	text	   *r = PG_GETARG_TEXT_P(2);
+	regex_t		re;
+
+	re = RE_compile_and_cache(p, regex_flavor);
+
+	return DirectFunctionCall4(replace_text_regexp,
+							   PointerGetDatum(s),
+							   PointerGetDatum(&re),
+							   PointerGetDatum(r),
+							   BoolGetDatum(false));
+}
+
+/*
+ * textregexreplace()
+ *	  Replace text matching a regular expression, honoring an options
+ *	  string.
+ *
+ *	  Arguments are the source text, the pattern, the replacement text
+ *	  and the options.  Every character of the options is examined:
+ *	  'i' adds REG_ICASE to the compile flags, 'g' makes the fourth
+ *	  argument passed to replace_text_regexp true (replace every match
+ *	  rather than only the first).  Any other character is reported
+ *	  with ERRCODE_INVALID_PARAMETER_VALUE.
+ */
+Datum
+textregexreplace(PG_FUNCTION_ARGS)
+{
+	text	   *s = PG_GETARG_TEXT_P(0);
+	text	   *p = PG_GETARG_TEXT_P(1);
+	text	   *r = PG_GETARG_TEXT_P(2);
+	text	   *opt = PG_GETARG_TEXT_P(3);
+	char	   *opt_p = VARDATA(opt);
+	int			opt_len = (VARSIZE(opt) - VARHDRSZ);
+	int			i;
+	bool		global = false;
+	bool		ignorecase = false;
+	regex_t		re;
+
+	/* parse options: each byte of the options text is one flag */
+	for (i = 0; i < opt_len; i++)
+	{
+		switch (opt_p[i])
+		{
+			case 'i':
+				ignorecase = true;
+				break;
+			case 'g':
+				global = true;
+				break;
+			default:
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("invalid option of regexp_replace: %c",
+								opt_p[i])));
+				break;
+		}
+	}
+
+	if (ignorecase)
+		re = RE_compile_and_cache(p, regex_flavor | REG_ICASE);
+	else
+		re = RE_compile_and_cache(p, regex_flavor);
+
+	return DirectFunctionCall4(replace_text_regexp,
+							   PointerGetDatum(s),
+							   PointerGetDatum(&re),
+							   PointerGetDatum(r),
+							   BoolGetDatum(global));
+}
+
 /* similar_escape()
  * Convert a SQL99 regexp pattern to POSIX style, so it can be used by
  * our regexp engine.
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 01cd5d2b7a3..ba40747df41 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.126 2005/07/07 04:36:08 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.127 2005/07/10 04:54:30 momjian Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -28,6 +28,7 @@
 #include "utils/builtins.h"
 #include "utils/lsyscache.h"
 #include "utils/pg_locale.h"
+#include "regex/regex.h"
 
 typedef struct varlena unknown;
 
@@ -1994,6 +1995,225 @@ replace_text(PG_FUNCTION_ARGS)
 }
 
 /*
+ * check_replace_text_has_escape_char
+ * check whether replace_text has escape char.
+ *
+ * Returns true if a backslash is found in the data of replace_text,
+ * false otherwise.  When the database encoding is multibyte the scan
+ * advances by pg_mblen(), so only the first byte of each character is
+ * compared against the backslash.
+ */
+static bool
+check_replace_text_has_escape_char(const text *replace_text)
+{
+	const char *p = VARDATA(replace_text);
+	const char *p_end = p + (VARSIZE(replace_text) - VARHDRSZ);
+
+	if (pg_database_encoding_max_length() == 1)
+	{
+		for (; p < p_end; p++)
+		{
+			if (*p == '\\')
+				return true;
+		}
+	}
+	else
+	{
+		for (; p < p_end; p += pg_mblen(p))
+		{
+			if (*p == '\\')
+				return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * appendStringInfoRegexpSubstr
+ *	append replace_text to str, expanding back references of regexp.
+ *
+ * str			--- output buffer that receives the expanded text
+ * replace_text	--- replacement text, scanned for backslash escapes
+ * pmatch		--- match offsets; entry 0 describes the whole match and
+ *					entries 1..9 the subexpressions named by \1..\9
+ * src_text		--- the text the offsets in pmatch refer to
+ *
+ * "\1" .. "\9" are replaced by the corresponding subexpression and "\&"
+ * by the entire match; an entry whose rm_so or rm_eo is -1 contributes
+ * nothing.  A backslash followed by any other character is dropped and
+ * the following character is then copied as ordinary text.  A backslash
+ * that is the very last character of replace_text is dropped as well.
+ */
+static void
+appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
+							 regmatch_t *pmatch, text *src_text)
+{
+	const char *p = VARDATA(replace_text);
+	const char *p_end = p + (VARSIZE(replace_text) - VARHDRSZ);
+
+	int			eml = pg_database_encoding_max_length();
+
+	/* 1-based character position in replace_text of the next run to copy */
+	int			substr_start = 1;
+	int			ch_cnt;
+
+	int			so;
+	int			eo;
+
+	while (1)
+	{
+		/* Find escape char, counting the characters that precede it. */
+		ch_cnt = 0;
+		if (eml == 1)
+		{
+			for (; p < p_end && *p != '\\'; p++)
+				ch_cnt++;
+		}
+		else
+		{
+			for (; p < p_end && *p != '\\'; p += pg_mblen(p))
+				ch_cnt++;
+		}
+
+		/*
+		 * Copy the literal text found to the left of the escape char, or
+		 * the whole remainder when no escape char was found.
+		 */
+		if (ch_cnt)
+		{
+			text	   *append_text = text_substring(PointerGetDatum(replace_text),
+													 substr_start, ch_cnt, false);
+
+			appendStringInfoString(str, PG_TEXT_GET_STR(append_text));
+			pfree(append_text);
+		}
+		/* Step over the copied run and over the escape char itself. */
+		substr_start += ch_cnt + 1;
+
+		if (p >= p_end)			/* When escape char is not found. */
+			break;
+
+		/* See the next character of escape char. */
+		p++;
+
+		/*
+		 * The escape char may be the last byte of replace_text.  There is
+		 * then no next character to examine; stop here instead of reading
+		 * past the end of the datum.
+		 */
+		if (p >= p_end)
+			break;
+
+		so = eo = -1;
+
+		if (*p >= '1' && *p <= '9')
+		{
+			/* Use the back reference of regexp. */
+			int			idx = *p - '0';
+
+			so = pmatch[idx].rm_so;
+			eo = pmatch[idx].rm_eo;
+			p++;
+			substr_start++;
+		}
+		else if (*p == '&')
+		{
+			/* Use the entire matched string. */
+			so = pmatch[0].rm_so;
+			eo = pmatch[0].rm_eo;
+			p++;
+			substr_start++;
+		}
+
+		if (so != -1 && eo != -1)
+		{
+			/* Copy the part of src_text the back reference points at. */
+			text	   *append_text = text_substring(PointerGetDatum(src_text),
+													 so + 1, (eo - so), false);
+
+			appendStringInfoString(str, PG_TEXT_GET_STR(append_text));
+			pfree(append_text);
+		}
+	}
+}
+
+/* Size of the pmatch array: the whole match plus subexpressions 1..9 */
+#define REGEXP_REPLACE_BACKREF_CNT		10
+
+/*
+ * replace_text_regexp
+ *	replace text that matches to regexp in src_text to replace_text.
+ *
+ * Arguments (fmgr calling convention):
+ *	0 --- src_text, the text to search
+ *	1 --- pointer to an already compiled regex_t
+ *	2 --- replace_text, the replacement (may contain \1..\9 and \&)
+ *	3 --- global; when false only the first match is replaced
+ *
+ * Returns a new text value.  An error is raised when pg_regexec reports
+ * anything other than REG_OKAY or REG_NOMATCH.
+ */
+Datum
+replace_text_regexp(PG_FUNCTION_ARGS)
+{
+	text	   *ret_text;
+	text	   *src_text = PG_GETARG_TEXT_P(0);
+	int			src_text_len = VARSIZE(src_text) - VARHDRSZ;
+	regex_t    *re = (regex_t *)PG_GETARG_POINTER(1);
+	text	   *replace_text = PG_GETARG_TEXT_P(2);
+	bool		global = PG_GETARG_BOOL(3);
+	StringInfo	str = makeStringInfo();
+	int			regexec_result;
+	regmatch_t	pmatch[REGEXP_REPLACE_BACKREF_CNT];
+	pg_wchar   *data;
+	size_t		data_len;
+	int			search_start;	/* where the next pg_regexec search begins */
+	int			data_pos;		/* end of the source text copied so far */
+	bool		have_escape;
+
+	/* Convert data string to wide characters. */
+	data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
+	data_len = pg_mb2wchar_with_len(VARDATA(src_text), data, src_text_len);
+
+	/* Check whether replace_text has escape char. */
+	have_escape = check_replace_text_has_escape_char(replace_text);
+
+	for (search_start = data_pos = 0; search_start <= data_len;)
+	{
+		regexec_result = pg_regexec(re,
+									data,
+									data_len,
+									search_start,
+									NULL,		/* no details */
+									REGEXP_REPLACE_BACKREF_CNT,
+									pmatch,
+									0);
+
+		if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
+		{
+			char		errMsg[100];
+
+			/* re failed??? */
+			pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
+					 errmsg("regular expression failed: %s", errMsg)));
+		}
+
+		if (regexec_result == REG_NOMATCH)
+			break;
+
+		/*
+		 * Copy the unmatched text between the previous copy position and
+		 * the start of this match, if there is any.
+		 */
+		if (pmatch[0].rm_so - data_pos > 0)
+		{
+			text	   *left_text = text_substring(PointerGetDatum(src_text),
+												   data_pos + 1,
+												   pmatch[0].rm_so - data_pos, false);
+
+			appendStringInfoString(str, PG_TEXT_GET_STR(left_text));
+			pfree(left_text);
+		}
+
+		/*
+		 * Copy the replace_text.  Process back references when the
+		 * replace_text has escape characters.
+		 */
+		if (have_escape)
+			appendStringInfoRegexpSubstr(str, replace_text, pmatch, src_text);
+		else
+			appendStringInfoString(str, PG_TEXT_GET_STR(replace_text));
+
+		search_start = data_pos = pmatch[0].rm_eo;
+
+		/*
+		 * When global option is off, replace the first instance only.
+		 */
+		if (!global)
+			break;
+
+		/*
+		 * A zero-width match would be found again at the same position;
+		 * resume the search one character further on.  data_pos is left
+		 * alone so the skipped character is still copied later.
+		 */
+		if (pmatch[0].rm_so == pmatch[0].rm_eo)
+			search_start++;
+	}
+
+	/*
+	 * Copy whatever remains to the right of the last match, or the whole
+	 * source text when the regexp did not match at all.
+	 */
+	if (data_pos < data_len)
+	{
+		text	   *right_text = text_substring(PointerGetDatum(src_text),
+												data_pos + 1, -1, true);
+
+		appendStringInfoString(str, PG_TEXT_GET_STR(right_text));
+		pfree(right_text);
+	}
+
+	ret_text = PG_STR_GET_TEXT(str->data);
+	pfree(str->data);
+	pfree(str);
+	pfree(data);
+
+	PG_RETURN_TEXT_P(ret_text);
+}
+
+/*
  * split_text
  * parse input string
  * return ord item (1 based) |