Code review for regexp_replace patch. Improve documentation and comments,

fix problems with replacement-string backslashes that aren't followed by one of the expected characters, avoid giving the impression that replace_text_regexp() is meant to be called directly as a SQL function, etc.
author: Tom Lane <tgl@sss.pgh.pa.us> 2005-10-18 20:38:58 +0000
committer: Tom Lane <tgl@sss.pgh.pa.us> 2005-10-18 20:38:58 +0000
commit: 220f2a7d15adc6ca811a3cbe5fee79ce4904cd91 (patch)
tree: f71baba620e60bdc4fcff8b3a008590d17e5ede1 /src
parent: 800af89004f1a9e816b8df644c31b3fa2a247f92 (diff)
download: postgresql-220f2a7d15adc6ca811a3cbe5fee79ce4904cd91.tar.gz
postgresql-220f2a7d15adc6ca811a3cbe5fee79ce4904cd91.zip
3 files changed, 100 insertions, 85 deletions
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index a872762c3c2..ce04ce77e67 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.59 2005/10/15 02:49:29 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.60 2005/10/18 20:38:58 tgl Exp $
  *
  *		Alistair Crooks added the code for the regex caching
  *		agc - cached the regular expressions used - there's a good chance
@@ -83,7 +83,7 @@ static cached_re_str re_array[MAX_CACHED_RES];	/* cached re's */
 /*
  * RE_compile_and_cache - compile a RE, caching if possible
  *
- * Returns regex_t
+ * Returns regex_t *
  *
  *	text_re --- the pattern, expressed as an *untoasted* TEXT object
  *	cflags --- compile options for the pattern
@@ -91,7 +91,7 @@ static cached_re_str re_array[MAX_CACHED_RES];	/* cached re's */
  * Pattern is given in the database encoding.  We internally convert to
  * array of pg_wchar which is what Spencer's regex package wants.
  */
-static regex_t
+static regex_t *
 RE_compile_and_cache(text *text_re, int cflags)
 {
 	int			text_re_len = VARSIZE(text_re);
@@ -123,7 +123,7 @@ RE_compile_and_cache(text *text_re, int cflags)
 				re_array[0] = re_temp;
 			}
 
-			return re_array[0].cre_re;
+			return &re_array[0].cre_re;
 		}
 	}
 
@@ -188,7 +188,7 @@ RE_compile_and_cache(text *text_re, int cflags)
 	re_array[0] = re_temp;
 	num_res++;
 
-	return re_array[0].cre_re;
+	return &re_array[0].cre_re;
 }
 
 /*
@@ -212,7 +212,7 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
 	pg_wchar   *data;
 	size_t		data_len;
 	int			regexec_result;
-	regex_t		re;
+	regex_t	   *re;
 	char		errMsg[100];
 
 	/* Convert data string to wide characters */
@@ -223,7 +223,7 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
 	re = RE_compile_and_cache(text_re, cflags);
 
 	/* Perform RE match and return result */
-	regexec_result = pg_regexec(&re_array[0].cre_re,
+	regexec_result = pg_regexec(re,
 								data,
 								data_len,
 								0,
@@ -237,8 +237,7 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
 	if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
 	{
 		/* re failed??? */
-		pg_regerror(regexec_result, &re_array[0].cre_re,
-					errMsg, sizeof(errMsg));
+		pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
 				 errmsg("regular expression failed: %s", errMsg)));
@@ -442,10 +441,10 @@ textregexsubstr(PG_FUNCTION_ARGS)
 
 /*
  * textregexreplace_noopt()
- *		Return a replace string matched by a regular expression.
- *		This function is a version that doesn't specify the option of
- *		textregexreplace. This is case sensitive, replace the first
- *		instance only.
+ *		Return a string matched by a regular expression, with replacement.
+ *
+ * This version doesn't have an option argument: we default to case
+ * sensitive match, replace the first instance only.
  */
 Datum
 textregexreplace_noopt(PG_FUNCTION_ARGS)
@@ -453,20 +452,16 @@ textregexreplace_noopt(PG_FUNCTION_ARGS)
 	text	   *s = PG_GETARG_TEXT_P(0);
 	text	   *p = PG_GETARG_TEXT_P(1);
 	text	   *r = PG_GETARG_TEXT_P(2);
-	regex_t		re;
+	regex_t	   *re;
 
 	re = RE_compile_and_cache(p, regex_flavor);
 
-	return DirectFunctionCall4(replace_text_regexp,
-							   PointerGetDatum(s),
-							   PointerGetDatum(&re),
-							   PointerGetDatum(r),
-							   BoolGetDatum(false));
+	PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, false));
 }
 
 /*
  * textregexreplace()
- *		Return a replace string matched by a regular expression.
+ *		Return a string matched by a regular expression, with replacement.
  */
 Datum
 textregexreplace(PG_FUNCTION_ARGS)
@@ -478,9 +473,9 @@ textregexreplace(PG_FUNCTION_ARGS)
 	char	   *opt_p = VARDATA(opt);
 	int			opt_len = (VARSIZE(opt) - VARHDRSZ);
 	int			i;
-	bool global = false;
+	bool		glob = false;
 	bool		ignorecase = false;
-	regex_t		re;
+	regex_t	   *re;
 
 	/* parse options */
 	for (i = 0; i < opt_len; i++)
@@ -491,8 +486,7 @@ textregexreplace(PG_FUNCTION_ARGS)
 				ignorecase = true;
 				break;
 			case 'g':
-				global = true;
-
+				glob = true;
 				break;
 			default:
 				ereport(ERROR,
@@ -508,11 +502,7 @@ textregexreplace(PG_FUNCTION_ARGS)
 	else
 		re = RE_compile_and_cache(p, regex_flavor);
 
-	return DirectFunctionCall4(replace_text_regexp,
-							   PointerGetDatum(s),
-							   PointerGetDatum(&re),
-							   PointerGetDatum(r),
-							   BoolGetDatum(global));
+	PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, glob));
 }
 
 /* similar_escape()
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 5fc3128360c..3f67403ba44 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.137 2005/10/17 16:24:19 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.138 2005/10/18 20:38:58 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -24,11 +24,11 @@
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "parser/scansup.h"
+#include "regex/regex.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/lsyscache.h"
 #include "utils/pg_locale.h"
-#include "regex/regex.h"
 
 
 typedef struct varlena unknown;
@@ -2066,7 +2066,8 @@ replace_text(PG_FUNCTION_ARGS)
 
 /*
  * check_replace_text_has_escape_char
- * check whether replace_text has escape char.
+ *
+ * check whether replace_text contains escape char.
  */
 static bool
 check_replace_text_has_escape_char(const text *replace_text)
@@ -2077,14 +2078,18 @@ check_replace_text_has_escape_char(const text *replace_text)
 	if (pg_database_encoding_max_length() == 1)
 	{
 		for (; p < p_end; p++)
+		{
 			if (*p == '\\')
 				return true;
+		}
 	}
 	else
 	{
 		for (; p < p_end; p += pg_mblen(p))
+		{
 			if (*p == '\\')
 				return true;
+		}
 	}
 
 	return false;
@@ -2092,7 +2097,9 @@ check_replace_text_has_escape_char(const text *replace_text)
 
 /*
  * appendStringInfoRegexpSubstr
- * append string by using back references of regexp.
+ *
+ * Append replace_text to str, substituting regexp back references for
+ * \n escapes.
  */
 static void
 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
@@ -2100,50 +2107,41 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
 {
 	const char *p = VARDATA(replace_text);
 	const char *p_end = p + (VARSIZE(replace_text) - VARHDRSZ);
-
 	int			eml = pg_database_encoding_max_length();
 
-	int			substr_start = 1;
-	int			ch_cnt;
-
-	int			so;
-	int			eo;
-
-	while (1)
+	for (;;)
 	{
-		/* Find escape char. */
-		ch_cnt = 0;
+		const char *chunk_start = p;
+		int			so;
+		int			eo;
+
+		/* Find next escape char. */
 		if (eml == 1)
 		{
 			for (; p < p_end && *p != '\\'; p++)
-				ch_cnt++;
+				/* nothing */ ;
 		}
 		else
 		{
 			for (; p < p_end && *p != '\\'; p += pg_mblen(p))
-				ch_cnt++;
+				/* nothing */ ;
 		}
 
-		/*
-		 * Copy the text when there is a text in the left of escape char or
-		 * escape char is not found.
-		 */
-		if (ch_cnt)
-		{
-			text	   *append_text = text_substring(PointerGetDatum(replace_text),
-												substr_start, ch_cnt, false);
-
-			appendStringInfoText(str, append_text);
-			pfree(append_text);
-		}
-		substr_start += ch_cnt + 1;
+		/* Copy the text we just scanned over, if any. */
+		if (p > chunk_start)
+			appendBinaryStringInfo(str, chunk_start, p - chunk_start);
 
-		if (p >= p_end)			/* When escape char is not found. */
+		/* Done if at end of string, else advance over escape char. */
+		if (p >= p_end)
 			break;
-
-		/* See the next character of escape char. */
 		p++;
-		so = eo = -1;
+
+		if (p >= p_end)
+		{
+			/* Escape at very end of input.  Treat same as unexpected char */
+			appendStringInfoChar(str, '\\');
+			break;
+		}
 
 		if (*p >= '1' && *p <= '9')
 		{
@@ -2153,7 +2151,6 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
 			so = pmatch[idx].rm_so;
 			eo = pmatch[idx].rm_eo;
 			p++;
-			substr_start++;
 		}
 		else if (*p == '&')
 		{
@@ -2161,15 +2158,36 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
 			so = pmatch[0].rm_so;
 			eo = pmatch[0].rm_eo;
 			p++;
-			substr_start++;
+		}
+		else if (*p == '\\')
+		{
+			/* \\ means transfer one \ to output. */
+			appendStringInfoChar(str, '\\');
+			p++;
+			continue;
+		}
+		else
+		{
+			/*
+			 * If escape char is not followed by any expected char,
+			 * just treat it as ordinary data to copy.  (XXX would it be
+			 * better to throw an error?)
+			 */
+			appendStringInfoChar(str, '\\');
+			continue;
 		}
 
 		if (so != -1 && eo != -1)
 		{
-			/* Copy the text that is back reference of regexp. */
-			text	   *append_text = text_substring(PointerGetDatum(src_text),
-												   so + 1, (eo - so), false);
+			/*
+			 * Copy the text that is back reference of regexp.  Because so and
+			 * eo are counted in characters not bytes, it's easiest to use
+			 * text_substring to pull out the correct chunk of text.
+			 */
+			text	   *append_text;
 
+			append_text = text_substring(PointerGetDatum(src_text),
+										 so + 1, (eo - so), false);
 			appendStringInfoText(str, append_text);
 			pfree(append_text);
 		}
@@ -2180,17 +2198,19 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
 
 /*
  * replace_text_regexp
+ *
  * replace text that matches to regexp in src_text to replace_text.
+ *
+ * Note: to avoid having to include regex.h in builtins.h, we declare
+ * the regexp argument as void *, but really it's regex_t *.
  */
-Datum
-replace_text_regexp(PG_FUNCTION_ARGS)
+text *
+replace_text_regexp(text *src_text, void *regexp,
+					text *replace_text, bool glob)
 {
 	text	   *ret_text;
-	text	   *src_text = PG_GETARG_TEXT_P(0);
+	regex_t    *re = (regex_t *) regexp;
 	int			src_text_len = VARSIZE(src_text) - VARHDRSZ;
-	regex_t    *re = (regex_t *) PG_GETARG_POINTER(1);
-	text	   *replace_text = PG_GETARG_TEXT_P(2);
-	bool global = PG_GETARG_BOOL(3);
 	StringInfo	str = makeStringInfo();
 	int			regexec_result;
 	regmatch_t	pmatch[REGEXP_REPLACE_BACKREF_CNT];
@@ -2233,14 +2253,18 @@ replace_text_regexp(PG_FUNCTION_ARGS)
 			break;
 
 		/*
-		 * Copy the text when there is a text in the left of matched position.
+		 * Copy the text to the left of the match position.  Because we
+		 * are working with character not byte indexes, it's easiest to
+		 * use text_substring to pull out the needed data.
 		 */
 		if (pmatch[0].rm_so - data_pos > 0)
 		{
-			text	   *left_text = text_substring(PointerGetDatum(src_text),
-												   data_pos + 1,
-										  pmatch[0].rm_so - data_pos, false);
+			text	   *left_text;
 
+			left_text = text_substring(PointerGetDatum(src_text),
+									   data_pos + 1,
+									   pmatch[0].rm_so - data_pos,
+									   false);
 			appendStringInfoText(str, left_text);
 			pfree(left_text);
 		}
@@ -2259,7 +2283,7 @@ replace_text_regexp(PG_FUNCTION_ARGS)
 		/*
 		 * When global option is off, replace the first instance only.
 		 */
-		if (!global)
+		if (!glob)
 			break;
 
 		/*
@@ -2270,14 +2294,14 @@ replace_text_regexp(PG_FUNCTION_ARGS)
 	}
 
 	/*
-	 * Copy the text when there is a text at the right of last matched or
-	 * regexp is not matched.
+	 * Copy the text to the right of the last match.
 	 */
 	if (data_pos < data_len)
 	{
-		text	   *right_text = text_substring(PointerGetDatum(src_text),
-												data_pos + 1, -1, true);
+		text	   *right_text;
 
+		right_text = text_substring(PointerGetDatum(src_text),
+									data_pos + 1, -1, true);
 		appendStringInfoText(str, right_text);
 		pfree(right_text);
 	}
@@ -2287,7 +2311,7 @@ replace_text_regexp(PG_FUNCTION_ARGS)
 	pfree(str);
 	pfree(data);
 
-	PG_RETURN_TEXT_P(ret_text);
+	return ret_text;
 }
 
 /*
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index 469d993d04d..1ec358f3849 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.266 2005/10/15 02:49:46 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.267 2005/10/18 20:38:58 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -593,7 +593,8 @@ extern List *textToQualifiedNameList(text *textval);
 extern bool SplitIdentifierString(char *rawstring, char separator,
 					  List **namelist);
 extern Datum replace_text(PG_FUNCTION_ARGS);
-extern Datum replace_text_regexp(PG_FUNCTION_ARGS);
+extern text *replace_text_regexp(text *src_text, void *regexp,
+								 text *replace_text, bool glob);
 extern Datum split_text(PG_FUNCTION_ARGS);
 extern Datum text_to_array(PG_FUNCTION_ARGS);
 extern Datum array_to_text(PG_FUNCTION_ARGS);
author	Tom Lane <tgl@sss.pgh.pa.us>	2005-10-18 20:38:58 +0000
committer	Tom Lane <tgl@sss.pgh.pa.us>	2005-10-18 20:38:58 +0000
commit	220f2a7d15adc6ca811a3cbe5fee79ce4904cd91 (patch)
tree	f71baba620e60bdc4fcff8b3a008590d17e5ede1 /src
parent	800af89004f1a9e816b8df644c31b3fa2a247f92 (diff)
download	postgresql-220f2a7d15adc6ca811a3cbe5fee79ce4904cd91.tar.gz postgresql-220f2a7d15adc6ca811a3cbe5fee79ce4904cd91.zip