Close previously open holes for invalidly encoded data to enter the
database via builtin functions, as recently discussed on -hackers.

chr() now returns a character in the database encoding. For UTF8-encoded databases the argument is treated as a Unicode code point. For other multi-byte encodings the argument must designate a strict ASCII character, or an error is raised, as is also the case if the argument is 0. ascii() is adjusted so that it remains the inverse of chr().

The two-argument form of convert() is gone, and the three-argument form now takes a bytea first argument and returns a bytea. To cover this loss, three new functions are introduced:
  - convert_from(bytea, name) returns text - converts the first argument from the named encoding to the database encoding
  - convert_to(text, name) returns bytea - converts the first argument from the database encoding to the named encoding
  - length(bytea, name) returns int - gives the length of the first argument in characters in the named encoding
author: Andrew Dunstan <andrew@dunslane.net> 2007-09-18 17:41:17 +0000
committer: Andrew Dunstan <andrew@dunslane.net> 2007-09-18 17:41:17 +0000
commit: 55613bf9cd7d6071e43e68ac14bc0243a1027507 (patch)
tree: 9b151f94d94e7dc3aa5988c03867d3f6f6b562ba /src/backend
parent: 8544110042ddf8be29e177e37f53516686a06da2 (diff)
download: postgresql-55613bf9cd7d6071e43e68ac14bc0243a1027507.tar.gz
postgresql-55613bf9cd7d6071e43e68ac14bc0243a1027507.zip
4 files changed, 231 insertions, 38 deletions
diff --git a/src/backend/catalog/pg_conversion.c b/src/backend/catalog/pg_conversion.c
index 262d9f41fb7..7146d0b4f53 100644
--- a/src/backend/catalog/pg_conversion.c
+++ b/src/backend/catalog/pg_conversion.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/pg_conversion.c,v 1.36 2007/02/27 23:48:07 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/pg_conversion.c,v 1.37 2007/09/18 17:41:17 adunstan Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -282,7 +282,10 @@ FindConversion(const char *conname, Oid connamespace)
  * CONVERT <left paren> <character value expression>
  * USING <form-of-use conversion name> <right paren>
  *
- * TEXT convert_using(TEXT string, TEXT conversion_name)
+ * BYTEA convert_using(TEXT string, TEXT conversion_name)
+ *
+ * bytea is returned so we don't give a value that is
+ * not valid in the database encoding.
  */
 Datum
 pg_convert_using(PG_FUNCTION_ARGS)
@@ -344,5 +347,5 @@ pg_convert_using(PG_FUNCTION_ARGS)
 	pfree(result);
 	pfree(str);
 
-	PG_RETURN_TEXT_P(retval);
+	PG_RETURN_BYTEA_P(retval);
 }
diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c
index 9fcd5ae747a..d62315d0f61 100644
--- a/src/backend/utils/adt/oracle_compat.c
+++ b/src/backend/utils/adt/oracle_compat.c
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *	$PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.70 2007/02/27 23:48:08 tgl Exp $
+ *	$PostgreSQL: pgsql/src/backend/utils/adt/oracle_compat.c,v 1.71 2007/09/18 17:41:17 adunstan Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1246,6 +1246,13 @@ translate(PG_FUNCTION_ARGS)
  *
  *	 Returns the decimal representation of the first character from
  *	 string.
+ *   If the string is empty we return 0.
+ *   If the database encoding is UTF8, we return the Unicode codepoint. 
+ *   If the database encoding is any other multi-byte encoding, we
+ *   return the value of the first byte if it is an ASCII character
+ *   (range 1 .. 127), or raise an error.
+ *   For all other encodings we return the value of the first byte,
+ *   (range 1..255).
  *
  ********************************************************************/
 
@@ -1253,11 +1260,57 @@ Datum
 ascii(PG_FUNCTION_ARGS)
 {
 	text	   *string = PG_GETARG_TEXT_P(0);
+	int encoding = GetDatabaseEncoding();
+	unsigned char *data;
 
 	if (VARSIZE(string) <= VARHDRSZ)
 		PG_RETURN_INT32(0);
 
-	PG_RETURN_INT32((int32) *((unsigned char *) VARDATA(string)));
+	data = (unsigned char *) VARDATA(string);
+
+	if (encoding == PG_UTF8 && *data > 127)
+	{
+		/* return the code point for Unicode */
+
+		int result = 0, tbytes = 0, i;
+
+		if (*data >= 0xF0)
+		{
+			result = *data & 0x07;
+			tbytes = 3;
+		}
+		else if (*data >= 0xE0)
+		{
+			result = *data & 0x0F;
+			tbytes = 2;
+		}
+		else
+		{
+			Assert (*data > 0xC0);
+			result = *data & 0x1f;
+			tbytes = 1;
+		}
+
+		Assert (tbytes > 0);
+
+		for (i = 1; i <= tbytes; i++)
+		{
+			Assert ((data[i] & 0xC0) == 0x80);
+			result = (result << 6) + (data[i] & 0x3f);
+		}
+
+		PG_RETURN_INT32(result);
+	}
+	else
+	{
+		if (pg_encoding_max_length(encoding) > 1 && *data > 127)
+			ereport(ERROR,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("requested character too large")));
+
+
+		PG_RETURN_INT32((int32) *data);
+	}
 }
 
 /********************************************************************
@@ -1270,19 +1323,96 @@ ascii(PG_FUNCTION_ARGS)
  *
  * Purpose:
  *
- *	Returns the character having the binary equivalent to val
+ *	Returns the character having the binary equivalent to val.
+ *
+ * For UTF8 we treat the argument as a Unicode code point.
+ * For other multi-byte encodings we raise an error for arguments
+ * outside the strict ASCII range (1..127).
+ *
+ * It's important that we don't ever return a value that is not valid
+ * in the database encoding, so that this doesn't become a way for
+ * invalid data to enter the database.
  *
  ********************************************************************/
 
 Datum
 chr(PG_FUNCTION_ARGS)
 {
-	int32		cvalue = PG_GETARG_INT32(0);
+	uint32		cvalue = PG_GETARG_UINT32(0);
 	text	   *result;
+	int encoding = GetDatabaseEncoding();
+
+	if (encoding == PG_UTF8 && cvalue > 127)
+	{
+		/* for Unicode we treat the argument as a code point */
+		int bytes ;
+		char *wch;
 
-	result = (text *) palloc(VARHDRSZ + 1);
-	SET_VARSIZE(result, VARHDRSZ + 1);
-	*VARDATA(result) = (char) cvalue;
+		/* We only allow valid Unicode code points */
+		if (cvalue > 0x001fffff)
+			ereport(ERROR,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("requested character too large for encoding: %d", 
+							cvalue)));
+
+		if (cvalue > 0xffff)
+			bytes = 4;
+		else if (cvalue > 0x07ff)
+			bytes = 3;
+		else
+			bytes = 2;
+
+		result = (text *) palloc(VARHDRSZ + bytes);
+		SET_VARSIZE(result, VARHDRSZ + bytes);
+		wch = VARDATA(result);
+
+		if (bytes == 2)
+		{
+			wch[0] = 0xC0 | ((cvalue >> 6) & 0x1F);
+			wch[1] = 0x80 | (cvalue & 0x3F);;
+		}
+		else if (bytes == 3)
+		{
+			wch[0] = 0xE0 | ((cvalue >> 12) & 0x0F);
+			wch[1] = 0x80 | ((cvalue >> 6) & 0x3F);
+			wch[2] = 0x80 | (cvalue & 0x3F);
+		}
+		else
+		{
+			wch[0] = 0xF0 | ((cvalue >> 18) & 0x07);
+			wch[1] = 0x80 | ((cvalue >> 12) & 0x3F);
+			wch[2] = 0x80 | ((cvalue >> 6) & 0x3F);
+			wch[3] = 0x80 | (cvalue & 0x3F);
+		}
+		
+	}
+
+	else
+	{
+		bool is_mb;
+
+		/* Error out on arguments that make no sense or that we
+		 * can't validly represent in the encoding.
+		 */
+
+		if (cvalue == 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("null character not permitted")));
+
+		is_mb = pg_encoding_max_length(encoding) > 1;
+
+		if ((is_mb && (cvalue > 255)) || (! is_mb && (cvalue > 127)))
+			ereport(ERROR,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("requested character too large for encoding: %d",
+							cvalue)));
+		
+
+		result = (text *) palloc(VARHDRSZ + 1);
+		SET_VARSIZE(result, VARHDRSZ + 1);
+		*VARDATA(result) = (char) cvalue;
+	}
 
 	PG_RETURN_TEXT_P(result);
 }
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index a466073ca0a..e3ffd370e81 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -4,7 +4,7 @@
  * (currently mule internal code (mic) is used)
  * Tatsuo Ishii
  *
- * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.63 2007/05/28 16:43:24 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.64 2007/09/18 17:41:17 adunstan Exp $
  */
 #include "postgres.h"
 
@@ -292,12 +292,12 @@ pg_do_encoding_conversion(unsigned char *src, int len,
 }
 
 /*
- * Convert string using encoding_nanme. We assume that string's
- * encoding is same as DB encoding.
+ * Convert string using encoding_name. The source
+ * encoding is the DB encoding.
  *
- * TEXT convert(TEXT string, NAME encoding_name) */
+ * BYTEA convert_to(TEXT string, NAME encoding_name) */
 Datum
-pg_convert(PG_FUNCTION_ARGS)
+pg_convert_to(PG_FUNCTION_ARGS)
 {
 	Datum		string = PG_GETARG_DATUM(0);
 	Datum		dest_encoding_name = PG_GETARG_DATUM(1);
@@ -306,7 +306,30 @@ pg_convert(PG_FUNCTION_ARGS)
 	Datum		result;
 
 	result = DirectFunctionCall3(
-				 pg_convert2, string, src_encoding_name, dest_encoding_name);
+				 pg_convert, string, src_encoding_name, dest_encoding_name);
+
+	/* free memory allocated by namein */
+	pfree((void *) src_encoding_name);
+
+	PG_RETURN_BYTEA_P(result);
+}
+
+/*
+ * Convert string using encoding_name. The destination
+ * encoding is the DB encoding.
+ *
+ * TEXT convert_from(BYTEA string, NAME encoding_name) */
+Datum
+pg_convert_from(PG_FUNCTION_ARGS)
+{
+	Datum		string = PG_GETARG_DATUM(0);
+	Datum		src_encoding_name = PG_GETARG_DATUM(1);
+	Datum		dest_encoding_name = DirectFunctionCall1(
+							namein, CStringGetDatum(DatabaseEncoding->name));
+	Datum		result;
+
+	result = DirectFunctionCall3(
+				 pg_convert, string, src_encoding_name, dest_encoding_name);
 
 	/* free memory allocated by namein */
 	pfree((void *) src_encoding_name);
@@ -315,20 +338,20 @@ pg_convert(PG_FUNCTION_ARGS)
 }
 
 /*
- * Convert string using encoding_name.
+ * Convert string using encoding_names.
  *
- * TEXT convert2(TEXT string, NAME src_encoding_name, NAME dest_encoding_name)
+ * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
  */
 Datum
-pg_convert2(PG_FUNCTION_ARGS)
+pg_convert(PG_FUNCTION_ARGS)
 {
-	text	   *string = PG_GETARG_TEXT_P(0);
+	bytea	   *string = PG_GETARG_TEXT_P(0);
 	char	   *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
 	int			src_encoding = pg_char_to_encoding(src_encoding_name);
 	char	   *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
 	int			dest_encoding = pg_char_to_encoding(dest_encoding_name);
 	unsigned char *result;
-	text	   *retval;
+	bytea	   *retval;
 	unsigned char *str;
 	int			len;
 
@@ -343,8 +366,9 @@ pg_convert2(PG_FUNCTION_ARGS)
 				 errmsg("invalid destination encoding name \"%s\"",
 						dest_encoding_name)));
 
-	/* make sure that source string is null terminated */
+	/* make sure that source string is valid and null terminated */
 	len = VARSIZE(string) - VARHDRSZ;
+	pg_verify_mbstr(src_encoding,VARDATA(string),len,false);
 	str = palloc(len + 1);
 	memcpy(str, VARDATA(string), len);
 	*(str + len) = '\0';
@@ -354,8 +378,7 @@ pg_convert2(PG_FUNCTION_ARGS)
 		elog(ERROR, "encoding conversion failed");
 
 	/*
-	 * build text data type structure. we cannot use textin() here, since
-	 * textin assumes that input string encoding is same as database encoding.
+	 * build bytea data type structure.
 	 */
 	len = strlen((char *) result) + VARHDRSZ;
 	retval = palloc(len);
@@ -369,7 +392,28 @@ pg_convert2(PG_FUNCTION_ARGS)
 	/* free memory if allocated by the toaster */
 	PG_FREE_IF_COPY(string, 0);
 
-	PG_RETURN_TEXT_P(retval);
+	PG_RETURN_BYTEA_P(retval);
+}
+
+/*
+ * get the length of the string considered as text in the specified
+ * encoding. Raises an error if the data is not valid in that
+ * encoding.
+ *
+ * INT4 length (BYTEA string, NAME src_encoding_name)
+ */
+Datum
+length_in_encoding(PG_FUNCTION_ARGS)
+{
+	bytea      *string = PG_GETARG_BYTEA_P(0);
+	char	   *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
+	int			src_encoding = pg_char_to_encoding(src_encoding_name);
+	int         len = VARSIZE(string) - VARHDRSZ;
+	int         retval;
+
+	retval = pg_verify_mbstr_len(src_encoding, VARDATA(string), len, false);
+	PG_RETURN_INT32(retval);
+	
 }
 
 /*
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c
index cc8d4b58624..2c98f4b476e 100644
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -1,9 +1,7 @@
 /*
  * conversion functions between pg_wchar and multibyte streams.
  * Tatsuo Ishii
- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.63 2007/07/12 21:17:09 tgl Exp $
- *
- * WIN1250 client encoding updated by Pavel Behal
+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.64 2007/09/18 17:41:17 adunstan Exp $
  *
  */
 /* can be used in either frontend or backend */
@@ -1435,23 +1433,37 @@ pg_database_encoding_max_length(void)
 bool
 pg_verifymbstr(const char *mbstr, int len, bool noError)
 {
-	return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError);
+	return 
+		pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
 }
 
 /*
  * Verify mbstr to make sure that it is validly encoded in the specified
  * encoding.
  *
+ */
+bool
+pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
+{
+	return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
+}
+
+/* 
+ * Verify mbstr to make sure that it is validly encoded in the specified
+ * encoding.
+ *
  * mbstr is not necessarily zero terminated; length of mbstr is
  * specified by len.
  *
- * If OK, return TRUE.	If a problem is found, return FALSE when noError is
+ * If OK, return length of string in the encoding.	
+ * If a problem is found, return -1 when noError is
  * true; when noError is false, ereport() a descriptive message.
- */
-bool
-pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
+ */ 
+int
+pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
 {
 	mbverifier	mbverify;
+	int mb_len;
 
 	Assert(PG_VALID_ENCODING(encoding));
 
@@ -1463,14 +1475,16 @@ pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
 		const char *nullpos = memchr(mbstr, 0, len);
 
 		if (nullpos == NULL)
-			return true;
+			return len;
 		if (noError)
-			return false;
+			return -1;
 		report_invalid_encoding(encoding, nullpos, 1);
 	}
 
 	/* fetch function pointer just once */
 	mbverify = pg_wchar_table[encoding].mbverify;
+	
+	mb_len = 0;
 
 	while (len > 0)
 	{
@@ -1481,12 +1495,13 @@ pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
 		{
 			if (*mbstr != '\0')
 			{
+				mb_len++;
 				mbstr++;
 				len--;
 				continue;
 			}
 			if (noError)
-				return false;
+				return -1;
 			report_invalid_encoding(encoding, mbstr, len);
 		}
 
@@ -1495,14 +1510,15 @@ pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
 		if (l < 0)
 		{
 			if (noError)
-				return false;
+				return -1;
 			report_invalid_encoding(encoding, mbstr, len);
 		}
 
 		mbstr += l;
 		len -= l;
+		mb_len++;
 	}
-	return true;
+	return mb_len;
 }
 
 /*
author	Andrew Dunstan <andrew@dunslane.net>	2007-09-18 17:41:17 +0000
committer	Andrew Dunstan <andrew@dunslane.net>	2007-09-18 17:41:17 +0000
commit	55613bf9cd7d6071e43e68ac14bc0243a1027507 (patch)
tree	9b151f94d94e7dc3aa5988c03867d3f6f6b562ba /src/backend
parent	8544110042ddf8be29e177e37f53516686a06da2 (diff)
download	postgresql-55613bf9cd7d6071e43e68ac14bc0243a1027507.tar.gz postgresql-55613bf9cd7d6071e43e68ac14bc0243a1027507.zip