1 files changed, 244 insertions, 209 deletions
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index c16bfbca933..0566abd314d 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -82,6 +82,10 @@
 #include <wctype.h>
 #endif
 
+#ifdef USE_ICU
+#include <unicode/ustring.h>
+#endif
+
 #include "catalog/pg_collation.h"
 #include "mb/pg_wchar.h"
 #include "utils/builtins.h"
@@ -1443,6 +1447,42 @@ str_numth(char *dest, char *num, int type)
  *			upper/lower/initcap functions
  *****************************************************************************/
 
+#ifdef USE_ICU
+static int32_t
+icu_convert_case(int32_t (*func)(UChar *, int32_t, const UChar *, int32_t, const char *, UErrorCode *),
+				 pg_locale_t mylocale, UChar **buff_dest, UChar *buff_source, int32_t len_source)
+{
+	UErrorCode	status;
+	int32_t		len_dest;
+
+	len_dest = len_source;  /* try first with same length */
+	*buff_dest = palloc(len_dest * sizeof(**buff_dest));
+	status = U_ZERO_ERROR;
+	len_dest = func(*buff_dest, len_dest, buff_source, len_source, mylocale->info.icu.locale, &status);
+	if (status == U_BUFFER_OVERFLOW_ERROR)
+	{
+		/* try again with adjusted length */
+		pfree(buff_dest);
+		buff_dest = palloc(len_dest * sizeof(**buff_dest));
+		status = U_ZERO_ERROR;
+		len_dest = func(*buff_dest, len_dest, buff_source, len_source, mylocale->info.icu.locale, &status);
+	}
+	if (U_FAILURE(status))
+		ereport(ERROR,
+				(errmsg("case conversion failed: %s", u_errorName(status))));
+	return len_dest;
+}
+
+static int32_t
+u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
+						const UChar *src, int32_t srcLength,
+						const char *locale,
+						UErrorCode *pErrorCode)
+{
+	return u_strToTitle(dest, destCapacity, src, srcLength, NULL, locale, pErrorCode);
+}
+#endif
+
 /*
  * If the system provides the needed functions for wide-character manipulation
  * (which are all standardized by C99), then we implement upper/lower/initcap
@@ -1479,12 +1519,9 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
 		result = asc_tolower(buff, nbytes);
 	}
 #ifdef USE_WIDE_UPPER_LOWER
-	else if (pg_database_encoding_max_length() > 1)
+	else
 	{
 		pg_locale_t mylocale = 0;
-		wchar_t    *workspace;
-		size_t		curr_char;
-		size_t		result_size;
 
 		if (collid != DEFAULT_COLLATION_OID)
 		{
@@ -1502,77 +1539,79 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
 			mylocale = pg_newlocale_from_collation(collid);
 		}
 
-		/* Overflow paranoia */
-		if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
+#ifdef USE_ICU
+		if (mylocale && mylocale->provider == COLLPROVIDER_ICU)
+		{
+			int32_t		len_uchar;
+			int32_t		len_conv;
+			UChar	   *buff_uchar;
+			UChar	   *buff_conv;
+
+			len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes);
+			len_conv = icu_convert_case(u_strToLower, mylocale, &buff_conv, buff_uchar, len_uchar);
+			icu_from_uchar(&result, buff_conv, len_conv);
+		}
+		else
+#endif
+		{
+			if (pg_database_encoding_max_length() > 1)
+			{
+				wchar_t    *workspace;
+				size_t		curr_char;
+				size_t		result_size;
 
-		/* Output workspace cannot have more codes than input bytes */
-		workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
+				/* Overflow paranoia */
+				if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("out of memory")));
 
-		char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
+				/* Output workspace cannot have more codes than input bytes */
+				workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
 
-		for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
-		{
+				char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
+
+				for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+				{
 #ifdef HAVE_LOCALE_T
-			if (mylocale)
-				workspace[curr_char] = towlower_l(workspace[curr_char], mylocale);
-			else
+					if (mylocale)
+						workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt);
+					else
 #endif
-				workspace[curr_char] = towlower(workspace[curr_char]);
-		}
+						workspace[curr_char] = towlower(workspace[curr_char]);
+				}
 
-		/* Make result large enough; case change might change number of bytes */
-		result_size = curr_char * pg_database_encoding_max_length() + 1;
-		result = palloc(result_size);
+				/* Make result large enough; case change might change number of bytes */
+				result_size = curr_char * pg_database_encoding_max_length() + 1;
+				result = palloc(result_size);
 
-		wchar2char(result, workspace, result_size, mylocale);
-		pfree(workspace);
-	}
+				wchar2char(result, workspace, result_size, mylocale);
+				pfree(workspace);
+			}
 #endif   /* USE_WIDE_UPPER_LOWER */
-	else
-	{
-#ifdef HAVE_LOCALE_T
-		pg_locale_t mylocale = 0;
-#endif
-		char	   *p;
-
-		if (collid != DEFAULT_COLLATION_OID)
-		{
-			if (!OidIsValid(collid))
+			else
 			{
-				/*
-				 * This typically means that the parser could not resolve a
-				 * conflict of implicit collations, so report it that way.
-				 */
-				ereport(ERROR,
-						(errcode(ERRCODE_INDETERMINATE_COLLATION),
-						 errmsg("could not determine which collation to use for lower() function"),
-						 errhint("Use the COLLATE clause to set the collation explicitly.")));
-			}
-#ifdef HAVE_LOCALE_T
-			mylocale = pg_newlocale_from_collation(collid);
-#endif
-		}
+				char	   *p;
 
-		result = pnstrdup(buff, nbytes);
+				result = pnstrdup(buff, nbytes);
 
-		/*
-		 * Note: we assume that tolower_l() will not be so broken as to need
-		 * an isupper_l() guard test.  When using the default collation, we
-		 * apply the traditional Postgres behavior that forces ASCII-style
-		 * treatment of I/i, but in non-default collations you get exactly
-		 * what the collation says.
-		 */
-		for (p = result; *p; p++)
-		{
+				/*
+				 * Note: we assume that tolower_l() will not be so broken as to need
+				 * an isupper_l() guard test.  When using the default collation, we
+				 * apply the traditional Postgres behavior that forces ASCII-style
+				 * treatment of I/i, but in non-default collations you get exactly
+				 * what the collation says.
+				 */
+				for (p = result; *p; p++)
+				{
 #ifdef HAVE_LOCALE_T
-			if (mylocale)
-				*p = tolower_l((unsigned char) *p, mylocale);
-			else
+					if (mylocale)
+						*p = tolower_l((unsigned char) *p, mylocale->info.lt);
+					else
 #endif
-				*p = pg_tolower((unsigned char) *p);
+						*p = pg_tolower((unsigned char) *p);
+				}
+			}
 		}
 	}
 
@@ -1599,12 +1638,9 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
 		result = asc_toupper(buff, nbytes);
 	}
 #ifdef USE_WIDE_UPPER_LOWER
-	else if (pg_database_encoding_max_length() > 1)
+	else
 	{
 		pg_locale_t mylocale = 0;
-		wchar_t    *workspace;
-		size_t		curr_char;
-		size_t		result_size;
 
 		if (collid != DEFAULT_COLLATION_OID)
 		{
@@ -1622,77 +1658,78 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
 			mylocale = pg_newlocale_from_collation(collid);
 		}
 
-		/* Overflow paranoia */
-		if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
+#ifdef USE_ICU
+		if (mylocale && mylocale->provider == COLLPROVIDER_ICU)
+		{
+			int32_t		len_uchar, len_conv;
+			UChar	   *buff_uchar;
+			UChar	   *buff_conv;
 
-		/* Output workspace cannot have more codes than input bytes */
-		workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
+			len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes);
+			len_conv = icu_convert_case(u_strToUpper, mylocale, &buff_conv, buff_uchar, len_uchar);
+			icu_from_uchar(&result, buff_conv, len_conv);
+		}
+		else
+#endif
+		{
+			if (pg_database_encoding_max_length() > 1)
+			{
+				wchar_t    *workspace;
+				size_t		curr_char;
+				size_t		result_size;
 
-		char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
+				/* Overflow paranoia */
+				if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("out of memory")));
 
-		for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
-		{
-#ifdef HAVE_LOCALE_T
-			if (mylocale)
-				workspace[curr_char] = towupper_l(workspace[curr_char], mylocale);
-			else
-#endif
-				workspace[curr_char] = towupper(workspace[curr_char]);
-		}
+				/* Output workspace cannot have more codes than input bytes */
+				workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
 
-		/* Make result large enough; case change might change number of bytes */
-		result_size = curr_char * pg_database_encoding_max_length() + 1;
-		result = palloc(result_size);
+				char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
 
-		wchar2char(result, workspace, result_size, mylocale);
-		pfree(workspace);
-	}
-#endif   /* USE_WIDE_UPPER_LOWER */
-	else
-	{
+				for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+				{
 #ifdef HAVE_LOCALE_T
-		pg_locale_t mylocale = 0;
+					if (mylocale)
+						workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt);
+					else
 #endif
-		char	   *p;
+						workspace[curr_char] = towupper(workspace[curr_char]);
+				}
 
-		if (collid != DEFAULT_COLLATION_OID)
-		{
-			if (!OidIsValid(collid))
-			{
-				/*
-				 * This typically means that the parser could not resolve a
-				 * conflict of implicit collations, so report it that way.
-				 */
-				ereport(ERROR,
-						(errcode(ERRCODE_INDETERMINATE_COLLATION),
-						 errmsg("could not determine which collation to use for upper() function"),
-						 errhint("Use the COLLATE clause to set the collation explicitly.")));
+				/* Make result large enough; case change might change number of bytes */
+				result_size = curr_char * pg_database_encoding_max_length() + 1;
+				result = palloc(result_size);
+
+				wchar2char(result, workspace, result_size, mylocale);
+				pfree(workspace);
 			}
-#ifdef HAVE_LOCALE_T
-			mylocale = pg_newlocale_from_collation(collid);
-#endif
-		}
+#endif   /* USE_WIDE_UPPER_LOWER */
+			else
+			{
+				char	   *p;
 
-		result = pnstrdup(buff, nbytes);
+				result = pnstrdup(buff, nbytes);
 
-		/*
-		 * Note: we assume that toupper_l() will not be so broken as to need
-		 * an islower_l() guard test.  When using the default collation, we
-		 * apply the traditional Postgres behavior that forces ASCII-style
-		 * treatment of I/i, but in non-default collations you get exactly
-		 * what the collation says.
-		 */
-		for (p = result; *p; p++)
-		{
+				/*
+				 * Note: we assume that toupper_l() will not be so broken as to need
+				 * an islower_l() guard test.  When using the default collation, we
+				 * apply the traditional Postgres behavior that forces ASCII-style
+				 * treatment of I/i, but in non-default collations you get exactly
+				 * what the collation says.
+				 */
+				for (p = result; *p; p++)
+				{
 #ifdef HAVE_LOCALE_T
-			if (mylocale)
-				*p = toupper_l((unsigned char) *p, mylocale);
-			else
+					if (mylocale)
+						*p = toupper_l((unsigned char) *p, mylocale->info.lt);
+					else
 #endif
-				*p = pg_toupper((unsigned char) *p);
+						*p = pg_toupper((unsigned char) *p);
+				}
+			}
 		}
 	}
 
@@ -1720,12 +1757,9 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 		result = asc_initcap(buff, nbytes);
 	}
 #ifdef USE_WIDE_UPPER_LOWER
-	else if (pg_database_encoding_max_length() > 1)
+	else
 	{
 		pg_locale_t mylocale = 0;
-		wchar_t    *workspace;
-		size_t		curr_char;
-		size_t		result_size;
 
 		if (collid != DEFAULT_COLLATION_OID)
 		{
@@ -1743,100 +1777,101 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 			mylocale = pg_newlocale_from_collation(collid);
 		}
 
-		/* Overflow paranoia */
-		if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
-
-		/* Output workspace cannot have more codes than input bytes */
-		workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
-
-		char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
-
-		for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+#ifdef USE_ICU
+		if (mylocale && mylocale->provider == COLLPROVIDER_ICU)
 		{
-#ifdef HAVE_LOCALE_T
-			if (mylocale)
-			{
-				if (wasalnum)
-					workspace[curr_char] = towlower_l(workspace[curr_char], mylocale);
-				else
-					workspace[curr_char] = towupper_l(workspace[curr_char], mylocale);
-				wasalnum = iswalnum_l(workspace[curr_char], mylocale);
-			}
-			else
+			int32_t		len_uchar, len_conv;
+			UChar	   *buff_uchar;
+			UChar	   *buff_conv;
+
+			len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes);
+			len_conv = icu_convert_case(u_strToTitle_default_BI, mylocale, &buff_conv, buff_uchar, len_uchar);
+			icu_from_uchar(&result, buff_conv, len_conv);
+		}
+		else
 #endif
+		{
+			if (pg_database_encoding_max_length() > 1)
 			{
-				if (wasalnum)
-					workspace[curr_char] = towlower(workspace[curr_char]);
-				else
-					workspace[curr_char] = towupper(workspace[curr_char]);
-				wasalnum = iswalnum(workspace[curr_char]);
-			}
-		}
+				wchar_t    *workspace;
+				size_t		curr_char;
+				size_t		result_size;
 
-		/* Make result large enough; case change might change number of bytes */
-		result_size = curr_char * pg_database_encoding_max_length() + 1;
-		result = palloc(result_size);
+				/* Overflow paranoia */
+				if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("out of memory")));
 
-		wchar2char(result, workspace, result_size, mylocale);
-		pfree(workspace);
-	}
-#endif   /* USE_WIDE_UPPER_LOWER */
-	else
-	{
-#ifdef HAVE_LOCALE_T
-		pg_locale_t mylocale = 0;
-#endif
-		char	   *p;
+				/* Output workspace cannot have more codes than input bytes */
+				workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
 
-		if (collid != DEFAULT_COLLATION_OID)
-		{
-			if (!OidIsValid(collid))
-			{
-				/*
-				 * This typically means that the parser could not resolve a
-				 * conflict of implicit collations, so report it that way.
-				 */
-				ereport(ERROR,
-						(errcode(ERRCODE_INDETERMINATE_COLLATION),
-						 errmsg("could not determine which collation to use for initcap() function"),
-						 errhint("Use the COLLATE clause to set the collation explicitly.")));
-			}
+				char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
+
+				for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+				{
 #ifdef HAVE_LOCALE_T
-			mylocale = pg_newlocale_from_collation(collid);
+					if (mylocale)
+					{
+						if (wasalnum)
+							workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt);
+						else
+							workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt);
+						wasalnum = iswalnum_l(workspace[curr_char], mylocale->info.lt);
+					}
+					else
 #endif
-		}
+					{
+						if (wasalnum)
+							workspace[curr_char] = towlower(workspace[curr_char]);
+						else
+							workspace[curr_char] = towupper(workspace[curr_char]);
+						wasalnum = iswalnum(workspace[curr_char]);
+					}
+				}
 
-		result = pnstrdup(buff, nbytes);
+				/* Make result large enough; case change might change number of bytes */
+				result_size = curr_char * pg_database_encoding_max_length() + 1;
+				result = palloc(result_size);
 
-		/*
-		 * Note: we assume that toupper_l()/tolower_l() will not be so broken
-		 * as to need guard tests.  When using the default collation, we apply
-		 * the traditional Postgres behavior that forces ASCII-style treatment
-		 * of I/i, but in non-default collations you get exactly what the
-		 * collation says.
-		 */
-		for (p = result; *p; p++)
-		{
-#ifdef HAVE_LOCALE_T
-			if (mylocale)
-			{
-				if (wasalnum)
-					*p = tolower_l((unsigned char) *p, mylocale);
-				else
-					*p = toupper_l((unsigned char) *p, mylocale);
-				wasalnum = isalnum_l((unsigned char) *p, mylocale);
+				wchar2char(result, workspace, result_size, mylocale);
+				pfree(workspace);
 			}
+#endif   /* USE_WIDE_UPPER_LOWER */
 			else
-#endif
 			{
-				if (wasalnum)
-					*p = pg_tolower((unsigned char) *p);
-				else
-					*p = pg_toupper((unsigned char) *p);
-				wasalnum = isalnum((unsigned char) *p);
+				char	   *p;
+
+				result = pnstrdup(buff, nbytes);
+
+				/*
+				 * Note: we assume that toupper_l()/tolower_l() will not be so broken
+				 * as to need guard tests.  When using the default collation, we apply
+				 * the traditional Postgres behavior that forces ASCII-style treatment
+				 * of I/i, but in non-default collations you get exactly what the
+				 * collation says.
+				 */
+				for (p = result; *p; p++)
+				{
+#ifdef HAVE_LOCALE_T
+					if (mylocale)
+					{
+						if (wasalnum)
+							*p = tolower_l((unsigned char) *p, mylocale->info.lt);
+						else
+							*p = toupper_l((unsigned char) *p, mylocale->info.lt);
+						wasalnum = isalnum_l((unsigned char) *p, mylocale->info.lt);
+					}
+					else
+#endif
+					{
+						if (wasalnum)
+							*p = pg_tolower((unsigned char) *p);
+						else
+							*p = pg_toupper((unsigned char) *p);
+						wasalnum = isalnum((unsigned char) *p);
+					}
+				}
 			}
 		}
 	}