ICU support

Add a column collprovider to pg_collation that determines which library provides the collation data. The existing choices are default and libc, and this adds an icu choice, which uses the ICU4C library. The pg_locale_t type is changed to a union that contains the provider-specific locale handles. Users of locale information are changed to look into that struct for the appropriate handle to use. Also add a collversion column that records the version of the collation when it is created, and check at run time whether it is still the same. This detects potentially incompatible library upgrades that can corrupt indexes and other structures. This is currently only supported by ICU-provided collations. initdb initializes the default collation set as before from the `locale -a` output but also adds all available ICU locales with a "-x-icu" appended. Currently, ICU-provided collations can only be explicitly named collations. The global database locales are still always libc-provided. ICU support is enabled by configure --with-icu. Reviewed-by: Thomas Munro <thomas.munro@enterprisedb.com> Reviewed-by: Andreas Karlsson <andreas@proxel.se>
author: Peter Eisentraut <peter_e@gmx.net> 2017-03-23 15:25:34 -0400
committer: Peter Eisentraut <peter_e@gmx.net> 2017-03-23 15:28:48 -0400
commit: eccfef81e1f73ee41f1d8bfe4fa4e80576945048 (patch)
tree: 52bd1b2468bcf9682b356cf5b5f6199ae9d80ee4 /src/backend/utils/adt/formatting.c
parent: ea42cc18c35381f639d45628d792e790ff39e271 (diff)
download: postgresql-eccfef81e1f73ee41f1d8bfe4fa4e80576945048.tar.gz
postgresql-eccfef81e1f73ee41f1d8bfe4fa4e80576945048.zip
1 files changed, 244 insertions, 209 deletions
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index c16bfbca933..0566abd314d 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -82,6 +82,10 @@
 #include <wctype.h>
 #endif
 
+#ifdef USE_ICU
+#include <unicode/ustring.h>
+#endif
+
 #include "catalog/pg_collation.h"
 #include "mb/pg_wchar.h"
 #include "utils/builtins.h"
@@ -1443,6 +1447,42 @@ str_numth(char *dest, char *num, int type)
  *			upper/lower/initcap functions
  *****************************************************************************/
 
+#ifdef USE_ICU
+static int32_t
+icu_convert_case(int32_t (*func)(UChar *, int32_t, const UChar *, int32_t, const char *, UErrorCode *),
+				 pg_locale_t mylocale, UChar **buff_dest, UChar *buff_source, int32_t len_source)
+{
+	UErrorCode	status;
+	int32_t		len_dest;
+
+	len_dest = len_source;  /* try first with same length */
+	*buff_dest = palloc(len_dest * sizeof(**buff_dest));
+	status = U_ZERO_ERROR;
+	len_dest = func(*buff_dest, len_dest, buff_source, len_source, mylocale->info.icu.locale, &status);
+	if (status == U_BUFFER_OVERFLOW_ERROR)
+	{
+		/* try again with adjusted length */
+		pfree(buff_dest);
+		buff_dest = palloc(len_dest * sizeof(**buff_dest));
+		status = U_ZERO_ERROR;
+		len_dest = func(*buff_dest, len_dest, buff_source, len_source, mylocale->info.icu.locale, &status);
+	}
+	if (U_FAILURE(status))
+		ereport(ERROR,
+				(errmsg("case conversion failed: %s", u_errorName(status))));
+	return len_dest;
+}
+
+static int32_t
+u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
+						const UChar *src, int32_t srcLength,
+						const char *locale,
+						UErrorCode *pErrorCode)
+{
+	return u_strToTitle(dest, destCapacity, src, srcLength, NULL, locale, pErrorCode);
+}
+#endif
+
 /*
  * If the system provides the needed functions for wide-character manipulation
  * (which are all standardized by C99), then we implement upper/lower/initcap
@@ -1479,12 +1519,9 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
 		result = asc_tolower(buff, nbytes);
 	}
 #ifdef USE_WIDE_UPPER_LOWER
-	else if (pg_database_encoding_max_length() > 1)
+	else
 	{
 		pg_locale_t mylocale = 0;
-		wchar_t    *workspace;
-		size_t		curr_char;
-		size_t		result_size;
 
 		if (collid != DEFAULT_COLLATION_OID)
 		{
@@ -1502,77 +1539,79 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
 			mylocale = pg_newlocale_from_collation(collid);
 		}
 
-		/* Overflow paranoia */
-		if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
+#ifdef USE_ICU
+		if (mylocale && mylocale->provider == COLLPROVIDER_ICU)
+		{
+			int32_t		len_uchar;
+			int32_t		len_conv;
+			UChar	   *buff_uchar;
+			UChar	   *buff_conv;
+
+			len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes);
+			len_conv = icu_convert_case(u_strToLower, mylocale, &buff_conv, buff_uchar, len_uchar);
+			icu_from_uchar(&result, buff_conv, len_conv);
+		}
+		else
+#endif
+		{
+			if (pg_database_encoding_max_length() > 1)
+			{
+				wchar_t    *workspace;
+				size_t		curr_char;
+				size_t		result_size;
 
-		/* Output workspace cannot have more codes than input bytes */
-		workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
+				/* Overflow paranoia */
+				if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("out of memory")));
 
-		char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
+				/* Output workspace cannot have more codes than input bytes */
+				workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
 
-		for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
-		{
+				char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
+
+				for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+				{
 #ifdef HAVE_LOCALE_T
-			if (mylocale)
-				workspace[curr_char] = towlower_l(workspace[curr_char], mylocale);
-			else
+					if (mylocale)
+						workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt);
+					else
 #endif
-				workspace[curr_char] = towlower(workspace[curr_char]);
-		}
+						workspace[curr_char] = towlower(workspace[curr_char]);
+				}
 
-		/* Make result large enough; case change might change number of bytes */
-		result_size = curr_char * pg_database_encoding_max_length() + 1;
-		result = palloc(result_size);
+				/* Make result large enough; case change might change number of bytes */
+				result_size = curr_char * pg_database_encoding_max_length() + 1;
+				result = palloc(result_size);
 
-		wchar2char(result, workspace, result_size, mylocale);
-		pfree(workspace);
-	}
+				wchar2char(result, workspace, result_size, mylocale);
+				pfree(workspace);
+			}
 #endif   /* USE_WIDE_UPPER_LOWER */
-	else
-	{
-#ifdef HAVE_LOCALE_T
-		pg_locale_t mylocale = 0;
-#endif
-		char	   *p;
-
-		if (collid != DEFAULT_COLLATION_OID)
-		{
-			if (!OidIsValid(collid))
+			else
 			{
-				/*
-				 * This typically means that the parser could not resolve a
-				 * conflict of implicit collations, so report it that way.
-				 */
-				ereport(ERROR,
-						(errcode(ERRCODE_INDETERMINATE_COLLATION),
-						 errmsg("could not determine which collation to use for lower() function"),
-						 errhint("Use the COLLATE clause to set the collation explicitly.")));
-			}
-#ifdef HAVE_LOCALE_T
-			mylocale = pg_newlocale_from_collation(collid);
-#endif
-		}
+				char	   *p;
 
-		result = pnstrdup(buff, nbytes);
+				result = pnstrdup(buff, nbytes);
 
-		/*
-		 * Note: we assume that tolower_l() will not be so broken as to need
-		 * an isupper_l() guard test.  When using the default collation, we
-		 * apply the traditional Postgres behavior that forces ASCII-style
-		 * treatment of I/i, but in non-default collations you get exactly
-		 * what the collation says.
-		 */
-		for (p = result; *p; p++)
-		{
+				/*
+				 * Note: we assume that tolower_l() will not be so broken as to need
+				 * an isupper_l() guard test.  When using the default collation, we
+				 * apply the traditional Postgres behavior that forces ASCII-style
+				 * treatment of I/i, but in non-default collations you get exactly
+				 * what the collation says.
+				 */
+				for (p = result; *p; p++)
+				{
 #ifdef HAVE_LOCALE_T
-			if (mylocale)
-				*p = tolower_l((unsigned char) *p, mylocale);
-			else
+					if (mylocale)
+						*p = tolower_l((unsigned char) *p, mylocale->info.lt);
+					else
 #endif
-				*p = pg_tolower((unsigned char) *p);
+						*p = pg_tolower((unsigned char) *p);
+				}
+			}
 		}
 	}
 
@@ -1599,12 +1638,9 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
 		result = asc_toupper(buff, nbytes);
 	}
 #ifdef USE_WIDE_UPPER_LOWER
-	else if (pg_database_encoding_max_length() > 1)
+	else
 	{
 		pg_locale_t mylocale = 0;
-		wchar_t    *workspace;
-		size_t		curr_char;
-		size_t		result_size;
 
 		if (collid != DEFAULT_COLLATION_OID)
 		{
@@ -1622,77 +1658,78 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
 			mylocale = pg_newlocale_from_collation(collid);
 		}
 
-		/* Overflow paranoia */
-		if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
+#ifdef USE_ICU
+		if (mylocale && mylocale->provider == COLLPROVIDER_ICU)
+		{
+			int32_t		len_uchar, len_conv;
+			UChar	   *buff_uchar;
+			UChar	   *buff_conv;
 
-		/* Output workspace cannot have more codes than input bytes */
-		workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
+			len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes);
+			len_conv = icu_convert_case(u_strToUpper, mylocale, &buff_conv, buff_uchar, len_uchar);
+			icu_from_uchar(&result, buff_conv, len_conv);
+		}
+		else
+#endif
+		{
+			if (pg_database_encoding_max_length() > 1)
+			{
+				wchar_t    *workspace;
+				size_t		curr_char;
+				size_t		result_size;
 
-		char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
+				/* Overflow paranoia */
+				if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("out of memory")));
 
-		for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
-		{
-#ifdef HAVE_LOCALE_T
-			if (mylocale)
-				workspace[curr_char] = towupper_l(workspace[curr_char], mylocale);
-			else
-#endif
-				workspace[curr_char] = towupper(workspace[curr_char]);
-		}
+				/* Output workspace cannot have more codes than input bytes */
+				workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
 
-		/* Make result large enough; case change might change number of bytes */
-		result_size = curr_char * pg_database_encoding_max_length() + 1;
-		result = palloc(result_size);
+				char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
 
-		wchar2char(result, workspace, result_size, mylocale);
-		pfree(workspace);
-	}
-#endif   /* USE_WIDE_UPPER_LOWER */
-	else
-	{
+				for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+				{
 #ifdef HAVE_LOCALE_T
-		pg_locale_t mylocale = 0;
+					if (mylocale)
+						workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt);
+					else
 #endif
-		char	   *p;
+						workspace[curr_char] = towupper(workspace[curr_char]);
+				}
 
-		if (collid != DEFAULT_COLLATION_OID)
-		{
-			if (!OidIsValid(collid))
-			{
-				/*
-				 * This typically means that the parser could not resolve a
-				 * conflict of implicit collations, so report it that way.
-				 */
-				ereport(ERROR,
-						(errcode(ERRCODE_INDETERMINATE_COLLATION),
-						 errmsg("could not determine which collation to use for upper() function"),
-						 errhint("Use the COLLATE clause to set the collation explicitly.")));
+				/* Make result large enough; case change might change number of bytes */
+				result_size = curr_char * pg_database_encoding_max_length() + 1;
+				result = palloc(result_size);
+
+				wchar2char(result, workspace, result_size, mylocale);
+				pfree(workspace);
 			}
-#ifdef HAVE_LOCALE_T
-			mylocale = pg_newlocale_from_collation(collid);
-#endif
-		}
+#endif   /* USE_WIDE_UPPER_LOWER */
+			else
+			{
+				char	   *p;
 
-		result = pnstrdup(buff, nbytes);
+				result = pnstrdup(buff, nbytes);
 
-		/*
-		 * Note: we assume that toupper_l() will not be so broken as to need
-		 * an islower_l() guard test.  When using the default collation, we
-		 * apply the traditional Postgres behavior that forces ASCII-style
-		 * treatment of I/i, but in non-default collations you get exactly
-		 * what the collation says.
-		 */
-		for (p = result; *p; p++)
-		{
+				/*
+				 * Note: we assume that toupper_l() will not be so broken as to need
+				 * an islower_l() guard test.  When using the default collation, we
+				 * apply the traditional Postgres behavior that forces ASCII-style
+				 * treatment of I/i, but in non-default collations you get exactly
+				 * what the collation says.
+				 */
+				for (p = result; *p; p++)
+				{
 #ifdef HAVE_LOCALE_T
-			if (mylocale)
-				*p = toupper_l((unsigned char) *p, mylocale);
-			else
+					if (mylocale)
+						*p = toupper_l((unsigned char) *p, mylocale->info.lt);
+					else
 #endif
-				*p = pg_toupper((unsigned char) *p);
+						*p = pg_toupper((unsigned char) *p);
+				}
+			}
 		}
 	}
 
@@ -1720,12 +1757,9 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 		result = asc_initcap(buff, nbytes);
 	}
 #ifdef USE_WIDE_UPPER_LOWER
-	else if (pg_database_encoding_max_length() > 1)
+	else
 	{
 		pg_locale_t mylocale = 0;
-		wchar_t    *workspace;
-		size_t		curr_char;
-		size_t		result_size;
 
 		if (collid != DEFAULT_COLLATION_OID)
 		{
@@ -1743,100 +1777,101 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 			mylocale = pg_newlocale_from_collation(collid);
 		}
 
-		/* Overflow paranoia */
-		if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
-
-		/* Output workspace cannot have more codes than input bytes */
-		workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
-
-		char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
-
-		for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+#ifdef USE_ICU
+		if (mylocale && mylocale->provider == COLLPROVIDER_ICU)
 		{
-#ifdef HAVE_LOCALE_T
-			if (mylocale)
-			{
-				if (wasalnum)
-					workspace[curr_char] = towlower_l(workspace[curr_char], mylocale);
-				else
-					workspace[curr_char] = towupper_l(workspace[curr_char], mylocale);
-				wasalnum = iswalnum_l(workspace[curr_char], mylocale);
-			}
-			else
+			int32_t		len_uchar, len_conv;
+			UChar	   *buff_uchar;
+			UChar	   *buff_conv;
+
+			len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes);
+			len_conv = icu_convert_case(u_strToTitle_default_BI, mylocale, &buff_conv, buff_uchar, len_uchar);
+			icu_from_uchar(&result, buff_conv, len_conv);
+		}
+		else
 #endif
+		{
+			if (pg_database_encoding_max_length() > 1)
 			{
-				if (wasalnum)
-					workspace[curr_char] = towlower(workspace[curr_char]);
-				else
-					workspace[curr_char] = towupper(workspace[curr_char]);
-				wasalnum = iswalnum(workspace[curr_char]);
-			}
-		}
+				wchar_t    *workspace;
+				size_t		curr_char;
+				size_t		result_size;
 
-		/* Make result large enough; case change might change number of bytes */
-		result_size = curr_char * pg_database_encoding_max_length() + 1;
-		result = palloc(result_size);
+				/* Overflow paranoia */
+				if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("out of memory")));
 
-		wchar2char(result, workspace, result_size, mylocale);
-		pfree(workspace);
-	}
-#endif   /* USE_WIDE_UPPER_LOWER */
-	else
-	{
-#ifdef HAVE_LOCALE_T
-		pg_locale_t mylocale = 0;
-#endif
-		char	   *p;
+				/* Output workspace cannot have more codes than input bytes */
+				workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
 
-		if (collid != DEFAULT_COLLATION_OID)
-		{
-			if (!OidIsValid(collid))
-			{
-				/*
-				 * This typically means that the parser could not resolve a
-				 * conflict of implicit collations, so report it that way.
-				 */
-				ereport(ERROR,
-						(errcode(ERRCODE_INDETERMINATE_COLLATION),
-						 errmsg("could not determine which collation to use for initcap() function"),
-						 errhint("Use the COLLATE clause to set the collation explicitly.")));
-			}
+				char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
+
+				for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+				{
 #ifdef HAVE_LOCALE_T
-			mylocale = pg_newlocale_from_collation(collid);
+					if (mylocale)
+					{
+						if (wasalnum)
+							workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt);
+						else
+							workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt);
+						wasalnum = iswalnum_l(workspace[curr_char], mylocale->info.lt);
+					}
+					else
 #endif
-		}
+					{
+						if (wasalnum)
+							workspace[curr_char] = towlower(workspace[curr_char]);
+						else
+							workspace[curr_char] = towupper(workspace[curr_char]);
+						wasalnum = iswalnum(workspace[curr_char]);
+					}
+				}
 
-		result = pnstrdup(buff, nbytes);
+				/* Make result large enough; case change might change number of bytes */
+				result_size = curr_char * pg_database_encoding_max_length() + 1;
+				result = palloc(result_size);
 
-		/*
-		 * Note: we assume that toupper_l()/tolower_l() will not be so broken
-		 * as to need guard tests.  When using the default collation, we apply
-		 * the traditional Postgres behavior that forces ASCII-style treatment
-		 * of I/i, but in non-default collations you get exactly what the
-		 * collation says.
-		 */
-		for (p = result; *p; p++)
-		{
-#ifdef HAVE_LOCALE_T
-			if (mylocale)
-			{
-				if (wasalnum)
-					*p = tolower_l((unsigned char) *p, mylocale);
-				else
-					*p = toupper_l((unsigned char) *p, mylocale);
-				wasalnum = isalnum_l((unsigned char) *p, mylocale);
+				wchar2char(result, workspace, result_size, mylocale);
+				pfree(workspace);
 			}
+#endif   /* USE_WIDE_UPPER_LOWER */
 			else
-#endif
 			{
-				if (wasalnum)
-					*p = pg_tolower((unsigned char) *p);
-				else
-					*p = pg_toupper((unsigned char) *p);
-				wasalnum = isalnum((unsigned char) *p);
+				char	   *p;
+
+				result = pnstrdup(buff, nbytes);
+
+				/*
+				 * Note: we assume that toupper_l()/tolower_l() will not be so broken
+				 * as to need guard tests.  When using the default collation, we apply
+				 * the traditional Postgres behavior that forces ASCII-style treatment
+				 * of I/i, but in non-default collations you get exactly what the
+				 * collation says.
+				 */
+				for (p = result; *p; p++)
+				{
+#ifdef HAVE_LOCALE_T
+					if (mylocale)
+					{
+						if (wasalnum)
+							*p = tolower_l((unsigned char) *p, mylocale->info.lt);
+						else
+							*p = toupper_l((unsigned char) *p, mylocale->info.lt);
+						wasalnum = isalnum_l((unsigned char) *p, mylocale->info.lt);
+					}
+					else
+#endif
+					{
+						if (wasalnum)
+							*p = pg_tolower((unsigned char) *p);
+						else
+							*p = pg_toupper((unsigned char) *p);
+						wasalnum = isalnum((unsigned char) *p);
+					}
+				}
 			}
 		}
 	}
author	Peter Eisentraut <peter_e@gmx.net>	2017-03-23 15:25:34 -0400
committer	Peter Eisentraut <peter_e@gmx.net>	2017-03-23 15:28:48 -0400
commit	eccfef81e1f73ee41f1d8bfe4fa4e80576945048 (patch)
tree	52bd1b2468bcf9682b356cf5b5f6199ae9d80ee4 /src/backend/utils/adt/formatting.c
parent	ea42cc18c35381f639d45628d792e790ff39e271 (diff)
download	postgresql-eccfef81e1f73ee41f1d8bfe4fa4e80576945048.tar.gz postgresql-eccfef81e1f73ee41f1d8bfe4fa4e80576945048.zip