Add SQL function CASEFOLD().

Useful for caseless matching. Similar to LOWER(), but avoids edge-case problems with using LOWER() for caseless matching.

For collations that support it, CASEFOLD() handles characters with more than two case variations or multi-character case variations. Some characters may fold to uppercase. The results of case folding are also more stable across Unicode versions than LOWER() or UPPER().

Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com
Reviewed-by: Ian Lawrence Barwick
author: Jeff Davis <jdavis@postgresql.org> 2025-01-24 14:56:22 -0800
committer: Jeff Davis <jdavis@postgresql.org> 2025-01-24 14:56:22 -0800
commit: bfc5992069cf00b189af83d96a83ae5ebb65e938 (patch)
tree: 94332f38e12deb4a6dcfdc011c42848069190ec5 /src/backend/utils/adt/formatting.c
parent: f15538cd27d4eeb7d665263a3d7b5700362d7eb0 (diff)
download: postgresql-bfc5992069cf00b189af83d96a83ae5ebb65e938.tar.gz
postgresql-bfc5992069cf00b189af83d96a83ae5ebb65e938.zip
1 files changed, 69 insertions, 0 deletions
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 7c4c4aa07d5..2720d3902ab 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -1820,6 +1820,75 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 }
 
 /*
+ * collation-aware, wide-character-aware case folding
+ *
+ * We pass the number of bytes so we can pass varlena and char*
+ * to this function.  The result is a palloc'd, null-terminated string.
+ */
+char *
+str_casefold(const char *buff, size_t nbytes, Oid collid)
+{
+	char	   *result;
+	pg_locale_t mylocale;
+
+	if (!buff)
+		return NULL;
+
+	if (!OidIsValid(collid))
+	{
+		/*
+		 * This typically means that the parser could not resolve a conflict
+		 * of implicit collations, so report it that way.
+		 */
+		ereport(ERROR,
+				(errcode(ERRCODE_INDETERMINATE_COLLATION),
+				 errmsg("could not determine which collation to use for %s function",
+						"casefold()"),
+				 errhint("Use the COLLATE clause to set the collation explicitly.")));
+	}
+
+	if (GetDatabaseEncoding() != PG_UTF8)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("Unicode case folding can only be performed if server encoding is UTF8")));
+
+	mylocale = pg_newlocale_from_collation(collid);
+
+	/* C/POSIX ctype folds via asc_tolower(); UTF8 was already required above */
+	if (mylocale->ctype_is_c)
+	{
+		result = asc_tolower(buff, nbytes);
+	}
+	else
+	{
+		const char *src = buff;
+		size_t		srclen = nbytes;
+		size_t		dstsize;
+		char	   *dst;
+		size_t		needed;
+
+		/* first try buffer of equal size plus terminating NUL */
+		dstsize = srclen + 1;
+		dst = palloc(dstsize);
+
+		needed = pg_strfold(dst, dstsize, src, srclen, mylocale);
+		if (needed + 1 > dstsize)
+		{
+			/* result did not fit: resize to the reported length and retry */
+			dstsize = needed + 1;
+			dst = repalloc(dst, dstsize);
+			needed = pg_strfold(dst, dstsize, src, srclen, mylocale);
+			Assert(needed + 1 <= dstsize);
+		}
+
+		Assert(dst[needed] == '\0');
+		result = dst;
+	}
+
+	return result;
+}
+
+/*
  * ASCII-only lower function
  *
  * We pass the number of bytes so we can pass varlena and char*
author	Jeff Davis <jdavis@postgresql.org>	2025-01-24 14:56:22 -0800
committer	Jeff Davis <jdavis@postgresql.org>	2025-01-24 14:56:22 -0800
commit	bfc5992069cf00b189af83d96a83ae5ebb65e938 (patch)
tree	94332f38e12deb4a6dcfdc011c42848069190ec5 /src/backend/utils/adt/formatting.c
parent	f15538cd27d4eeb7d665263a3d7b5700362d7eb0 (diff)
download	postgresql-bfc5992069cf00b189af83d96a83ae5ebb65e938.tar.gz postgresql-bfc5992069cf00b189af83d96a83ae5ebb65e938.zip