Solve the 'Turkish problem' with undesirable locale behavior for case conversion of basic ASCII letters.

Remove all uses of strcasecmp and strncasecmp in favor of new functions pg_strcasecmp and pg_strncasecmp; remove most but not all direct uses of toupper and tolower in favor of pg_toupper and pg_tolower. These functions use the same notions of case folding already developed for identifier case conversion. I left the straight locale-based folding in place for situations where we are just manipulating user data and not trying to match it to built-in strings --- for example, the SQL upper() function is still locale dependent. Perhaps this will prove not to be what's wanted, but at the moment we can initdb and pass regression tests in Turkish locale.
author: Tom Lane <tgl@sss.pgh.pa.us> 2004-05-07 00:24:59 +0000
committer: Tom Lane <tgl@sss.pgh.pa.us> 2004-05-07 00:24:59 +0000
commit: 0bd61548ab8d1ac5fee63f48ee9b384502a51ad6 (patch)
tree: b0c63b75585d0c396e67a3acd204e226b13eae4b /src/port/pgstrcasecmp.c
parent: 4d46274b33db52618ccf49550213b4d5ce4a7981 (diff)
download: postgresql-0bd61548ab8d1ac5fee63f48ee9b384502a51ad6.tar.gz
postgresql-0bd61548ab8d1ac5fee63f48ee9b384502a51ad6.zip
1 files changed, 125 insertions, 0 deletions
diff --git a/src/port/pgstrcasecmp.c b/src/port/pgstrcasecmp.c
new file mode 100644
index 00000000000..6ac07804af4
--- /dev/null
+++ b/src/port/pgstrcasecmp.c
@@ -0,0 +1,125 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgstrcasecmp.c
+ *	   Portable SQL-like case-independent comparisons and conversions.
+ *
+ * SQL99 specifies Unicode-aware case normalization, which we don't yet
+ * have the infrastructure for.  Instead we use tolower() to provide a
+ * locale-aware translation.  However, there are some locales where this
+ * is not right either (eg, Turkish may do strange things with 'i' and
+ * 'I').  Our current compromise is to use tolower() for characters with
+ * the high bit set, and use an ASCII-only downcasing for 7-bit
+ * characters.
+ *
+ * NB: this code should match downcase_truncate_identifier() in scansup.c.
+ *
+ *
+ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
+ *
+ * $PostgreSQL: pgsql/src/port/pgstrcasecmp.c,v 1.1 2004/05/07 00:24:59 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <ctype.h>
+
+
+/*
+ * Case-independent compare of two null-terminated strings; 0 means equal.
+ */
+int
+pg_strcasecmp(const char *s1, const char *s2)
+{
+	for (;;)
+	{
+		unsigned char	ch1 = (unsigned char) *s1++;
+		unsigned char	ch2 = (unsigned char) *s2++;
+
+		if (ch1 != ch2)		/* fold only when the raw bytes differ */
+		{
+			if (ch1 >= 'A' && ch1 <= 'Z')
+				ch1 += 'a' - 'A';	/* ASCII: never consults the locale */
+			else if (ch1 >= 0x80 && isupper(ch1))
+				ch1 = tolower(ch1);	/* high-bit byte: locale decides */
+
+			if (ch2 >= 'A' && ch2 <= 'Z')
+				ch2 += 'a' - 'A';	/* same folding rule as for ch1 */
+			else if (ch2 >= 0x80 && isupper(ch2))
+				ch2 = tolower(ch2);
+
+			if (ch1 != ch2)
+				return (int) ch1 - (int) ch2;	/* sign gives the ordering */
+		}
+		if (ch1 == 0)		/* bytes matched; stop at the terminator */
+			break;
+	}
+	return 0;
+}
+
+/*
+ * Case-independent comparison of two not-necessarily-null-terminated strings.
+ * Examines at most n bytes of each string, stopping early at a zero byte.
+ */
+int
+pg_strncasecmp(const char *s1, const char *s2, size_t n)
+{
+	while (n-- > 0)
+	{
+		unsigned char	ch1 = (unsigned char) *s1++;
+		unsigned char	ch2 = (unsigned char) *s2++;
+
+		if (ch1 != ch2)		/* fold only when the raw bytes differ */
+		{
+			if (ch1 >= 'A' && ch1 <= 'Z')
+				ch1 += 'a' - 'A';	/* ASCII: never consults the locale */
+			else if (ch1 >= 0x80 && isupper(ch1))
+				ch1 = tolower(ch1);	/* high-bit byte: locale decides */
+
+			if (ch2 >= 'A' && ch2 <= 'Z')
+				ch2 += 'a' - 'A';	/* same folding rule as for ch1 */
+			else if (ch2 >= 0x80 && isupper(ch2))
+				ch2 = tolower(ch2);
+
+			if (ch1 != ch2)
+				return (int) ch1 - (int) ch2;	/* sign gives the ordering */
+		}
+		if (ch1 == 0)		/* bytes matched; stop at a zero byte */
+			break;
+	}
+	return 0;		/* equal within n bytes (also when n == 0) */
+}
+
+/*
+ * Fold a character to upper case.  ASCII a-z are shifted arithmetically;
+ * bytes >= 0x80 use toupper() if islower(); all else is returned as is.
+ * Unlike some versions of toupper(), this is safe to apply to characters
+ * that aren't lower case letters.  Note however that the whole thing is
+ * a bit bogus for multibyte character sets.
+ */
+unsigned char
+pg_toupper(unsigned char ch)
+{
+	if (ch >= 'a' && ch <= 'z')
+		ch += 'A' - 'a';		/* ASCII: never consults the locale */
+	else if (ch >= 0x80 && islower(ch))
+		ch = toupper(ch);		/* high-bit byte: locale decides */
+	return ch;
+}
+
+/*
+ * Fold a character to lower case.  ASCII A-Z are shifted arithmetically;
+ * bytes >= 0x80 use tolower() if isupper(); all else is returned as is.
+ * Unlike some versions of tolower(), this is safe to apply to characters
+ * that aren't upper case letters.  Note however that the whole thing is
+ * a bit bogus for multibyte character sets.
+ */
+unsigned char
+pg_tolower(unsigned char ch)
+{
+	if (ch >= 'A' && ch <= 'Z')
+		ch += 'a' - 'A';		/* ASCII: never consults the locale */
+	else if (ch >= 0x80 && isupper(ch))
+		ch = tolower(ch);		/* high-bit byte: locale decides */
+	return ch;
+}
author	Tom Lane <tgl@sss.pgh.pa.us>	2004-05-07 00:24:59 +0000
committer	Tom Lane <tgl@sss.pgh.pa.us>	2004-05-07 00:24:59 +0000
commit	0bd61548ab8d1ac5fee63f48ee9b384502a51ad6 (patch)
tree	b0c63b75585d0c396e67a3acd204e226b13eae4b /src/port/pgstrcasecmp.c
parent	4d46274b33db52618ccf49550213b4d5ce4a7981 (diff)
download	postgresql-0bd61548ab8d1ac5fee63f48ee9b384502a51ad6.tar.gz postgresql-0bd61548ab8d1ac5fee63f48ee9b384502a51ad6.zip