aboutsummaryrefslogtreecommitdiff
path: root/src/port/pgstrcasecmp.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2004-05-07 00:24:59 +0000
committerTom Lane <tgl@sss.pgh.pa.us>2004-05-07 00:24:59 +0000
commit0bd61548ab8d1ac5fee63f48ee9b384502a51ad6 (patch)
treeb0c63b75585d0c396e67a3acd204e226b13eae4b /src/port/pgstrcasecmp.c
parent4d46274b33db52618ccf49550213b4d5ce4a7981 (diff)
downloadpostgresql-0bd61548ab8d1ac5fee63f48ee9b384502a51ad6.tar.gz
postgresql-0bd61548ab8d1ac5fee63f48ee9b384502a51ad6.zip
Solve the 'Turkish problem' with undesirable locale behavior for case
conversion of basic ASCII letters. Remove all uses of strcasecmp and strncasecmp in favor of new functions pg_strcasecmp and pg_strncasecmp; remove most but not all direct uses of toupper and tolower in favor of pg_toupper and pg_tolower. These functions use the same notions of case folding already developed for identifier case conversion. I left the straight locale-based folding in place for situations where we are just manipulating user data and not trying to match it to built-in strings --- for example, the SQL upper() function is still locale dependent. Perhaps this will prove not to be what's wanted, but at the moment we can initdb and pass regression tests in Turkish locale.
Diffstat (limited to 'src/port/pgstrcasecmp.c')
-rw-r--r--src/port/pgstrcasecmp.c125
1 files changed, 125 insertions, 0 deletions
diff --git a/src/port/pgstrcasecmp.c b/src/port/pgstrcasecmp.c
new file mode 100644
index 00000000000..6ac07804af4
--- /dev/null
+++ b/src/port/pgstrcasecmp.c
@@ -0,0 +1,125 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgstrcasecmp.c
+ * Portable SQL-like case-independent comparisons and conversions.
+ *
+ * SQL99 specifies Unicode-aware case normalization, which we don't yet
+ * have the infrastructure for. Instead we use tolower() to provide a
+ * locale-aware translation. However, there are some locales where this
+ * is not right either (eg, Turkish may do strange things with 'i' and
+ * 'I'). Our current compromise is to use tolower() for characters with
+ * the high bit set, and use an ASCII-only downcasing for 7-bit
+ * characters.
+ *
+ * NB: this code should match downcase_truncate_identifier() in scansup.c.
+ *
+ *
+ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
+ *
+ * $PostgreSQL: pgsql/src/port/pgstrcasecmp.c,v 1.1 2004/05/07 00:24:59 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <ctype.h>
+
+
+/*
+ * Case-independent comparison of two null-terminated strings.
+ */
+int
+pg_strcasecmp(const char *s1, const char *s2)
+{
+ for (;;)
+ {
+ unsigned char ch1 = (unsigned char) *s1++;
+ unsigned char ch2 = (unsigned char) *s2++;
+
+ if (ch1 != ch2)
+ {
+ if (ch1 >= 'A' && ch1 <= 'Z')
+ ch1 += 'a' - 'A';
+ else if (ch1 >= 0x80 && isupper(ch1))
+ ch1 = tolower(ch1);
+
+ if (ch2 >= 'A' && ch2 <= 'Z')
+ ch2 += 'a' - 'A';
+ else if (ch2 >= 0x80 && isupper(ch2))
+ ch2 = tolower(ch2);
+
+ if (ch1 != ch2)
+ return (int) ch1 - (int) ch2;
+ }
+ if (ch1 == 0)
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Case-independent comparison of two not-necessarily-null-terminated strings.
+ * At most n bytes will be examined from each string.
+ */
+int
+pg_strncasecmp(const char *s1, const char *s2, size_t n)
+{
+ while (n-- > 0)
+ {
+ unsigned char ch1 = (unsigned char) *s1++;
+ unsigned char ch2 = (unsigned char) *s2++;
+
+ if (ch1 != ch2)
+ {
+ if (ch1 >= 'A' && ch1 <= 'Z')
+ ch1 += 'a' - 'A';
+ else if (ch1 >= 0x80 && isupper(ch1))
+ ch1 = tolower(ch1);
+
+ if (ch2 >= 'A' && ch2 <= 'Z')
+ ch2 += 'a' - 'A';
+ else if (ch2 >= 0x80 && isupper(ch2))
+ ch2 = tolower(ch2);
+
+ if (ch1 != ch2)
+ return (int) ch1 - (int) ch2;
+ }
+ if (ch1 == 0)
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Fold a character to upper case.
+ *
+ * Unlike some versions of toupper(), this is safe to apply to characters
+ * that aren't upper case letters. Note however that the whole thing is
+ * a bit bogus for multibyte character sets.
+ */
+unsigned char
+pg_toupper(unsigned char ch)
+{
+ if (ch >= 'a' && ch <= 'z')
+ ch += 'A' - 'a';
+ else if (ch >= 0x80 && islower(ch))
+ ch = toupper(ch);
+ return ch;
+}
+
+/*
+ * Fold a character to lower case.
+ *
+ * Unlike some versions of tolower(), this is safe to apply to characters
+ * that aren't lower case letters. Note however that the whole thing is
+ * a bit bogus for multibyte character sets.
+ */
+unsigned char
+pg_tolower(unsigned char ch)
+{
+ if (ch >= 'A' && ch <= 'Z')
+ ch += 'a' - 'A';
+ else if (ch >= 0x80 && isupper(ch))
+ ch = tolower(ch);
+ return ch;
+}