Enable routine running of citext's UTF8-specific test cases.

These test cases have been commented out since citext was invented, because at the time we had no nice way to deal with tests that have restrictions such as requiring UTF8 encoding. But now we do have a convention for that, ie put them into a separate test file with an early-exit path. So let's enable these tests to run when their prerequisites are satisfied. (We may have to tighten the prerequisites beyond the "encoding = UTF8 and locale != C" checks made here. But let's put it on the buildfarm and see what blows up.) Dag Lem Discussion: https://postgr.es/m/ygezgoacs4e.fsf_-_@sid.nimrod.no
author: Tom Lane <tgl@sss.pgh.pa.us> 2022-01-05 13:30:07 -0500
committer: Tom Lane <tgl@sss.pgh.pa.us> 2022-01-05 13:30:07 -0500
commit: c2e8bd27519f47ff56987b30eb34a01969b9a9e8 (patch)
tree: 91d86f54dc3b539d133b386d12a40ef03192e7c7
parent: 6ce16088bfed97f982f66a9dc17b8364df289e4d (diff)
download: postgresql-c2e8bd27519f47ff56987b30eb34a01969b9a9e8.tar.gz
postgresql-c2e8bd27519f47ff56987b30eb34a01969b9a9e8.zip
7 files changed, 207 insertions, 94 deletions
diff --git a/contrib/citext/Makefile b/contrib/citext/Makefile
index a7de52928d7..789932fe366 100644
--- a/contrib/citext/Makefile
+++ b/contrib/citext/Makefile
@@ -11,7 +11,7 @@ DATA = citext--1.4.sql \
 	citext--1.0--1.1.sql
 PGFILEDESC = "citext - case-insensitive character string data type"
 
-REGRESS = citext
+REGRESS = citext citext_utf8
 
 ifdef USE_PGXS
 PG_CONFIG = pg_config
diff --git a/contrib/citext/expected/citext.out b/contrib/citext/expected/citext.out
index 3bac0534fb8..5afcc50920e 100644
--- a/contrib/citext/expected/citext.out
+++ b/contrib/citext/expected/citext.out
@@ -48,29 +48,6 @@ SELECT 'a'::citext <> 'ab'::citext AS t;
  t
 (1 row)
 
--- Multibyte sanity tests. Uncomment to run.
--- SELECT 'À'::citext =  'À'::citext AS t;
--- SELECT 'À'::citext =  'à'::citext AS t;
--- SELECT 'À'::text   =  'à'::text   AS f; -- text wins.
--- SELECT 'À'::citext <> 'B'::citext AS t;
--- Test combining characters making up canonically equivalent strings.
--- SELECT 'Ä'::text   <> 'Ä'::text   AS t;
--- SELECT 'Ä'::citext <> 'Ä'::citext AS t;
--- Test the Turkish dotted I. The lowercase is a single byte while the
--- uppercase is multibyte. This is why the comparison code can't be optimized
--- to compare string lengths.
--- SELECT 'i'::citext = 'İ'::citext AS t;
--- Regression.
--- SELECT 'láska'::citext <> 'laská'::citext AS t;
--- SELECT 'Ask Bjørn Hansen'::citext = 'Ask Bjørn Hansen'::citext AS t;
--- SELECT 'Ask Bjørn Hansen'::citext = 'ASK BJØRN HANSEN'::citext AS t;
--- SELECT 'Ask Bjørn Hansen'::citext <> 'Ask Bjorn Hansen'::citext AS t;
--- SELECT 'Ask Bjørn Hansen'::citext <> 'ASK BJORN HANSEN'::citext AS t;
--- SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'Ask Bjørn Hansen'::citext) AS zero;
--- SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'ask bjørn hansen'::citext) AS zero;
--- SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'ASK BJØRN HANSEN'::citext) AS zero;
--- SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'Ask Bjorn Hansen'::citext) AS positive;
--- SELECT citext_cmp('Ask Bjorn Hansen'::citext, 'Ask Bjørn Hansen'::citext) AS negative;
 -- Test > and >=
 SELECT 'B'::citext > 'a'::citext AS t;
  t 
@@ -2614,8 +2591,6 @@ SELECT citext_pattern_ge('b'::citext, 'A'::citext) AS true;
  t
 (1 row)
 
--- Multi-byte tests below are disabled like the sanity tests above.
--- Uncomment to run them.
 -- Test ~<~ and ~<=~
 SELECT 'a'::citext ~<~  'B'::citext AS t;
  t 
@@ -2629,7 +2604,6 @@ SELECT 'b'::citext ~<~  'A'::citext AS f;
  f
 (1 row)
 
--- SELECT 'à'::citext ~<~  'À'::citext AS f;
 SELECT 'a'::citext ~<=~ 'B'::citext AS t;
  t 
 ---
@@ -2642,7 +2616,6 @@ SELECT 'a'::citext ~<=~ 'A'::citext AS t;
  t
 (1 row)
 
--- SELECT 'à'::citext ~<=~ 'À'::citext AS t;
 -- Test ~>~ and ~>=~
 SELECT 'B'::citext ~>~  'a'::citext AS t;
  t 
@@ -2656,7 +2629,6 @@ SELECT 'b'::citext ~>~  'A'::citext AS t;
  t
 (1 row)
 
--- SELECT 'à'::citext ~>~  'À'::citext AS f;
 SELECT 'B'::citext ~>~  'b'::citext AS f;
  f 
 ---
@@ -2669,7 +2641,6 @@ SELECT 'B'::citext ~>=~ 'b'::citext AS t;
  t
 (1 row)
 
--- SELECT 'à'::citext ~>=~ 'À'::citext AS t;
 -- Test implicit casting. citext casts to text, but not vice-versa.
 SELECT 'B'::citext ~<~  'a'::text AS t;  -- text wins.
  t 
diff --git a/contrib/citext/expected/citext_1.out b/contrib/citext/expected/citext_1.out
index 57fc863f7a5..8aa2b9e1dbc 100644
--- a/contrib/citext/expected/citext_1.out
+++ b/contrib/citext/expected/citext_1.out
@@ -48,29 +48,6 @@ SELECT 'a'::citext <> 'ab'::citext AS t;
  t
 (1 row)
 
--- Multibyte sanity tests. Uncomment to run.
--- SELECT 'À'::citext =  'À'::citext AS t;
--- SELECT 'À'::citext =  'à'::citext AS t;
--- SELECT 'À'::text   =  'à'::text   AS f; -- text wins.
--- SELECT 'À'::citext <> 'B'::citext AS t;
--- Test combining characters making up canonically equivalent strings.
--- SELECT 'Ä'::text   <> 'Ä'::text   AS t;
--- SELECT 'Ä'::citext <> 'Ä'::citext AS t;
--- Test the Turkish dotted I. The lowercase is a single byte while the
--- uppercase is multibyte. This is why the comparison code can't be optimized
--- to compare string lengths.
--- SELECT 'i'::citext = 'İ'::citext AS t;
--- Regression.
--- SELECT 'láska'::citext <> 'laská'::citext AS t;
--- SELECT 'Ask Bjørn Hansen'::citext = 'Ask Bjørn Hansen'::citext AS t;
--- SELECT 'Ask Bjørn Hansen'::citext = 'ASK BJØRN HANSEN'::citext AS t;
--- SELECT 'Ask Bjørn Hansen'::citext <> 'Ask Bjorn Hansen'::citext AS t;
--- SELECT 'Ask Bjørn Hansen'::citext <> 'ASK BJORN HANSEN'::citext AS t;
--- SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'Ask Bjørn Hansen'::citext) AS zero;
--- SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'ask bjørn hansen'::citext) AS zero;
--- SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'ASK BJØRN HANSEN'::citext) AS zero;
--- SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'Ask Bjorn Hansen'::citext) AS positive;
--- SELECT citext_cmp('Ask Bjorn Hansen'::citext, 'Ask Bjørn Hansen'::citext) AS negative;
 -- Test > and >=
 SELECT 'B'::citext > 'a'::citext AS t;
  t 
@@ -2614,8 +2591,6 @@ SELECT citext_pattern_ge('b'::citext, 'A'::citext) AS true;
  t
 (1 row)
 
--- Multi-byte tests below are disabled like the sanity tests above.
--- Uncomment to run them.
 -- Test ~<~ and ~<=~
 SELECT 'a'::citext ~<~  'B'::citext AS t;
  t 
@@ -2629,7 +2604,6 @@ SELECT 'b'::citext ~<~  'A'::citext AS f;
  f
 (1 row)
 
--- SELECT 'à'::citext ~<~  'À'::citext AS f;
 SELECT 'a'::citext ~<=~ 'B'::citext AS t;
  t 
 ---
@@ -2642,7 +2616,6 @@ SELECT 'a'::citext ~<=~ 'A'::citext AS t;
  t
 (1 row)
 
--- SELECT 'à'::citext ~<=~ 'À'::citext AS t;
 -- Test ~>~ and ~>=~
 SELECT 'B'::citext ~>~  'a'::citext AS t;
  t 
@@ -2656,7 +2629,6 @@ SELECT 'b'::citext ~>~  'A'::citext AS t;
  t
 (1 row)
 
--- SELECT 'à'::citext ~>~  'À'::citext AS f;
 SELECT 'B'::citext ~>~  'b'::citext AS f;
  f 
 ---
@@ -2669,7 +2641,6 @@ SELECT 'B'::citext ~>=~ 'b'::citext AS t;
  t
 (1 row)
 
--- SELECT 'à'::citext ~>=~ 'À'::citext AS t;
 -- Test implicit casting. citext casts to text, but not vice-versa.
 SELECT 'B'::citext ~<~  'a'::text AS t;  -- text wins.
  t 
diff --git a/contrib/citext/expected/citext_utf8.out b/contrib/citext/expected/citext_utf8.out
new file mode 100644
index 00000000000..666b07ccec4
--- /dev/null
+++ b/contrib/citext/expected/citext_utf8.out
@@ -0,0 +1,146 @@
+/*
+ * This test must be run in a database with UTF-8 encoding
+ * and a Unicode-aware locale.
+ */
+SELECT getdatabaseencoding() <> 'UTF8' OR
+       current_setting('lc_ctype') = 'C'
+       AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+set client_encoding = utf8;
+-- CREATE EXTENSION IF NOT EXISTS citext;
+-- Multibyte sanity tests.
+SELECT 'À'::citext =  'À'::citext AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT 'À'::citext =  'à'::citext AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT 'À'::text   =  'à'::text   AS f; -- text wins.
+ f 
+---
+ f
+(1 row)
+
+SELECT 'À'::citext <> 'B'::citext AS t;
+ t 
+---
+ t
+(1 row)
+
+-- Test combining characters making up canonically equivalent strings.
+SELECT 'Ä'::text   <> 'Ä'::text   AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT 'Ä'::citext <> 'Ä'::citext AS t;
+ t 
+---
+ t
+(1 row)
+
+-- Test the Turkish dotted I. The lowercase is a single byte while the
+-- uppercase is multibyte. This is why the comparison code can't be optimized
+-- to compare string lengths.
+SELECT 'i'::citext = 'İ'::citext AS t;
+ t 
+---
+ t
+(1 row)
+
+-- Regression.
+SELECT 'láska'::citext <> 'laská'::citext AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT 'Ask Bjørn Hansen'::citext = 'Ask Bjørn Hansen'::citext AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT 'Ask Bjørn Hansen'::citext = 'ASK BJØRN HANSEN'::citext AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT 'Ask Bjørn Hansen'::citext <> 'Ask Bjorn Hansen'::citext AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT 'Ask Bjørn Hansen'::citext <> 'ASK BJORN HANSEN'::citext AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'Ask Bjørn Hansen'::citext) = 0 AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'ask bjørn hansen'::citext) = 0 AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'ASK BJØRN HANSEN'::citext) = 0 AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'Ask Bjorn Hansen'::citext) > 0 AS t;
+ t 
+---
+ t
+(1 row)
+
+SELECT citext_cmp('Ask Bjorn Hansen'::citext, 'Ask Bjørn Hansen'::citext) < 0 AS t;
+ t 
+---
+ t
+(1 row)
+
+-- Test ~<~ and ~<=~
+SELECT 'à'::citext ~<~  'À'::citext AS f;
+ f 
+---
+ f
+(1 row)
+
+SELECT 'à'::citext ~<=~ 'À'::citext AS t;
+ t 
+---
+ t
+(1 row)
+
+-- Test ~>~ and ~>=~
+SELECT 'à'::citext ~>~  'À'::citext AS f;
+ f 
+---
+ f
+(1 row)
+
+SELECT 'à'::citext ~>=~ 'À'::citext AS t;
+ t 
+---
+ t
+(1 row)
+
diff --git a/contrib/citext/expected/citext_utf8_1.out b/contrib/citext/expected/citext_utf8_1.out
new file mode 100644
index 00000000000..433e9853497
--- /dev/null
+++ b/contrib/citext/expected/citext_utf8_1.out
@@ -0,0 +1,9 @@
+/*
+ * This test must be run in a database with UTF-8 encoding
+ * and a Unicode-aware locale.
+ */
+SELECT getdatabaseencoding() <> 'UTF8' OR
+       current_setting('lc_ctype') = 'C'
+       AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/contrib/citext/sql/citext.sql b/contrib/citext/sql/citext.sql
index 55fb1d11a6f..8c87be6b1d2 100644
--- a/contrib/citext/sql/citext.sql
+++ b/contrib/citext/sql/citext.sql
@@ -19,34 +19,6 @@ SELECT 'a'::citext = 'b'::citext AS f;
 SELECT 'a'::citext = 'ab'::citext AS f;
 SELECT 'a'::citext <> 'ab'::citext AS t;
 
--- Multibyte sanity tests. Uncomment to run.
--- SELECT 'À'::citext =  'À'::citext AS t;
--- SELECT 'À'::citext =  'à'::citext AS t;
--- SELECT 'À'::text   =  'à'::text   AS f; -- text wins.
--- SELECT 'À'::citext <> 'B'::citext AS t;
-
--- Test combining characters making up canonically equivalent strings.
--- SELECT 'Ä'::text   <> 'Ä'::text   AS t;
--- SELECT 'Ä'::citext <> 'Ä'::citext AS t;
-
--- Test the Turkish dotted I. The lowercase is a single byte while the
--- uppercase is multibyte. This is why the comparison code can't be optimized
--- to compare string lengths.
--- SELECT 'i'::citext = 'İ'::citext AS t;
-
--- Regression.
--- SELECT 'láska'::citext <> 'laská'::citext AS t;
-
--- SELECT 'Ask Bjørn Hansen'::citext = 'Ask Bjørn Hansen'::citext AS t;
--- SELECT 'Ask Bjørn Hansen'::citext = 'ASK BJØRN HANSEN'::citext AS t;
--- SELECT 'Ask Bjørn Hansen'::citext <> 'Ask Bjorn Hansen'::citext AS t;
--- SELECT 'Ask Bjørn Hansen'::citext <> 'ASK BJORN HANSEN'::citext AS t;
--- SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'Ask Bjørn Hansen'::citext) AS zero;
--- SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'ask bjørn hansen'::citext) AS zero;
--- SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'ASK BJØRN HANSEN'::citext) AS zero;
--- SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'Ask Bjorn Hansen'::citext) AS positive;
--- SELECT citext_cmp('Ask Bjorn Hansen'::citext, 'Ask Bjørn Hansen'::citext) AS negative;
-
 -- Test > and >=
 SELECT 'B'::citext > 'a'::citext AS t;
 SELECT 'b'::citext >  'A'::citext AS t;
@@ -811,24 +783,17 @@ SELECT citext_pattern_ge('b'::citext, 'a'::citext) AS true;
 SELECT citext_pattern_ge('B'::citext, 'a'::citext) AS true;
 SELECT citext_pattern_ge('b'::citext, 'A'::citext) AS true;
 
--- Multi-byte tests below are disabled like the sanity tests above.
--- Uncomment to run them.
-
 -- Test ~<~ and ~<=~
 SELECT 'a'::citext ~<~  'B'::citext AS t;
 SELECT 'b'::citext ~<~  'A'::citext AS f;
--- SELECT 'à'::citext ~<~  'À'::citext AS f;
 SELECT 'a'::citext ~<=~ 'B'::citext AS t;
 SELECT 'a'::citext ~<=~ 'A'::citext AS t;
--- SELECT 'à'::citext ~<=~ 'À'::citext AS t;
 
 -- Test ~>~ and ~>=~
 SELECT 'B'::citext ~>~  'a'::citext AS t;
 SELECT 'b'::citext ~>~  'A'::citext AS t;
--- SELECT 'à'::citext ~>~  'À'::citext AS f;
 SELECT 'B'::citext ~>~  'b'::citext AS f;
 SELECT 'B'::citext ~>=~ 'b'::citext AS t;
--- SELECT 'à'::citext ~>=~ 'À'::citext AS t;
 
 -- Test implicit casting. citext casts to text, but not vice-versa.
 SELECT 'B'::citext ~<~  'a'::text AS t;  -- text wins.
diff --git a/contrib/citext/sql/citext_utf8.sql b/contrib/citext/sql/citext_utf8.sql
new file mode 100644
index 00000000000..d068000b423
--- /dev/null
+++ b/contrib/citext/sql/citext_utf8.sql
@@ -0,0 +1,51 @@
+/*
+ * This test must be run in a database with UTF-8 encoding
+ * and a Unicode-aware locale.
+ */
+
+SELECT getdatabaseencoding() <> 'UTF8' OR
+       current_setting('lc_ctype') = 'C'
+       AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+set client_encoding = utf8;
+
+-- CREATE EXTENSION IF NOT EXISTS citext;
+
+-- Multibyte sanity tests.
+SELECT 'À'::citext =  'À'::citext AS t;
+SELECT 'À'::citext =  'à'::citext AS t;
+SELECT 'À'::text   =  'à'::text   AS f; -- text wins.
+SELECT 'À'::citext <> 'B'::citext AS t;
+
+-- Test combining characters making up canonically equivalent strings.
+SELECT 'Ä'::text   <> 'Ä'::text   AS t;
+SELECT 'Ä'::citext <> 'Ä'::citext AS t;
+
+-- Test the Turkish dotted I. The lowercase is a single byte while the
+-- uppercase is multibyte. This is why the comparison code can't be optimized
+-- to compare string lengths.
+SELECT 'i'::citext = 'İ'::citext AS t;
+
+-- Regression.
+SELECT 'láska'::citext <> 'laská'::citext AS t;
+
+SELECT 'Ask Bjørn Hansen'::citext = 'Ask Bjørn Hansen'::citext AS t;
+SELECT 'Ask Bjørn Hansen'::citext = 'ASK BJØRN HANSEN'::citext AS t;
+SELECT 'Ask Bjørn Hansen'::citext <> 'Ask Bjorn Hansen'::citext AS t;
+SELECT 'Ask Bjørn Hansen'::citext <> 'ASK BJORN HANSEN'::citext AS t;
+SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'Ask Bjørn Hansen'::citext) = 0 AS t;
+SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'ask bjørn hansen'::citext) = 0 AS t;
+SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'ASK BJØRN HANSEN'::citext) = 0 AS t;
+SELECT citext_cmp('Ask Bjørn Hansen'::citext, 'Ask Bjorn Hansen'::citext) > 0 AS t;
+SELECT citext_cmp('Ask Bjorn Hansen'::citext, 'Ask Bjørn Hansen'::citext) < 0 AS t;
+
+-- Test ~<~ and ~<=~
+SELECT 'à'::citext ~<~  'À'::citext AS f;
+SELECT 'à'::citext ~<=~ 'À'::citext AS t;
+
+-- Test ~>~ and ~>=~
+SELECT 'à'::citext ~>~  'À'::citext AS f;
+SELECT 'à'::citext ~>=~ 'À'::citext AS t;
author	Tom Lane <tgl@sss.pgh.pa.us>	2022-01-05 13:30:07 -0500
committer	Tom Lane <tgl@sss.pgh.pa.us>	2022-01-05 13:30:07 -0500
commit	c2e8bd27519f47ff56987b30eb34a01969b9a9e8 (patch)
tree	91d86f54dc3b539d133b386d12a40ef03192e7c7
parent	6ce16088bfed97f982f66a9dc17b8364df289e4d (diff)
download	postgresql-c2e8bd27519f47ff56987b30eb34a01969b9a9e8.tar.gz postgresql-c2e8bd27519f47ff56987b30eb34a01969b9a9e8.zip