about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- contrib/unaccent/expected/unaccent.out | 44
-rw-r--r-- contrib/unaccent/generate_unaccent_rules.py | 5
-rw-r--r-- contrib/unaccent/sql/unaccent.sql | 10
3 files changed, 56 insertions, 3 deletions
diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out
index c1bd7cd897d..ee0ac71a1cc 100644
--- a/contrib/unaccent/expected/unaccent.out
+++ b/contrib/unaccent/expected/unaccent.out
@@ -37,6 +37,18 @@ SELECT unaccent('À'); -- Remove combining diacritical 0x0300
A
(1 row)
+SELECT unaccent('℃℉'); -- degree signs
+ unaccent
+----------
+ °C°F
+(1 row)
+
+SELECT unaccent('℗'); -- sound recording copyright
+ unaccent
+----------
+ (P)
+(1 row)
+
SELECT unaccent('unaccent', 'foobar');
unaccent
----------
@@ -67,6 +79,18 @@ SELECT unaccent('unaccent', 'À');
A
(1 row)
+SELECT unaccent('unaccent', '℃℉');
+ unaccent
+----------
+ °C°F
+(1 row)
+
+SELECT unaccent('unaccent', '℗');
+ unaccent
+----------
+ (P)
+(1 row)
+
SELECT ts_lexize('unaccent', 'foobar');
ts_lexize
-----------
@@ -97,3 +121,23 @@ SELECT ts_lexize('unaccent', 'À');
{A}
(1 row)
+SELECT ts_lexize('unaccent', '℃℉');
+ ts_lexize
+-----------
+ {°C°F}
+(1 row)
+
+SELECT ts_lexize('unaccent', '℗');
+ ts_lexize
+-----------
+ {(P)}
+(1 row)
+
+-- Controversial case. Black-Letter Capital H (U+210C) is translated by
+-- Latin-ASCII.xml as 'x', but it should be 'H'.
+SELECT unaccent('ℌ');
+ unaccent
+----------
+ x
+(1 row)
+
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index c405e231b39..b4b4c38bebe 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -134,12 +134,12 @@ def get_plain_letter(codepoint, table):
return table[codepoint.combining_ids[0]]
# Should not come here
- assert(False)
+ assert False, 'Codepoint U+%0.2X' % codepoint.id
elif is_plain_letter(codepoint):
return codepoint
# Should not come here
- assert(False)
+ assert False, 'Codepoint U+%0.2X' % codepoint.id
def is_ligature(codepoint, table):
@@ -212,7 +212,6 @@ def special_cases():
# Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
charactersSet.add((0x2103, "\xb0C")) # DEGREE CELSIUS
charactersSet.add((0x2109, "\xb0F")) # DEGREE FAHRENHEIT
- charactersSet.add((0x2117, "(P)")) # SOUND RECORDING COPYRIGHT
return charactersSet
diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql
index 2ae097ff2b8..3fc0c706be3 100644
--- a/contrib/unaccent/sql/unaccent.sql
+++ b/contrib/unaccent/sql/unaccent.sql
@@ -10,15 +10,25 @@ SELECT unaccent('ёлка');
SELECT unaccent('ЁЖИК');
SELECT unaccent('˃˖˗˜');
SELECT unaccent('À'); -- Remove combining diacritical 0x0300
+SELECT unaccent('℃℉'); -- degree signs
+SELECT unaccent('℗'); -- sound recording copyright
SELECT unaccent('unaccent', 'foobar');
SELECT unaccent('unaccent', 'ёлка');
SELECT unaccent('unaccent', 'ЁЖИК');
SELECT unaccent('unaccent', '˃˖˗˜');
SELECT unaccent('unaccent', 'À');
+SELECT unaccent('unaccent', '℃℉');
+SELECT unaccent('unaccent', '℗');
SELECT ts_lexize('unaccent', 'foobar');
SELECT ts_lexize('unaccent', 'ёлка');
SELECT ts_lexize('unaccent', 'ЁЖИК');
SELECT ts_lexize('unaccent', '˃˖˗˜');
SELECT ts_lexize('unaccent', 'À');
+SELECT ts_lexize('unaccent', '℃℉');
+SELECT ts_lexize('unaccent', '℗');
+
+-- Controversial case. Black-Letter Capital H (U+210C) is translated by
+-- Latin-ASCII.xml as 'x', but it should be 'H'.
+SELECT unaccent('ℌ');