about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- contrib/unaccent/expected/unaccent.out 18
-rw-r--r-- contrib/unaccent/generate_unaccent_rules.py 21
-rw-r--r-- contrib/unaccent/sql/unaccent.sql 3
-rw-r--r-- contrib/unaccent/unaccent.rules 15
4 files changed, 54 insertions, 3 deletions
diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out
index 0835e141afb..69c2cf9bd7a 100644
--- a/contrib/unaccent/expected/unaccent.out
+++ b/contrib/unaccent/expected/unaccent.out
@@ -25,6 +25,12 @@ SELECT unaccent('ЁЖИК');
ЕЖИК
(1 row)
+SELECT unaccent('˃˖˗˜');
+ unaccent
+----------
+ >+-~
+(1 row)
+
SELECT unaccent('unaccent', 'foobar');
unaccent
----------
@@ -43,6 +49,12 @@ SELECT unaccent('unaccent', 'ЁЖИК');
ЕЖИК
(1 row)
+SELECT unaccent('unaccent', '˃˖˗˜');
+ unaccent
+----------
+ >+-~
+(1 row)
+
SELECT ts_lexize('unaccent', 'foobar');
ts_lexize
-----------
@@ -61,3 +73,9 @@ SELECT ts_lexize('unaccent', 'ЁЖИК');
{ЕЖИК}
(1 row)
+SELECT ts_lexize('unaccent', '˃˖˗˜');
+ ts_lexize
+-----------
+ {>+-~}
+(1 row)
+
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index c9aef490aef..4419a771edf 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -20,8 +20,13 @@
# option is enabled, the XML file of this transliterator [2] -- given as a
# command line argument -- will be parsed and used.
#
+# Ideally you should use the latest release for each data set. For
+# Latin-ASCII.xml, the latest data sets released can be browsed directly
+# via [3]. Note that this script is compatible with at least release 29.
+#
# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
-# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
+# [2] http://unicode.org/cldr/trac/export/14746/tags/release-34/common/transforms/Latin-ASCII.xml
+# [3] https://unicode.org/cldr/trac/browser/tags
# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
# The approach is to be Python3 compatible with Python2 "backports".
@@ -140,8 +145,18 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
transliterationTree = ET.parse(latinAsciiFilePath)
transliterationTreeRoot = transliterationTree.getroot()
- for rule in transliterationTreeRoot.findall("./transforms/transform/tRule"):
- matches = rulePattern.search(rule.text)
+ # Fetch all the transliteration rules. Since release 29 of Latin-ASCII.xml
+ # all the transliteration rules are located in a single tRule block with
+ # all rules separated into separate lines.
+ blockRules = transliterationTreeRoot.findall("./transforms/transform/tRule")
+ assert(len(blockRules) == 1)
+
+ # Split the block of rules into one element per line.
+ rules = blockRules[0].text.splitlines()
+
+ # And finish the processing of each individual rule.
+ for rule in rules:
+ matches = rulePattern.search(rule)
# The regular expression capture four groups corresponding
# to the characters.
diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql
index ba72ab6261c..c671827caa5 100644
--- a/contrib/unaccent/sql/unaccent.sql
+++ b/contrib/unaccent/sql/unaccent.sql
@@ -8,11 +8,14 @@ SET client_encoding TO 'UTF8';
SELECT unaccent('foobar');
SELECT unaccent('ёлка');
SELECT unaccent('ЁЖИК');
+SELECT unaccent('˃˖˗˜');
SELECT unaccent('unaccent', 'foobar');
SELECT unaccent('unaccent', 'ёлка');
SELECT unaccent('unaccent', 'ЁЖИК');
+SELECT unaccent('unaccent', '˃˖˗˜');
SELECT ts_lexize('unaccent', 'foobar');
SELECT ts_lexize('unaccent', 'ёлка');
SELECT ts_lexize('unaccent', 'ЁЖИК');
+SELECT ts_lexize('unaccent', '˃˖˗˜');
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
index 76e4e69bebb..7ce25eef03d 100644
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -399,6 +399,21 @@
ʦ ts
ʪ ls
ʫ lz
+ʹ '
+ʺ "
+ʻ '
+ʼ '
+ʽ '
+˂ <
+˃ >
+˄ ^
+ˆ ^
+ˈ '
+ˋ `
+ː :
+˖ +
+˗ -
+˜ ~
Ά Α
Έ Ε
Ή Η