aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--contrib/unaccent/generate_unaccent_rules.py144
-rw-r--r--contrib/unaccent/unaccent.rules674
2 files changed, 762 insertions, 56 deletions
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index b838d8f630d..2f5520c8198 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -1,20 +1,33 @@
-#!/usr/bin/python
+#!/usr/bin/python2
+# -*- coding: utf-8 -*-
#
# This script builds unaccent.rules on standard output when given the
-# contents of UnicodeData.txt[1] on standard input. Optionally includes
-# ligature expansion, if --expand-ligatures is given on the command line.
+# paths to UnicodeData.txt [1] and Latin-ASCII.xml [2] as command line
+# arguments. By default it also includes ligature expansion and the
+# Unicode CLDR Latin-ASCII transliterator; both can be disabled with
+# the "--no-ligatures-expansion" command line option.
#
# The approach is to use the Unicode decomposition data to identify
# precomposed codepoints that are equivalent to a ligature of several
# letters, or a base letter with any number of diacritical marks.
-# There is also a small set of special cases for codepoints that we
-# traditionally support even though Unicode doesn't consider them to
-# be ligatures or letters with marks.
#
-# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
+# This approach handles most letters with diacritical marks and some
+# ligatures. However, several characters (notably a majority of
+# ligatures) don't have decomposition. To handle all these cases, one can
+# use a standard Unicode transliterator available in Common Locale Data
+# Repository (CLDR): Latin-ASCII. This transliterator maps Unicode
+# characters to ASCII-range equivalents. Unless "--no-ligatures-expansion"
+# is given, the XML file of this transliterator [2] -- passed as a
+# command line argument -- will be parsed and used.
+#
+# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
+# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
+
import re
+import argparse
import sys
+import xml.etree.ElementTree as ET
def print_record(codepoint, letter):
print (unichr(codepoint) + "\t" + letter).encode("UTF-8")
@@ -63,15 +76,73 @@ def get_plain_letters(codepoint, table):
assert(is_ligature(codepoint, table))
return [get_plain_letter(table[id], table) for id in codepoint.combining_ids]
-def main(expand_ligatures):
+def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
+ """Parse the XML file and return a set of tuples (src, trg), where "src"
+ is the code point of the original character and "trg" the substitute."""
+ charactersSet = set()
+
+ # RegEx to parse rules
+ rulePattern = re.compile(ur'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
+
+ # construct tree from XML
+ transliterationTree = ET.parse(latinAsciiFilePath)
+ transliterationTreeRoot = transliterationTree.getroot()
+
+ for rule in transliterationTreeRoot.findall("./transforms/transform/tRule"):
+ matches = rulePattern.search(rule.text)
+
+ # The regular expression captures four groups corresponding
+ # to the characters.
+ #
+ # Group 1: plain "src" char. None if group 2 matched.
+ # Group 2: unicode-escaped "src" char (e.g. "\u0110"). None if group 1 matched.
+ #
+ # Group 3: "trg" string between quotes. None if group 4 matched.
+ # Group 4: plain "trg" string. None if group 3 matched.
+ if matches is not None:
+ src = matches.group(1) if matches.group(1) is not None else matches.group(2).decode('unicode-escape')
+ trg = matches.group(3) if matches.group(3) is not None else matches.group(4)
+
+ # "'" and """ are escaped
+ trg = trg.replace("\\'", "'").replace('\\"', '"')
+
+ # the parser of unaccent only accepts non-whitespace characters
+ # for "src" and "trg" (see unaccent.c)
+ if not src.isspace() and not trg.isspace():
+ charactersSet.add((ord(src), trg))
+
+ return charactersSet
+
+def special_cases():
+ """Returns the special cases which are not handled by other methods"""
+ charactersSet = set()
+
+ # Cyrillic
+ charactersSet.add((0x0401, u"\u0415")) # CYRILLIC CAPITAL LETTER IO
+ charactersSet.add((0x0451, u"\u0435")) # CYRILLIC SMALL LETTER IO
+
+ # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
+ charactersSet.add((0x2103, u"\xb0C")) # DEGREE CELSIUS
+ charactersSet.add((0x2109, u"\xb0F")) # DEGREE FAHRENHEIT
+ charactersSet.add((0x2117, "(P)")) # SOUND RECORDING COPYRIGHT
+
+ return charactersSet
+
+def main(args):
# http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
decomposition_type_pattern = re.compile(" *<[^>]*> *")
table = {}
all = []
+ # unordered set to ensure uniqueness
+ charactersSet = set()
+
+ # read file UnicodeData.txt
+ unicodeDataFile = open(args.unicodeDataFilePath, 'r')
+
# read everything we need into memory
- for line in sys.stdin.readlines():
+ for line in unicodeDataFile:
fields = line.split(";")
if len(fields) > 5:
# http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
@@ -89,35 +160,34 @@ def main(expand_ligatures):
if codepoint.general_category.startswith('L') and \
len(codepoint.combining_ids) > 1:
if is_letter_with_marks(codepoint, table):
- print_record(codepoint.id,
- chr(get_plain_letter(codepoint, table).id))
- elif expand_ligatures and is_ligature(codepoint, table):
- print_record(codepoint.id,
+ charactersSet.add((codepoint.id,
+ chr(get_plain_letter(codepoint, table).id)))
+ elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
+ charactersSet.add((codepoint.id,
"".join(unichr(combining_codepoint.id)
for combining_codepoint \
- in get_plain_letters(codepoint, table)))
-
- # some special cases
- print_record(0x00d8, "O") # LATIN CAPITAL LETTER O WITH STROKE
- print_record(0x00f8, "o") # LATIN SMALL LETTER O WITH STROKE
- print_record(0x0110, "D") # LATIN CAPITAL LETTER D WITH STROKE
- print_record(0x0111, "d") # LATIN SMALL LETTER D WITH STROKE
- print_record(0x0131, "i") # LATIN SMALL LETTER DOTLESS I
- print_record(0x0126, "H") # LATIN CAPITAL LETTER H WITH STROKE
- print_record(0x0127, "h") # LATIN SMALL LETTER H WITH STROKE
- print_record(0x0141, "L") # LATIN CAPITAL LETTER L WITH STROKE
- print_record(0x0142, "l") # LATIN SMALL LETTER L WITH STROKE
- print_record(0x0149, "'n") # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
- print_record(0x0166, "T") # LATIN CAPITAL LETTER T WITH STROKE
- print_record(0x0167, "t") # LATIN SMALL LETTER t WITH STROKE
- print_record(0x0401, u"\u0415") # CYRILLIC CAPITAL LETTER IO
- print_record(0x0451, u"\u0435") # CYRILLIC SMALL LETTER IO
- if expand_ligatures:
- print_record(0x00c6, "AE") # LATIN CAPITAL LETTER AE
- print_record(0x00df, "ss") # LATIN SMALL LETTER SHARP S
- print_record(0x00e6, "ae") # LATIN SMALL LETTER AE
- print_record(0x0152, "OE") # LATIN CAPITAL LIGATURE OE
- print_record(0x0153, "oe") # LATIN SMALL LIGATURE OE
+ in get_plain_letters(codepoint, table))))
+
+ # add CLDR Latin-ASCII characters
+ if not args.noLigaturesExpansion:
+ charactersSet |= parse_cldr_latin_ascii_transliterator(args.latinAsciiFilePath)
+ charactersSet |= special_cases()
+
+ # sort for more convenient display
+ charactersList = sorted(charactersSet, key=lambda characterPair: characterPair[0])
+
+ for characterPair in charactersList:
+ print_record(characterPair[0], characterPair[1])
if __name__ == "__main__":
- main(len(sys.argv) == 2 and sys.argv[1] == "--expand-ligatures")
+ parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
+ parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt. See <http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt>.", type=str, required=True, dest='unicodeDataFilePath')
+ parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml). See <http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml>.", type=str, dest='latinAsciiFilePath')
+ parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion')
+ args = parser.parse_args()
+
+ if args.noLigaturesExpansion is False and args.latinAsciiFilePath is None:
+ sys.stderr.write('You must specify the path to Latin-ASCII transliterator file with \"--latin-ascii-file\" option or use \"--no-ligatures-expansion\" option. Use \"-h\" option for help.')
+ sys.exit(1)
+
+ main(args)
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
index 73c24a188ba..84886da587a 100644
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -1,9 +1,18 @@
+© (C)
+« <<
+­ -
+® (R)
+» >>
+¼ 1/4
+½ 1/2
+¾ 3/4
À A
Á A
 A
à A
Ä A
Å A
+Æ AE
Ç C
È E
É E
@@ -13,23 +22,29 @@
Í I
Î I
Ï I
+Ð D
Ñ N
Ò O
Ó O
Ô O
Õ O
Ö O
+× *
+Ø O
Ù U
Ú U
Û U
Ü U
Ý Y
+Þ TH
+ß ss
à a
á a
â a
ã a
ä a
å a
+æ ae
ç c
è e
é e
@@ -39,17 +54,21 @@
í i
î i
ï i
+ð d
ñ n
ò o
ó o
ô o
õ o
ö o
+÷ /
+ø o
ù u
ú u
û u
ü u
ý y
+þ th
ÿ y
Ā A
ā a
@@ -67,6 +86,8 @@
č c
Ď D
ď d
+Đ D
+đ d
Ē E
ē e
Ĕ E
@@ -87,6 +108,8 @@
ģ g
Ĥ H
ĥ h
+Ħ H
+ħ h
Ĩ I
ĩ i
Ī I
@@ -96,30 +119,41 @@
Į I
į i
İ I
+ı i
IJ IJ
ij ij
Ĵ J
ĵ j
Ķ K
ķ k
+ĸ q
Ĺ L
ĺ l
Ļ L
ļ l
Ľ L
ľ l
+Ŀ L
+ŀ l
+Ł L
+ł l
Ń N
ń n
Ņ N
ņ n
Ň N
ň n
+ʼn 'n
+Ŋ N
+ŋ n
Ō O
ō o
Ŏ O
ŏ o
Ő O
ő o
+Œ OE
+œ oe
Ŕ R
ŕ r
Ŗ R
@@ -138,6 +172,8 @@
ţ t
Ť T
ť t
+Ŧ T
+ŧ t
Ũ U
ũ u
Ū U
@@ -161,10 +197,46 @@
ż z
Ž Z
ž z
+ſ s
+ƀ b
+Ɓ B
+Ƃ B
+ƃ b
+Ƈ C
+ƈ c
+Ɖ D
+Ɗ D
+Ƌ D
+ƌ d
+Ɛ E
+Ƒ F
+ƒ f
+Ɠ G
+ƕ hv
+Ɩ I
+Ɨ I
+Ƙ K
+ƙ k
+ƚ l
+Ɲ N
+ƞ n
Ơ O
ơ o
+Ƣ OI
+ƣ oi
+Ƥ P
+ƥ p
+ƫ t
+Ƭ T
+ƭ t
+Ʈ T
Ư U
ư u
+Ʋ V
+Ƴ Y
+ƴ y
+Ƶ Z
+ƶ z
DŽ DZ
Dž Dz
dž dz
@@ -182,6 +254,8 @@
ǒ o
Ǔ U
ǔ u
+Ǥ G
+ǥ g
Ǧ G
ǧ g
Ǩ K
@@ -226,6 +300,9 @@
ț t
Ȟ H
ȟ h
+ȡ d
+Ȥ Z
+ȥ z
Ȧ A
ȧ a
Ȩ E
@@ -234,6 +311,128 @@
ȯ o
Ȳ Y
ȳ y
+ȴ l
+ȵ n
+ȶ t
+ȷ j
+ȸ db
+ȹ qp
+Ⱥ A
+Ȼ C
+ȼ c
+Ƚ L
+Ⱦ T
+ȿ s
+ɀ z
+Ƀ B
+Ʉ U
+Ɇ E
+ɇ e
+Ɉ J
+ɉ j
+Ɍ R
+ɍ r
+Ɏ Y
+ɏ y
+ɓ b
+ɕ c
+ɖ d
+ɗ d
+ɛ e
+ɟ j
+ɠ g
+ɡ g
+ɢ G
+ɦ h
+ɧ h
+ɨ i
+ɪ I
+ɫ l
+ɬ l
+ɭ l
+ɱ m
+ɲ n
+ɳ n
+ɴ N
+ɶ OE
+ɼ r
+ɽ r
+ɾ r
+ʀ R
+ʂ s
+ʈ t
+ʉ u
+ʋ v
+ʏ Y
+ʐ z
+ʑ z
+ʙ B
+ʛ G
+ʜ H
+ʝ j
+ʟ L
+ʠ q
+ʣ dz
+ʥ dz
+ʦ ts
+ʪ ls
+ʫ lz
+Ё Е
+ё е
+ᴀ A
+ᴁ AE
+ᴃ B
+ᴄ C
+ᴅ D
+ᴆ D
+ᴇ E
+ᴊ J
+ᴋ K
+ᴌ L
+ᴍ M
+ᴏ O
+ᴘ P
+ᴛ T
+ᴜ U
+ᴠ V
+ᴡ W
+ᴢ Z
+ᵫ ue
+ᵬ b
+ᵭ d
+ᵮ f
+ᵯ m
+ᵰ n
+ᵱ p
+ᵲ r
+ᵳ r
+ᵴ s
+ᵵ t
+ᵶ z
+ᵺ th
+ᵻ I
+ᵽ p
+ᵾ U
+ᶀ b
+ᶁ d
+ᶂ f
+ᶃ g
+ᶄ k
+ᶅ l
+ᶆ m
+ᶇ n
+ᶈ p
+ᶉ r
+ᶊ s
+ᶌ v
+ᶍ x
+ᶎ z
+ᶏ a
+ᶑ d
+ᶒ e
+ᶓ e
+ᶖ i
+ᶙ u
Ḁ A
ḁ a
Ḃ B
@@ -356,6 +555,10 @@
ẗ t
ẘ w
ẙ y
+ẚ a
+ẜ s
+ẝ s
+ẞ SS
Ạ A
ạ a
Ả A
@@ -386,28 +589,461 @@
ỷ y
Ỹ Y
ỹ y
+Ỻ LL
+ỻ ll
+Ỽ V
+ỽ v
+Ỿ Y
+ỿ y
+‐ -
+‑ -
+‒ -
+– -
+— -
+― -
+‖ ||
+‘ '
+’ '
+‚ ,
+‛ '
+“ "
+” "
+„ ,,
+‟ "
+․ .
+‥ ..
+… ...
+′ '
+″ "
+‹ <
+› >
+‼ !!
+⁄ /
+⁅ [
+⁆ ]
+⁇ ??
+⁈ ?!
+⁉ !?
+⁎ *
+₠ CE
+₢ Cr
+₣ Fr.
+₤ L.
+₧ Pts
+₹ Rs
+₺ TL
+℀ a/c
+℁ a/s
+ℂ C
+℃ °C
+℅ c/o
+℆ c/u
+℉ °F
+ℊ g
+ℋ H
+ℌ x
+ℍ H
+ℎ h
+ℐ I
+ℑ I
+ℒ L
+ℓ l
+ℕ N
+№ No
+℗ (P)
+ℙ P
+ℚ Q
+ℛ R
+ℜ R
+ℝ R
+℞ Rx
+℡ TEL
+ℤ Z
+ℨ Z
+ℬ B
+ℭ C
+ℯ e
+ℰ E
+ℱ F
+ℳ M
+ℴ o
+ℹ i
+℻ FAX
+ⅅ D
+ⅆ d
+ⅇ e
+ⅈ i
+ⅉ j
+⅓ 1/3
+⅔ 2/3
+⅕ 1/5
+⅖ 2/5
+⅗ 3/5
+⅘ 4/5
+⅙ 1/6
+⅚ 5/6
+⅛ 1/8
+⅜ 3/8
+⅝ 5/8
+⅞ 7/8
+⅟ 1/
+Ⅰ I
+Ⅱ II
+Ⅲ III
+Ⅳ IV
+Ⅴ V
+Ⅵ VI
+Ⅶ VII
+Ⅷ VIII
+Ⅸ IX
+Ⅹ X
+Ⅺ XI
+Ⅻ XII
+Ⅼ L
+Ⅽ C
+Ⅾ D
+Ⅿ M
+ⅰ i
+ⅱ ii
+ⅲ iii
+ⅳ iv
+ⅴ v
+ⅵ vi
+ⅶ vii
+ⅷ viii
+ⅸ ix
+ⅹ x
+ⅺ xi
+ⅻ xii
+ⅼ l
+ⅽ c
+ⅾ d
+ⅿ m
+− -
+∕ /
+∖ \
+∣ |
+∥ ||
+≪ <<
+≫ >>
+⑴ (1)
+⑵ (2)
+⑶ (3)
+⑷ (4)
+⑸ (5)
+⑹ (6)
+⑺ (7)
+⑻ (8)
+⑼ (9)
+⑽ (10)
+⑾ (11)
+⑿ (12)
+⒀ (13)
+⒁ (14)
+⒂ (15)
+⒃ (16)
+⒄ (17)
+⒅ (18)
+⒆ (19)
+⒇ (20)
+⒈ 1.
+⒉ 2.
+⒊ 3.
+⒋ 4.
+⒌ 5.
+⒍ 6.
+⒎ 7.
+⒏ 8.
+⒐ 9.
+⒑ 10.
+⒒ 11.
+⒓ 12.
+⒔ 13.
+⒕ 14.
+⒖ 15.
+⒗ 16.
+⒘ 17.
+⒙ 18.
+⒚ 19.
+⒛ 20.
+⒜ (a)
+⒝ (b)
+⒞ (c)
+⒟ (d)
+⒠ (e)
+⒡ (f)
+⒢ (g)
+⒣ (h)
+⒤ (i)
+⒥ (j)
+⒦ (k)
+⒧ (l)
+⒨ (m)
+⒩ (n)
+⒪ (o)
+⒫ (p)
+⒬ (q)
+⒭ (r)
+⒮ (s)
+⒯ (t)
+⒰ (u)
+⒱ (v)
+⒲ (w)
+⒳ (x)
+⒴ (y)
+⒵ (z)
+⦅ ((
+⦆ ))
+⩴ ::=
+⩵ ==
+⩶ ===
+、 ,
+。 .
+〇 0
+〈 <
+〉 >
+《 <<
+》 >>
+〔 [
+〕 ]
+〘 [
+〙 ]
+〚 [
+〛 ]
+〝 "
+〞 "
+㍱ hPa
+㍲ da
+㍳ AU
+㍴ bar
+㍵ oV
+㍶ pc
+㍷ dm
+㍺ IU
+㎀ pA
+㎁ nA
+㎃ mA
+㎄ kA
+㎅ KB
+㎆ MB
+㎇ GB
+㎈ cal
+㎉ kcal
+㎊ pF
+㎋ nF
+㎎ mg
+㎏ kg
+㎐ Hz
+㎑ kHz
+㎒ MHz
+㎓ GHz
+㎔ THz
+㎙ fm
+㎚ nm
+㎜ mm
+㎝ cm
+㎞ km
+㎧ m/s
+㎩ Pa
+㎪ kPa
+㎫ MPa
+㎬ GPa
+㎭ rad
+㎮ rad/s
+㎰ ps
+㎱ ns
+㎳ ms
+㎴ pV
+㎵ nV
+㎷ mV
+㎸ kV
+㎹ MV
+㎺ pW
+㎻ nW
+㎽ mW
+㎾ kW
+㎿ MW
+㏂ a.m.
+㏃ Bq
+㏄ cc
+㏅ cd
+㏆ C/kg
+㏇ Co.
+㏈ dB
+㏉ Gy
+㏊ ha
+㏋ HP
+㏌ in
+㏍ KK
+㏎ KM
+㏏ kt
+㏐ lm
+㏑ ln
+㏒ log
+㏓ lx
+㏔ mb
+㏕ mil
+㏖ mol
+㏗ pH
+㏘ p.m.
+㏙ PPM
+㏚ PR
+㏛ sr
+㏜ Sv
+㏝ Wb
+㏞ V/m
+㏟ A/m
ff ff
fi fi
fl fl
ffi ffi
ffl ffl
+ſt st
st st
-Ø O
-ø o
-Đ D
-đ d
-ı i
-Ħ H
-ħ h
-Ł L
-ł l
-ʼn 'n
-Ŧ T
-ŧ t
-Ё Е
-ё е
-Æ AE
-ß ss
-æ ae
-Œ OE
-œ oe
+︐ ,
+︑ ,
+︒ .
+︓ :
+︔ ;
+︕ !
+︖ ?
+︙ ...
+︰ ..
+︱ -
+︲ -
+︵ (
+︶ )
+︷ {
+︸ }
+︹ [
+︺ ]
+︽ <<
+︾ >>
+︿ <
+﹀ >
+﹇ [
+﹈ ]
+﹐ ,
+﹑ ,
+﹒ .
+﹔ ;
+﹕ :
+﹖ ?
+﹗ !
+﹘ -
+﹙ (
+﹚ )
+﹛ {
+﹜ }
+﹝ [
+﹞ ]
+﹟ #
+﹠ &
+﹡ *
+﹢ +
+﹣ -
+﹤ <
+﹥ >
+﹦ =
+﹨ \
+﹩ $
+﹪ %
+﹫ @
+! !
+" "
+# #
+$ $
+% %
+& &
+' '
+( (
+) )
+* *
++ +
+, ,
+- -
+. .
+/ /
+0 0
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+7 7
+8 8
+9 9
+: :
+; ;
+< <
+= =
+> >
+? ?
+@ @
+A A
+B B
+C C
+D D
+E E
+F F
+G G
+H H
+I I
+J J
+K K
+L L
+M M
+N N
+O O
+P P
+Q Q
+R R
+S S
+T T
+U U
+V V
+W W
+X X
+Y Y
+Z Z
+[ [
+\ \
+] ]
+^ ^
+_ _
+` `
+a a
+b b
+c c
+d d
+e e
+f f
+g g
+h h
+i i
+j j
+k k
+l l
+m m
+n n
+o o
+p p
+q q
+r r
+s s
+t t
+u u
+v v
+w w
+x x
+y y
+z z
+{ {
+| |
+} }
+~ ~
+⦅ ((
+⦆ ))
+。 .
+、 ,