aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTeodor Sigaev <teodor@sigaev.ru>2015-09-04 12:51:53 +0300
committerTeodor Sigaev <teodor@sigaev.ru>2015-09-04 12:51:53 +0300
commit1bbd52cb9a4aa61a7dd751f5d1f7b44650d6122a (patch)
treecb52d878702e901529ce383c60ded775c7c76435
parent4aec49899e5782247e134f94ce1c6ee926f88e1c (diff)
downloadpostgresql-1bbd52cb9a4aa61a7dd751f5d1f7b44650d6122a.tar.gz
postgresql-1bbd52cb9a4aa61a7dd751f5d1f7b44650d6122a.zip
Make unaccent handle all diacritics known to Unicode, and expand ligatures correctly
Add Python script for building unaccent.rules from Unicode data. Don't backpatch because unaccent changes may require tsvector/index rebuild. Thomas Munro <thomas.munro@enterprisedb.com>
-rw-r--r--contrib/unaccent/generate_unaccent_rules.py123
-rw-r--r--contrib/unaccent/unaccent.rules358
2 files changed, 415 insertions, 66 deletions
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
new file mode 100644
index 00000000000..b838d8f630d
--- /dev/null
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+#
+# This script builds unaccent.rules on standard output when given the
+# contents of UnicodeData.txt[1] on standard input. Optionally includes
+# ligature expansion, if --expand-ligatures is given on the command line.
+#
+# The approach is to use the Unicode decomposition data to identify
+# precomposed codepoints that are equivalent to a ligature of several
+# letters, or a base letter with any number of diacritical marks.
+# There is also a small set of special cases for codepoints that we
+# traditionally support even though Unicode doesn't consider them to
+# be ligatures or letters with marks.
+#
+# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
+
+import re
+import sys
+
+def print_record(codepoint, letter):
+    """Write one rule line to stdout: the character, a tab, its replacement.
+
+    codepoint is the integer value of the character to be replaced and
+    letter is the replacement text.  The line is emitted UTF-8 encoded.
+    """
+    record = unichr(codepoint) + "\t" + letter
+    sys.stdout.write(record.encode("UTF-8") + "\n")
+
+class Codepoint:
+    """Plain record describing one codepoint.
+
+    id               -- integer value of the codepoint
+    general_category -- its general category string (compared against
+                        values such as "Mn" elsewhere in this script)
+    combining_ids    -- list of integer codepoint values taken from its
+                        decomposition mapping
+    """
+
+    def __init__(self, id, general_category, combining_ids):
+        # The arguments are stored as given, without validation.
+        self.combining_ids = combining_ids
+        self.general_category = general_category
+        self.id = id
+
+def is_plain_letter(codepoint):
+    """Return true if codepoint represents a plain ASCII letter (a-z, A-Z)."""
+    value = codepoint.id
+    if ord('a') <= value <= ord('z'):
+        return True
+    return ord('A') <= value <= ord('Z')
+
+def is_mark(codepoint):
+    """Returns true for diacritical marks (combining codepoints).
+
+    A codepoint counts as a mark when its general category is one of
+    "Mn", "Me" or "Mc".
+    """
+    mark_categories = ("Mn", "Me", "Mc")
+    return codepoint.general_category in mark_categories
+
+def is_letter_with_marks(codepoint, table):
+    """Returns true for letters combined with one or more marks.
+
+    The first element of the decomposition must be a plain ASCII letter
+    or, recursively, another letter with marks; every remaining element
+    must be a mark.  Following the base recursively lets precomposed
+    characters that stack several diacritics (for example U+01D5, which
+    decomposes to U+00DC plus U+0304) be recognized as well.
+
+    table maps codepoint ids to Codepoint objects.
+    """
+    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
+    combining_ids = codepoint.combining_ids
+    if len(combining_ids) < 2:
+        # No decomposition, or a single-element mapping: no mark is attached.
+        return False
+    base = table[combining_ids[0]]
+    if not (is_plain_letter(base) or is_letter_with_marks(base, table)):
+        return False
+    return all(is_mark(table[i]) for i in combining_ids[1:])
+
+def is_letter(codepoint, table):
+    """Return true for letter with or without diacritical marks."""
+    return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)
+
+def get_plain_letter(codepoint, table):
+    """Return the base codepoint without marks.
+
+    For a letter with marks the decomposition is followed until a plain
+    ASCII letter is reached; a plain letter is returned unchanged.
+    Raises ValueError if codepoint is neither.
+    """
+    if is_letter_with_marks(codepoint, table):
+        # Peel off one layer of marks and continue with the base; the
+        # check above guarantees the chain ends at a plain letter.
+        return get_plain_letter(table[codepoint.combining_ids[0]], table)
+    elif is_plain_letter(codepoint):
+        return codepoint
+    else:
+        # String exceptions are rejected by Python 2.6 and later, so
+        # raise a real exception that says what went wrong.
+        raise ValueError("codepoint U+%04X is not a letter" % codepoint.id)
+
+def is_ligature(codepoint, table):
+    """Return true for letters combined with letters.
+
+    Every element of the codepoint's decomposition must be a letter
+    (with or without marks).  An empty decomposition also yields true,
+    so callers are expected to check the decomposition length first.
+    """
+    for combining_id in codepoint.combining_ids:
+        if not is_letter(table[combining_id], table):
+            return False
+    return True
+
+def get_plain_letters(codepoint, table):
+    """Return a list of plain letters from a ligature.
+
+    Each element of the decomposition is reduced to its base letter;
+    the result holds Codepoint objects in decomposition order.
+    """
+    assert(is_ligature(codepoint, table))
+    plain_letters = []
+    for combining_id in codepoint.combining_ids:
+        plain_letters.append(get_plain_letter(table[combining_id], table))
+    return plain_letters
+
+def main(expand_ligatures):
+    """Read UnicodeData.txt on stdin and print unaccent rules on stdout.
+
+    One rule is printed for every letter codepoint that decomposes into
+    a letter plus marks, followed by a fixed list of special cases.
+    When expand_ligatures is true, letters that decompose into several
+    letters are printed as well, together with extra special cases
+    whose replacement is more than one letter.
+    """
+    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
+    decomposition_type_pattern = re.compile(" *<[^>]*> *")
+
+    table = {}
+    codepoints = []
+
+    # read everything we need into memory
+    for line in sys.stdin:
+        fields = line.split(";")
+        if len(fields) > 5:
+            # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
+            general_category = fields[2]
+            # drop the optional <type> tag so only hex codepoints remain
+            decomposition = decomposition_type_pattern.sub(' ', fields[5])
+            codepoint_id = int(fields[0], 16)
+            combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
+            codepoint = Codepoint(codepoint_id, general_category, combining_ids)
+            table[codepoint_id] = codepoint
+            codepoints.append(codepoint)
+
+    # walk through all the codepoints looking for interesting mappings
+    for codepoint in codepoints:
+        if codepoint.general_category.startswith('L') and \
+           len(codepoint.combining_ids) > 1:
+            if is_letter_with_marks(codepoint, table):
+                print_record(codepoint.id,
+                             unichr(get_plain_letter(codepoint, table).id))
+            elif expand_ligatures and is_ligature(codepoint, table):
+                print_record(codepoint.id,
+                             "".join(unichr(combining_codepoint.id)
+                                     for combining_codepoint
+                                     in get_plain_letters(codepoint, table)))
+
+    # some special cases
+    print_record(0x00d8, "O") # LATIN CAPITAL LETTER O WITH STROKE
+    print_record(0x00f8, "o") # LATIN SMALL LETTER O WITH STROKE
+    print_record(0x0110, "D") # LATIN CAPITAL LETTER D WITH STROKE
+    print_record(0x0111, "d") # LATIN SMALL LETTER D WITH STROKE
+    print_record(0x0131, "i") # LATIN SMALL LETTER DOTLESS I
+    print_record(0x0126, "H") # LATIN CAPITAL LETTER H WITH STROKE
+    print_record(0x0127, "h") # LATIN SMALL LETTER H WITH STROKE
+    print_record(0x0141, "L") # LATIN CAPITAL LETTER L WITH STROKE
+    print_record(0x0142, "l") # LATIN SMALL LETTER L WITH STROKE
+    print_record(0x0149, "'n") # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
+    print_record(0x0166, "T") # LATIN CAPITAL LETTER T WITH STROKE
+    print_record(0x0167, "t") # LATIN SMALL LETTER T WITH STROKE
+    print_record(0x0401, u"\u0415") # CYRILLIC CAPITAL LETTER IO
+    print_record(0x0451, u"\u0435") # CYRILLIC SMALL LETTER IO
+    if expand_ligatures:
+        print_record(0x00c6, "AE") # LATIN CAPITAL LETTER AE
+        print_record(0x00df, "ss") # LATIN SMALL LETTER SHARP S
+        print_record(0x00e6, "ae") # LATIN SMALL LETTER AE
+        print_record(0x0152, "OE") # LATIN CAPITAL LIGATURE OE
+        print_record(0x0153, "oe") # LATIN SMALL LIGATURE OE
+
+if __name__ == "__main__":
+    # Ligatures are expanded only when --expand-ligatures is the sole
+    # command-line argument; any other argument list leaves it off.
+    main(sys.argv[1:] == ["--expand-ligatures"])
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
index cc2f7a65858..73c24a188ba 100644
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -4,22 +4,59 @@
à A
Ä A
Å A
-Æ A
+Ç C
+È E
+É E
+Ê E
+Ë E
+Ì I
+Í I
+Î I
+Ï I
+Ñ N
+Ò O
+Ó O
+Ô O
+Õ O
+Ö O
+Ù U
+Ú U
+Û U
+Ü U
+Ý Y
à a
á a
â a
ã a
ä a
å a
-æ a
+ç c
+è e
+é e
+ê e
+ë e
+ì i
+í i
+î i
+ï i
+ñ n
+ò o
+ó o
+ô o
+õ o
+ö o
+ù u
+ú u
+û u
+ü u
+ý y
+ÿ y
Ā A
ā a
Ă A
ă a
Ą A
ą a
-Ç C
-ç c
Ć C
ć c
Ĉ C
@@ -30,16 +67,6 @@
č c
Ď D
ď d
-Đ D
-đ d
-È E
-É E
-Ê E
-Ë E
-è e
-é e
-ê e
-ë e
Ē E
ē e
Ĕ E
@@ -60,17 +87,7 @@
ģ g
Ĥ H
ĥ h
-Ħ H
-ħ h
Ĩ I
-Ì I
-Í I
-Î I
-Ï I
-ì i
-í i
-î i
-ï i
ĩ i
Ī I
ī i
@@ -79,62 +96,36 @@
Į I
į i
İ I
-ı i
-IJ I
-ij i
+IJ IJ
+ij ij
Ĵ J
ĵ j
Ķ K
ķ k
-ĸ k
Ĺ L
ĺ l
Ļ L
ļ l
Ľ L
ľ l
-Ŀ L
-ŀ l
-Ł L
-ł l
-Ñ N
-ñ n
Ń N
ń n
Ņ N
ņ n
Ň N
ň n
-ʼn n
-Ŋ N
-ŋ n
-Ò O
-Ó O
-Ô O
-Õ O
-Ö O
-ò o
-ó o
-ô o
-õ o
-ö o
Ō O
ō o
Ŏ O
ŏ o
Ő O
ő o
-Œ E
-œ e
-Ø O
-ø o
Ŕ R
ŕ r
Ŗ R
ŗ r
Ř R
ř r
-ß S
Ś S
ś s
Ŝ S
@@ -147,16 +138,6 @@
ţ t
Ť T
ť t
-Ŧ T
-ŧ t
-Ù U
-Ú U
-Û U
-Ü U
-ù u
-ú u
-û u
-ü u
Ũ U
ũ u
Ū U
@@ -171,9 +152,6 @@
ų u
Ŵ W
ŵ w
-Ý Y
-ý y
-ÿ y
Ŷ Y
ŷ y
Ÿ Y
@@ -183,5 +161,253 @@
ż z
Ž Z
ž z
-ё е
+Ơ O
+ơ o
+Ư U
+ư u
+DŽ DZ
+Dž Dz
+dž dz
+LJ LJ
+Lj Lj
+lj lj
+NJ NJ
+Nj Nj
+nj nj
+Ǎ A
+ǎ a
+Ǐ I
+ǐ i
+Ǒ O
+ǒ o
+Ǔ U
+ǔ u
+Ǧ G
+ǧ g
+Ǩ K
+ǩ k
+Ǫ O
+ǫ o
+ǰ j
+DZ DZ
+Dz Dz
+dz dz
+Ǵ G
+ǵ g
+Ǹ N
+ǹ n
+Ȁ A
+ȁ a
+Ȃ A
+ȃ a
+Ȅ E
+ȅ e
+Ȇ E
+ȇ e
+Ȉ I
+ȉ i
+Ȋ I
+ȋ i
+Ȍ O
+ȍ o
+Ȏ O
+ȏ o
+Ȑ R
+ȑ r
+Ȓ R
+ȓ r
+Ȕ U
+ȕ u
+Ȗ U
+ȗ u
+Ș S
+ș s
+Ț T
+ț t
+Ȟ H
+ȟ h
+Ȧ A
+ȧ a
+Ȩ E
+ȩ e
+Ȯ O
+ȯ o
+Ȳ Y
+ȳ y
+Ḁ A
+ḁ a
+Ḃ B
+ḃ b
+Ḅ B
+ḅ b
+Ḇ B
+ḇ b
+Ḋ D
+ḋ d
+Ḍ D
+ḍ d
+Ḏ D
+ḏ d
+Ḑ D
+ḑ d
+Ḓ D
+ḓ d
+Ḙ E
+ḙ e
+Ḛ E
+ḛ e
+Ḟ F
+ḟ f
+Ḡ G
+ḡ g
+Ḣ H
+ḣ h
+Ḥ H
+ḥ h
+Ḧ H
+ḧ h
+Ḩ H
+ḩ h
+Ḫ H
+ḫ h
+Ḭ I
+ḭ i
+Ḱ K
+ḱ k
+Ḳ K
+ḳ k
+Ḵ K
+ḵ k
+Ḷ L
+ḷ l
+Ḻ L
+ḻ l
+Ḽ L
+ḽ l
+Ḿ M
+ḿ m
+Ṁ M
+ṁ m
+Ṃ M
+ṃ m
+Ṅ N
+ṅ n
+Ṇ N
+ṇ n
+Ṉ N
+ṉ n
+Ṋ N
+ṋ n
+Ṕ P
+ṕ p
+Ṗ P
+ṗ p
+Ṙ R
+ṙ r
+Ṛ R
+ṛ r
+Ṟ R
+ṟ r
+Ṡ S
+ṡ s
+Ṣ S
+ṣ s
+Ṫ T
+ṫ t
+Ṭ T
+ṭ t
+Ṯ T
+ṯ t
+Ṱ T
+ṱ t
+Ṳ U
+ṳ u
+Ṵ U
+ṵ u
+Ṷ U
+ṷ u
+Ṽ V
+ṽ v
+Ṿ V
+ṿ v
+Ẁ W
+ẁ w
+Ẃ W
+ẃ w
+Ẅ W
+ẅ w
+Ẇ W
+ẇ w
+Ẉ W
+ẉ w
+Ẋ X
+ẋ x
+Ẍ X
+ẍ x
+Ẏ Y
+ẏ y
+Ẑ Z
+ẑ z
+Ẓ Z
+ẓ z
+Ẕ Z
+ẕ z
+ẖ h
+ẗ t
+ẘ w
+ẙ y
+Ạ A
+ạ a
+Ả A
+ả a
+Ẹ E
+ẹ e
+Ẻ E
+ẻ e
+Ẽ E
+ẽ e
+Ỉ I
+ỉ i
+Ị I
+ị i
+Ọ O
+ọ o
+Ỏ O
+ỏ o
+Ụ U
+ụ u
+Ủ U
+ủ u
+Ỳ Y
+ỳ y
+Ỵ Y
+ỵ y
+Ỷ Y
+ỷ y
+Ỹ Y
+ỹ y
+ff ff
+fi fi
+fl fl
+ffi ffi
+ffl ffl
+st st
+Ø O
+ø o
+Đ D
+đ d
+ı i
+Ħ H
+ħ h
+Ł L
+ł l
+ʼn 'n
+Ŧ T
+ŧ t
Ё Е
+ё е
+Æ AE
+ß ss
+æ ae
+Œ OE
+œ oe