Add support for automatically updating Unicode derived files

We currently have several sets of files generated from data provided by Unicode. These all have ad hoc rules and instructions for updating when new Unicode versions appear, and it's not done consistently.

This patch centralizes and automates the process and makes it part of the release checklist. The Unicode and CLDR versions are specified in Makefile.global.in. There is a new make target "update-unicode" that downloads all the relevant files and runs the generation script.

There is also a new script for generating the table of combining characters for ucs_wcwidth(). That table is now in a separate include file rather than hardcoded into the middle of other code. This is based on the script that was used for generating d8594d123c155aeecd47fc2450f62f5100b2fbf0, but the script itself wasn't committed at that time.

Reviewed-by: John Naylor <john.naylor@2ndquadrant.com>
Discussion: https://www.postgresql.org/message-id/flat/c8d05f42-443e-6c23-819b-05b31759a37c@2ndquadrant.com
author: Peter Eisentraut <peter@eisentraut.org> 2020-01-09 09:54:47 +0100
committer: Peter Eisentraut <peter@eisentraut.org> 2020-01-09 10:08:14 +0100
commit: f85a485f89e2eb38499558c7489f108994410952 (patch)
tree: 6b88bcc3aca9f1da2b76fa911ee86a7b926470ca /contrib
parent: f5fd995a1a24e6571d26b1e29c4dc179112b1003 (diff)
download: postgresql-f85a485f89e2eb38499558c7489f108994410952.tar.gz
postgresql-f85a485f89e2eb38499558c7489f108994410952.zip
3 files changed, 26 insertions, 6 deletions
diff --git a/contrib/unaccent/.gitignore b/contrib/unaccent/.gitignore
index 5dcb3ff9723..bccda7317dc 100644
--- a/contrib/unaccent/.gitignore
+++ b/contrib/unaccent/.gitignore
@@ -2,3 +2,6 @@
 /log/
 /results/
 /tmp_check/
+
+# Downloaded files
+/Latin-ASCII.xml
diff --git a/contrib/unaccent/Makefile b/contrib/unaccent/Makefile
index 92b7f9d78e7..9753bc6ad29 100644
--- a/contrib/unaccent/Makefile
+++ b/contrib/unaccent/Makefile
@@ -26,3 +26,22 @@ top_builddir = ../..
 include $(top_builddir)/src/Makefile.global
 include $(top_srcdir)/contrib/contrib-global.mk
 endif
+
+update-unicode: unaccent.rules
+
+# Default PYTHON to plain "python" when it is not already set (e.g. no --with-python)
+PYTHON ?= python
+
+unaccent.rules: generate_unaccent_rules.py ../../src/common/unicode/UnicodeData.txt Latin-ASCII.xml
+	$(PYTHON) $< --unicode-data-file $(word 2,$^) --latin-ascii-file $(word 3,$^) >$@
+
+# Built by a sub-make in src/common/unicode/ so it is only downloaded once; dependencies must match the rule there
+../../src/common/unicode/UnicodeData.txt: $(top_builddir)/src/Makefile.global
+	$(MAKE) -C $(@D) $(@F)
+
+# Depends on Makefile.global because the download URL is derived from CLDR_VERSION
+Latin-ASCII.xml: $(top_builddir)/src/Makefile.global
+	$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/cldr/release-$(subst .,-,$(CLDR_VERSION))/common/transforms/Latin-ASCII.xml
+
+distclean:
+	rm -f Latin-ASCII.xml
diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py
index acfb4f0b686..a952de510c6 100644
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -20,13 +20,11 @@
 # option is enabled, the XML file of this transliterator [2] -- given as a
 # command line argument -- will be parsed and used.
 #
-# Ideally you should use the latest release for each data set.  For
-# Latin-ASCII.xml, the latest data sets released can be browsed directly
-# via [3].  Note that this script is compatible with at least release 29.
+# Ideally you should use the latest release for each data set.  This
+# script is compatible with at least CLDR release 29.
 #
-# [1] https://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt
-# [2] https://raw.githubusercontent.com/unicode-org/cldr/release-34/common/transforms/Latin-ASCII.xml
-# [3] https://github.com/unicode-org/cldr/tags
+# [1] https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/UnicodeData.txt
+# [2] https://raw.githubusercontent.com/unicode-org/cldr/${TAG}/common/transforms/Latin-ASCII.xml
 
 # BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
 # The approach is to be Python3 compatible with Python2 "backports".
author	Peter Eisentraut <peter@eisentraut.org>	2020-01-09 09:54:47 +0100
committer	Peter Eisentraut <peter@eisentraut.org>	2020-01-09 10:08:14 +0100
commit	f85a485f89e2eb38499558c7489f108994410952 (patch)
tree	6b88bcc3aca9f1da2b76fa911ee86a7b926470ca /contrib
parent	f5fd995a1a24e6571d26b1e29c4dc179112b1003 (diff)
download	postgresql-f85a485f89e2eb38499558c7489f108994410952.tar.gz postgresql-f85a485f89e2eb38499558c7489f108994410952.zip