aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--contrib/Makefile3
-rw-r--r--contrib/README4
-rw-r--r--contrib/unaccent/Makefile24
-rw-r--r--contrib/unaccent/expected/unaccent.out58
-rw-r--r--contrib/unaccent/sql/unaccent.sql19
-rw-r--r--contrib/unaccent/unaccent.c318
-rw-r--r--contrib/unaccent/unaccent.rules187
-rw-r--r--contrib/unaccent/unaccent.sql.in33
-rw-r--r--contrib/unaccent/uninstall_unaccent.sql9
-rw-r--r--doc/src/sgml/contrib.sgml3
-rw-r--r--doc/src/sgml/filelist.sgml3
-rw-r--r--doc/src/sgml/unaccent.sgml150
12 files changed, 808 insertions, 3 deletions
diff --git a/contrib/Makefile b/contrib/Makefile
index 85cabd8618a..8543b5287fe 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -1,4 +1,4 @@
-# $PostgreSQL: pgsql/contrib/Makefile,v 1.88 2009/08/07 20:50:21 petere Exp $
+# $PostgreSQL: pgsql/contrib/Makefile,v 1.89 2009/08/18 10:34:39 teodor Exp $
subdir = contrib
top_builddir = ..
@@ -39,6 +39,7 @@ SUBDIRS = \
tablefunc \
test_parser \
tsearch2 \
+ unaccent \
vacuumlo
ifeq ($(with_openssl),yes)
diff --git a/contrib/README b/contrib/README
index 1ae49adc704..a8396a5bfad 100644
--- a/contrib/README
+++ b/contrib/README
@@ -169,6 +169,10 @@ tsearch2 -
Pavel Stehule <pavel.stehule@gmail.com>, based on code originally by
Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov <oleg@sai.msu.su>.
+unaccent -
+ Unaccent dictionary for text search
+ Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov <oleg@sai.msu.su>.
+
uuid-ossp -
UUID generation functions
by Peter Eisentraut <peter_e@gmx.net>
diff --git a/contrib/unaccent/Makefile b/contrib/unaccent/Makefile
new file mode 100644
index 00000000000..91b04fc2753
--- /dev/null
+++ b/contrib/unaccent/Makefile
@@ -0,0 +1,24 @@
+# $PostgreSQL: pgsql/contrib/unaccent/Makefile,v 1.1 2009/08/18 10:34:39 teodor Exp $
+
+MODULE_big = unaccent
+OBJS = unaccent.o
+
+DATA_built = unaccent.sql
+DATA = uninstall_unaccent.sql
+DATA_TSEARCH = unaccent.rules
+REGRESS = unaccent
+
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+# subdir must name this module's own directory (it was copied from pg_trgm)
+subdir = contrib/unaccent
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+# Redefine REGRESS_OPTS: the regression test needs a UTF8-encoded database
+REGRESS_OPTS = --dbname=$(CONTRIB_TESTDB) --multibyte=UTF8 --no-locale
diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out
new file mode 100644
index 00000000000..8d197c50be7
--- /dev/null
+++ b/contrib/unaccent/expected/unaccent.out
@@ -0,0 +1,58 @@
+SET client_min_messages = warning;
+\set ECHO none
+RESET client_min_messages;
+SET client_encoding TO 'KOI8';
+SELECT unaccent('foobar');
+ unaccent
+----------
+ foobar
+(1 row)
+
+SELECT unaccent('L肆');
+ unaccent
+----------
+ 盘肆
+(1 row)
+
+SELECT unaccent('出殡');
+ unaccent
+----------
+ 弼殡
+(1 row)
+
+SELECT unaccent('unaccent', 'foobar');
+ unaccent
+----------
+ foobar
+(1 row)
+
+SELECT unaccent('unaccent', 'L肆');
+ unaccent
+----------
+ 盘肆
+(1 row)
+
+SELECT unaccent('unaccent', '出殡');
+ unaccent
+----------
+ 弼殡
+(1 row)
+
+SELECT ts_lexize('unaccent', 'foobar');
+ ts_lexize
+-----------
+
+(1 row)
+
+SELECT ts_lexize('unaccent', 'L肆');
+ ts_lexize
+-----------
+ {盘肆}
+(1 row)
+
+SELECT ts_lexize('unaccent', '出殡');
+ ts_lexize
+-----------
+ {弼殡}
+(1 row)
+
diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql
new file mode 100644
index 00000000000..71ab5bb4358
--- /dev/null
+++ b/contrib/unaccent/sql/unaccent.sql
@@ -0,0 +1,19 @@
+SET client_min_messages = warning;
+\set ECHO none
+\i unaccent.sql
+\set ECHO all
+RESET client_min_messages;
+
+SET client_encoding TO 'KOI8';
+
+SELECT unaccent('foobar');
+SELECT unaccent('L肆');
+SELECT unaccent('出殡');
+
+SELECT unaccent('unaccent', 'foobar');
+SELECT unaccent('unaccent', 'L肆');
+SELECT unaccent('unaccent', '出殡');
+
+SELECT ts_lexize('unaccent', 'foobar');
+SELECT ts_lexize('unaccent', 'L肆');
+SELECT ts_lexize('unaccent', '出殡');
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c
new file mode 100644
index 00000000000..7b5086b9587
--- /dev/null
+++ b/contrib/unaccent/unaccent.c
@@ -0,0 +1,318 @@
+/*-------------------------------------------------------------------------
+ *
+ * unaccent.c
+ * Text search unaccent dictionary
+ *
+ * Copyright (c) 2009, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/contrib/unaccent/unaccent.c,v 1.1 2009/08/18 10:34:39 teodor Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "catalog/namespace.h"
+#include "commands/defrem.h"
+#include "mb/pg_wchar.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_public.h"
+#include "utils/builtins.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * The unaccent dictionary finds the character to replace by walking an
+ * uncompressed tree that is keyed on the bytes of the source character,
+ * first byte first.  Each node of the tree is an array of 256 SuffixChar
+ * structs; the n-th element of the array corresponds to byte value n.
+ *
+ * Fields:
+ *   nextChar   - node for the byte that follows this one, or NULL if no
+ *                longer source sequence continues through this byte
+ *   replaceTo  - replacement bytes when a source sequence ends at this
+ *                byte, otherwise NULL; stored as exactly replacelen bytes
+ *                without a terminating NUL
+ *   replacelen - length of replaceTo in bytes
+ */
+typedef struct SuffixChar {
+ struct SuffixChar *nextChar;
+ char *replaceTo;
+ int replacelen;
+} SuffixChar;
+
+/*
+ * placeChar - register one rule in the tree, consuming str byte by byte.
+ *
+ * node is the root of the (sub)tree to insert into, or NULL if it does not
+ * exist yet.  str/lenstr is the source byte sequence and replaceTo/replacelen
+ * the replacement to store for it.  Missing 256-entry nodes are created on
+ * the way down.  If the sequence already has a replacement, a WARNING is
+ * emitted and the existing one is kept.
+ *
+ * Returns the root node (newly allocated when node was NULL).
+ */
+static SuffixChar*
+placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
+{
+    SuffixChar *root = node;
+    SuffixChar **slot = &root;      /* pointer that should hold the current level */
+    SuffixChar *entry;
+
+    for (;;)
+    {
+        /* create the zero-filled node for this level on first use */
+        if ( *slot == NULL )
+            *slot = palloc0(sizeof(SuffixChar) * 256);
+
+        entry = *slot + *str;
+
+        /* the last byte of the sequence owns the replacement */
+        if ( lenstr == 1 )
+            break;
+
+        slot = &entry->nextChar;
+        str++;
+        lenstr--;
+    }
+
+    if ( entry->replaceTo )
+        elog(WARNING, "duplicate TO argument, use first one");
+    else
+    {
+        entry->replacelen = replacelen;
+        entry->replaceTo = palloc( replacelen );
+        memcpy(entry->replaceTo, replaceTo, replacelen);
+    }
+
+    return root;
+}
+
+/*
+ * initSuffixTree - build the lookup tree from a rules file.
+ *
+ * filename is the base name of the rules file; it is resolved with
+ * get_tsearch_config_filename() using the "rules" extension.  Each line is
+ * expected to hold two whitespace-separated tokens: the source character and
+ * its replacement.  Lines that do not parse as such a pair are ignored.
+ *
+ * Reading a line may raise ERRCODE_UNTRANSLATABLE_CHARACTER (per the original
+ * comment, from the encoding conversion done inside tsearch_readline()).
+ * That error is swallowed and reading is resumed; any other error is
+ * re-thrown.  The outer loop ends only once a pass reaches end of file
+ * without being interrupted.
+ *
+ * Returns the root of the tree, or NULL if the file contained no usable rule.
+ * Raises ERROR if the file cannot be opened.
+ */
+static SuffixChar*
+initSuffixTree(char *filename)
+{
+    /* modified inside PG_TRY and read after it, hence volatile */
+    SuffixChar *volatile rootSuffixTree = NULL;
+    MemoryContext ccxt = CurrentMemoryContext;
+    tsearch_readline_state trst;
+    volatile bool skip;
+
+    filename = get_tsearch_config_filename(filename, "rules");
+    if (!tsearch_readline_begin(&trst, filename))
+        ereport(ERROR,
+                (errcode(ERRCODE_CONFIG_FILE_ERROR),
+                 errmsg("could not open unaccent file \"%s\": %m",
+                        filename)));
+
+    do
+    {
+        char src[4096];
+        char trg[4096];
+
+        /* stays true if the pass below is interrupted by an error */
+        skip = true;
+
+        PG_TRY();
+        {
+            char *line;
+
+            /*
+             * pg_do_encoding_conversion() (called by tsearch_readline())
+             * will emit exception if it finds untranslatable characters in
+             * current locale.  We just skip such lines and carry on with
+             * the next one.
+             */
+            while ((line = tsearch_readline(&trst)) != NULL)
+            {
+                /* field widths keep each token within its 4096-byte buffer */
+                if ( sscanf(line, "%4095s\t%4095s\n", src, trg) == 2 )
+                    rootSuffixTree = placeChar(rootSuffixTree,
+                                               (unsigned char*)src, strlen(src),
+                                               trg, strlen(trg));
+
+                /* release the line whether or not it held a rule */
+                pfree(line);
+            }
+
+            /* reached end of file without an error: we are done */
+            skip = false;
+        }
+        PG_CATCH();
+        {
+            ErrorData *errdata;
+            MemoryContext ecxt;
+
+            /* copy the error out of the error context so it can be inspected */
+            ecxt = MemoryContextSwitchTo(ccxt);
+            errdata = CopyErrorData();
+            if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
+            {
+                FlushErrorState();
+                /* do not let the copy pile up in the caller's context */
+                FreeErrorData(errdata);
+            }
+            else
+            {
+                MemoryContextSwitchTo(ecxt);
+                PG_RE_THROW();
+            }
+        }
+        PG_END_TRY();
+    }
+    while(skip);
+
+    tsearch_readline_end(&trst);
+
+    return rootSuffixTree;
+}
+
+/*
+ * findReplaceTo - look up one multibyte character in the tree.
+ *
+ * Walks one tree level per byte of src.  Returns the entry reached by the
+ * last of the srclen bytes (the caller must still check its replaceTo), or
+ * NULL when the tree has no node for some earlier byte.
+ */
+static SuffixChar *
+findReplaceTo( SuffixChar *node, unsigned char *src, int srclen )
+{
+    SuffixChar *level;
+    SuffixChar *entry;
+    int pos = 0;
+
+    for ( level = node; level != NULL; level = entry->nextChar )
+    {
+        entry = &level[ src[pos] ];
+
+        /* on the final byte the entry itself is the answer */
+        if ( srclen - pos == 1 )
+            return entry;
+
+        pos++;
+    }
+
+    return NULL;
+}
+
+PG_FUNCTION_INFO_V1(unaccent_init);
+Datum unaccent_init(PG_FUNCTION_ARGS);
+/*
+ * unaccent_init - INIT method of the unaccent text search template.
+ *
+ * Argument 0 is the list of dictionary options (DefElem nodes).  Exactly one
+ * option, "Rules" (matched case-insensitively), is accepted and required; it
+ * names the rules file to load.  Any other option, a repeated Rules option,
+ * or a missing one raises ERROR.
+ *
+ * Returns the root of the tree built from the rules file.
+ */
+Datum
+unaccent_init(PG_FUNCTION_ARGS)
+{
+    List *dictoptions = (List *) PG_GETARG_POINTER(0);
+    SuffixChar *rootSuffixTree = NULL;
+    bool fileloaded = false;
+    ListCell *cell;
+
+    foreach(cell, dictoptions)
+    {
+        DefElem *option = (DefElem *) lfirst(cell);
+
+        /* Rules is the only option we understand */
+        if (pg_strcasecmp("Rules", option->defname) != 0)
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("unrecognized Unaccent parameter: \"%s\"",
+                            option->defname)));
+
+        if (fileloaded)
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("multiple Rules parameters")));
+
+        rootSuffixTree = initSuffixTree(defGetString(option));
+        fileloaded = true;
+    }
+
+    if (!fileloaded)
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("missing Rules parameter")));
+
+    PG_RETURN_POINTER(rootSuffixTree);
+}
+
+PG_FUNCTION_INFO_V1(unaccent_lexize);
+Datum unaccent_lexize(PG_FUNCTION_ARGS);
+/*
+ * unaccent_lexize - LEXIZE method of the unaccent text search template.
+ *
+ * Argument 0 is the tree returned by unaccent_init, argument 1 the input
+ * bytes and argument 2 their length.  The input is scanned one multibyte
+ * character at a time; every character that has a rule is replaced.
+ *
+ * Returns NULL if no character was replaced.  Otherwise returns a
+ * two-element TSLexeme array (the second element is left zeroed) whose first
+ * element carries the rewritten, NUL-terminated string and the TSL_FILTER
+ * flag.
+ */
+Datum
+unaccent_lexize(PG_FUNCTION_ARGS)
+{
+    SuffixChar *rootSuffixTree = (SuffixChar*)PG_GETARG_POINTER(0);
+    char *srcchar = (char *) PG_GETARG_POINTER(1);
+    int32 len = PG_GETARG_INT32(2);
+    char *srcstart = srcchar;
+    char *trgbuf = NULL;        /* output buffer, allocated on first replacement */
+    Size trgsize = 0;           /* allocated size of trgbuf */
+    Size trglen = 0;            /* bytes of trgbuf in use */
+    TSLexeme *res = NULL;
+
+    while( srcchar - srcstart < len )
+    {
+        int charlen = pg_mblen(srcchar);
+        SuffixChar *node = findReplaceTo( rootSuffixTree, (unsigned char *) srcchar, charlen );
+        const char *piece = NULL;   /* bytes to append for this character */
+        int piecelen = 0;
+
+        if ( node && node->replaceTo )
+        {
+            if ( !res )
+            {
+                /* allocate res only if it's needed */
+                res = palloc0(sizeof(TSLexeme) * 2);
+                res->flags = TSL_FILTER;
+
+                /* initial guess only; the buffer is enlarged below on demand */
+                trgsize = len * pg_database_encoding_max_length() + 1 /* \0 */;
+                trgbuf = palloc( trgsize );
+
+                /* carry over the unchanged characters seen so far */
+                trglen = srcchar - srcstart;
+                if ( trglen > 0 )
+                    memcpy(trgbuf, srcstart, trglen);
+            }
+            piece = node->replaceTo;
+            piecelen = node->replacelen;
+        }
+        else if ( res )
+        {
+            /* no rule for this character: copy it through */
+            piece = srcchar;
+            piecelen = charlen;
+        }
+
+        if ( piecelen > 0 )
+        {
+            /*
+             * A replacement taken from the rules file may be longer than the
+             * longest character of the database encoding, so make sure the
+             * piece and the trailing \0 really fit before copying.
+             */
+            if ( trglen + piecelen + 1 > trgsize )
+            {
+                do
+                    trgsize *= 2;
+                while ( trglen + piecelen + 1 > trgsize );
+                trgbuf = repalloc( trgbuf, trgsize );
+            }
+            memcpy( trgbuf + trglen, piece, piecelen );
+            trglen += piecelen;
+        }
+
+        srcchar += charlen;
+    }
+
+    if ( res )
+    {
+        trgbuf[trglen] = '\0';
+        /* assigned last because repalloc may have moved the buffer */
+        res->lexeme = trgbuf;
+    }
+
+    PG_RETURN_POINTER(res);
+}
+
+/*
+ * unaccent_dict - SQL-callable wrapper around an unaccent dictionary.
+ *
+ * Called with one argument (the text), the dictionary named "unaccent" is
+ * looked up; called with two, argument 0 is the dictionary OID and argument 1
+ * the text.  The dictionary's lexize method is invoked on the whole string.
+ * If it returns nothing (or a result without a lexeme) a copy of the input is
+ * returned, otherwise the rewritten string.
+ */
+PG_FUNCTION_INFO_V1(unaccent_dict);
+Datum unaccent_dict(PG_FUNCTION_ARGS);
+Datum
+unaccent_dict(PG_FUNCTION_ARGS)
+{
+    const bool useDefaultDict = (PG_NARGS() == 1);
+    int strArg = useDefaultDict ? 0 : 1;
+    Oid dictOid;
+    text *str;
+    TSDictionaryCacheEntry *dict;
+    TSLexeme *res;
+    text *txt;
+
+    if (useDefaultDict)
+        dictOid = TSDictionaryGetDictid(stringToQualifiedNameList("unaccent"), false);
+    else
+        dictOid = PG_GETARG_OID(0);
+
+    str = PG_GETARG_TEXT_P(strArg);
+
+    dict = lookup_ts_dictionary_cache(dictOid);
+
+    res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
+                                                     PointerGetDatum(dict->dictData),
+                                                     PointerGetDatum(VARDATA(str)),
+                                                     Int32GetDatum(VARSIZE(str) - VARHDRSZ),
+                                                     PointerGetDatum(NULL)));
+
+    PG_FREE_IF_COPY(str, strArg);
+
+    /* nothing was rewritten: hand back a copy of the input */
+    if ( res == NULL || res->lexeme == NULL )
+    {
+        if ( res != NULL )
+            pfree(res);
+        PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
+    }
+
+    txt = cstring_to_text(res->lexeme);
+
+    pfree(res->lexeme);
+    pfree(res);
+
+    PG_RETURN_TEXT_P(txt);
+}
diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules
new file mode 100644
index 00000000000..cc2f7a65858
--- /dev/null
+++ b/contrib/unaccent/unaccent.rules
@@ -0,0 +1,187 @@
+脌 A
+脕 A
+脗 A
+脙 A
+脛 A
+脜 A
+脝 A
+脿 a
+谩 a
+芒 a
+茫 a
+盲 a
+氓 a
+忙 a
+膧 A
+膩 a
+膫 A
+膬 a
+膭 A
+膮 a
+脟 C
+莽 c
+膯 C
+膰 c
+膱 C
+膲 c
+膴 C
+膵 c
+膶 C
+膷 c
+膸 D
+膹 d
+膼 D
+膽 d
+脠 E
+脡 E
+脢 E
+脣 E
+猫 e
+茅 e
+锚 e
+毛 e
+膾 E
+膿 e
+臄 E
+臅 e
+臇 E
+臈 e
+臉 E
+臋 e
+臍 E
+臎 e
+臏 G
+臐 g
+臑 G
+臒 g
+臓 G
+摹 g
+蘑 G
+模 g
+膜 H
+磨 h
+摩 H
+魔 h
+抹 I
+脤 I
+脥 I
+脦 I
+脧 I
+矛 i
+铆 i
+卯 i
+茂 i
+末 i
+莫 I
+墨 i
+默 I
+沫 i
+漠 I
+寞 i
+陌 I
+谋 i
+牟 I
+某 i
+拇 J
+牡 j
+亩 K
+姆 k
+母 k
+墓 L
+暮 l
+幕 L
+募 l
+慕 L
+木 l
+目 L
+艀 l
+艁 L
+艂 l
+脩 N
+帽 n
+艃 N
+艅 n
+艆 N
+艈 n
+艊 N
+艌 n
+艍 n
+艎 N
+艐 n
+脪 O
+脫 O
+脭 O
+脮 O
+脰 O
+貌 o
+贸 o
+么 o
+玫 o
+枚 o
+艑 O
+艒 o
+艓 O
+艔 o
+艕 O
+艖 o
+艗 E
+艙 e
+脴 O
+酶 o
+艛 R
+艜 r
+艝 R
+艞 r
+艠 R
+艡 r
+脽 S
+艢 S
+艣 s
+艤 S
+艥 s
+艦 S
+艧 s
+艩 S
+拧 s
+泞 T
+牛 t
+扭 T
+钮 t
+纽 T
+脓 t
+脵 U
+脷 U
+脹 U
+脺 U
+霉 u
+煤 u
+没 u
+眉 u
+浓 U
+农 u
+弄 U
+奴 u
+努 U
+怒 u
+女 U
+暖 u
+虐 U
+疟 u
+挪 U
+懦 u
+糯 W
+诺 w
+脻 Y
+媒 y
+每 y
+哦 Y
+欧 y
+鸥 Y
+殴 Z
+藕 z
+呕 Z
+偶 z
+沤 Z
+啪 z
+褢 械
+衼 袝
diff --git a/contrib/unaccent/unaccent.sql.in b/contrib/unaccent/unaccent.sql.in
new file mode 100644
index 00000000000..ba981398faf
--- /dev/null
+++ b/contrib/unaccent/unaccent.sql.in
@@ -0,0 +1,33 @@
+/* $PostgreSQL: pgsql/contrib/unaccent/unaccent.sql.in,v 1.1 2009/08/18 10:34:39 teodor Exp $ */
+
+-- unaccent(dictionary, text): remove accents using the given dictionary.
+-- Both unaccent() overloads are bound to the same C symbol, unaccent_dict.
+CREATE OR REPLACE FUNCTION unaccent(regdictionary, text)
+ RETURNS text
+ AS 'MODULE_PATHNAME', 'unaccent_dict'
+ LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;
+
+-- unaccent(text): one-argument form, with no dictionary argument.
+CREATE OR REPLACE FUNCTION unaccent(text)
+ RETURNS text
+ AS 'MODULE_PATHNAME', 'unaccent_dict'
+ LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;
+
+-- INIT support function for the text search template created below.
+CREATE OR REPLACE FUNCTION unaccent_init(internal)
+ RETURNS internal
+ AS 'MODULE_PATHNAME', 'unaccent_init'
+ LANGUAGE C;
+
+-- LEXIZE support function for the text search template created below.
+CREATE OR REPLACE FUNCTION unaccent_lexize(internal,internal,internal,internal)
+ RETURNS internal
+ AS 'MODULE_PATHNAME', 'unaccent_lexize'
+ LANGUAGE C;
+
+-- Template tying the two support functions together.
+CREATE TEXT SEARCH TEMPLATE unaccent (
+ INIT = unaccent_init,
+ LEXIZE = unaccent_lexize
+);
+
+
+-- Default dictionary built on the template, with RULES set to 'unaccent'.
+CREATE TEXT SEARCH DICTIONARY unaccent (
+ TEMPLATE = unaccent,
+ RULES = 'unaccent'
+);
+
diff --git a/contrib/unaccent/uninstall_unaccent.sql b/contrib/unaccent/uninstall_unaccent.sql
new file mode 100644
index 00000000000..89e3627fc8c
--- /dev/null
+++ b/contrib/unaccent/uninstall_unaccent.sql
@@ -0,0 +1,9 @@
+/* $PostgreSQL: pgsql/contrib/unaccent/uninstall_unaccent.sql,v 1.1 2009/08/18 10:34:39 teodor Exp $ */
+
+-- Drop order: the unaccent() functions first, then the dictionary, then the
+-- template, and finally the template's init/lexize support functions.
+DROP FUNCTION IF EXISTS unaccent(regdictionary, text) CASCADE;
+DROP FUNCTION IF EXISTS unaccent(text) CASCADE;
+DROP TEXT SEARCH DICTIONARY IF EXISTS unaccent CASCADE;
+DROP TEXT SEARCH TEMPLATE IF EXISTS unaccent CASCADE;
+DROP FUNCTION IF EXISTS unaccent_init(internal) CASCADE;
+DROP FUNCTION IF EXISTS unaccent_lexize(internal,internal,internal,internal) CASCADE;
+
diff --git a/doc/src/sgml/contrib.sgml b/doc/src/sgml/contrib.sgml
index 0ef92b48968..cffbc55249c 100644
--- a/doc/src/sgml/contrib.sgml
+++ b/doc/src/sgml/contrib.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.13 2009/04/27 16:27:35 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.14 2009/08/18 10:34:39 teodor Exp $ -->
<appendix id="contrib">
<title>Additional Supplied Modules</title>
@@ -113,6 +113,7 @@ psql -d dbname -f <replaceable>SHAREDIR</>/contrib/<replaceable>module</>.sql
&tablefunc;
&test-parser;
&tsearch2;
+ &unaccent;
&uuid-ossp;
&vacuumlo;
&xml2;
diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml
index 7e194f7bccb..bee66008b66 100644
--- a/doc/src/sgml/filelist.sgml
+++ b/doc/src/sgml/filelist.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.63 2009/08/17 22:14:44 petere Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.64 2009/08/18 10:34:39 teodor Exp $ -->
<!entity history SYSTEM "history.sgml">
<!entity info SYSTEM "info.sgml">
@@ -126,6 +126,7 @@
<!entity tablefunc SYSTEM "tablefunc.sgml">
<!entity test-parser SYSTEM "test-parser.sgml">
<!entity tsearch2 SYSTEM "tsearch2.sgml">
+<!entity unaccent SYSTEM "unaccent.sgml">
<!entity uuid-ossp SYSTEM "uuid-ossp.sgml">
<!entity vacuumlo SYSTEM "vacuumlo.sgml">
<!entity xml2 SYSTEM "xml2.sgml">
diff --git a/doc/src/sgml/unaccent.sgml b/doc/src/sgml/unaccent.sgml
new file mode 100644
index 00000000000..b3c7bbee489
--- /dev/null
+++ b/doc/src/sgml/unaccent.sgml
@@ -0,0 +1,150 @@
+<sect1 id="unaccent">
+ <title>unaccent</title>
+
+ <indexterm zone="unaccent">
+ <primary>unaccent</primary>
+ </indexterm>
+
+ <para>
+ <filename>unaccent</> removes accents (diacritic signs) from a lexeme.
+ It is a filtering dictionary, which means that its output is
+ always passed to the next dictionary (if any), contrary to the standard
+ behaviour. Currently, it supports the most important accents from European
+ languages.
+ </para>
+
+ <para>
+ Limitation: The current implementation of the <filename>unaccent</>
+ dictionary cannot be used as a normalizing dictionary for the
+ <filename>thesaurus</filename> dictionary.
+ </para>
+
+ <sect2>
+ <title>Configuration</title>
+
+ <para>
+ An <literal>unaccent</> dictionary accepts the following options:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ <literal>RULES</> is the base name of the file containing the list of
+ translation rules. This file must be stored in
+ <filename>$SHAREDIR/tsearch_data/</> (where <literal>$SHAREDIR</> means
+ the <productname>PostgreSQL</> installation's shared-data directory).
+ Its name must end in <literal>.rules</> (which is not to be included in
+ the <literal>RULES</> parameter).
+ </para>
+ </listitem>
+ </itemizedlist>
+ <para>
+ The rules file has the following format:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ Each line represents a pair: character_with_accent character_without_accent
+ <programlisting>
+&Agrave; A
+&Aacute; A
+&Acirc; A
+&Atilde; A
+&Auml; A
+&Aring; A
+&AElig; A
+ </programlisting>
+ </para>
+ </listitem>
+ </itemizedlist>
+
+ <para>
+ Look at <filename>unaccent.rules</>, which is installed in
+ <filename>$SHAREDIR/tsearch_data/</>, for an example.
+ </para>
+ </sect2>
+
+ <sect2>
+ <title>Usage</title>
+
+ <para>
+ Running the installation script creates a text search template
+ <literal>unaccent</> and a dictionary <literal>unaccent</>
+ based on it, with default parameters. You can alter the
+ parameters, for example
+
+<programlisting>
+=# ALTER TEXT SEARCH DICTIONARY unaccent (RULES='my_rules');
+</programlisting>
+
+ or create new dictionaries based on the template.
+ </para>
+
+ <para>
+ To test the dictionary, you can try
+
+<programlisting>
+=# select ts_lexize('unaccent','Hôtel');
+ ts_lexize
+-----------
+ {Hotel}
+(1 row)
+</programlisting>
+ </para>
+
+ <para>
+ Filtering dictionaries are useful for the correct operation of the
+ <function>ts_headline</function> function.
+<programlisting>
+=# CREATE TEXT SEARCH CONFIGURATION fr ( COPY = french );
+=# ALTER TEXT SEARCH CONFIGURATION fr
+ ALTER MAPPING FOR hword, hword_part, word
+ WITH unaccent, french_stem;
+=# select to_tsvector('fr','Hôtels de la Mer');
+ to_tsvector
+-------------------
+ 'hotel':1 'mer':4
+(1 row)
+
+=# select to_tsvector('fr','Hôtel de la Mer') @@ to_tsquery('fr','Hotels');
+ ?column?
+----------
+ t
+(1 row)
+=# select ts_headline('fr','Hôtel de la Mer',to_tsquery('fr','Hotels'));
+ ts_headline
+------------------------
+ &lt;b&gt;Hôtel&lt;/b&gt; de la Mer
+(1 row)
+
+</programlisting>
+ </para>
+ </sect2>
+
+ <sect2>
+ <title>Function</title>
+
+ <para>
+ The <function>unaccent</> function removes accents (diacritic signs) from
+ the argument string. Basically, it is a wrapper around the
+ <filename>unaccent</> dictionary.
+ </para>
+
+ <indexterm>
+ <primary>unaccent</primary>
+ </indexterm>
+
+ <synopsis>
+ unaccent(<optional><replaceable class="PARAMETER">dictionary</replaceable>,
+ </optional> <replaceable class="PARAMETER">string</replaceable>)
+ returns <type>text</type>
+ </synopsis>
+
+ <para>
+<programlisting>
+SELECT unaccent('unaccent','Hôtel');
+SELECT unaccent('Hôtel');
+</programlisting>
+ </para>
+ </sect2>
+
+</sect1>