aboutsummaryrefslogtreecommitdiff
path: root/contrib/unaccent/unaccent.c
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/unaccent/unaccent.c')
-rw-r--r--contrib/unaccent/unaccent.c318
1 files changed, 318 insertions, 0 deletions
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c
new file mode 100644
index 00000000000..7b5086b9587
--- /dev/null
+++ b/contrib/unaccent/unaccent.c
@@ -0,0 +1,318 @@
+/*-------------------------------------------------------------------------
+ *
+ * unaccent.c
+ * Text search unaccent dictionary
+ *
+ * Copyright (c) 2009, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/contrib/unaccent/unaccent.c,v 1.1 2009/08/18 10:34:39 teodor Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "catalog/namespace.h"
+#include "commands/defrem.h"
+#include "mb/pg_wchar.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_public.h"
+#include "utils/builtins.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * The unaccent dictionary uses an uncompressed suffix tree to find the
+ * character to replace. Each node of the tree is an array of 256
+ * SuffixChar structs; the n-th element of the array corresponds to
+ * byte value n.
+ */
+typedef struct SuffixChar
+{
+    struct SuffixChar *nextChar;    /* node for the following byte, or NULL */
+    char       *replaceTo;          /* replacement bytes (no trailing NUL), or NULL */
+    int         replacelen;         /* number of bytes stored in replaceTo */
+} SuffixChar;
+
+/*
+ * placeChar - register one rule in the tree, consuming str byte by byte.
+ *
+ * node is the root of the (sub)tree to insert into, or NULL if it does not
+ * exist yet; str/lenstr is the byte sequence to be replaced and
+ * replaceTo/replacelen the bytes to substitute for it.  Missing nodes are
+ * allocated (zero-filled) on the way down.  If the sequence already has a
+ * replacement, a WARNING is emitted and the existing one is kept.
+ *
+ * Returns the root of the (sub)tree, which is newly allocated when node
+ * was NULL.
+ */
+static SuffixChar*
+placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
+{
+    SuffixChar **link = &node;      /* where the current level's node pointer lives */
+    SuffixChar *slot;
+
+    for (;;)
+    {
+        /* create this level's 256-entry node the first time it is needed */
+        if ( *link == NULL )
+        {
+            *link = palloc(sizeof(SuffixChar) * 256);
+            memset(*link, 0, sizeof(SuffixChar) * 256);
+        }
+
+        slot = *link + *str;
+        if ( lenstr == 1 )
+            break;                  /* slot belongs to the last byte of str */
+
+        /* descend: the next byte is looked up in this slot's child node */
+        link = &slot->nextChar;
+        str++;
+        lenstr--;
+    }
+
+    if ( slot->replaceTo )
+    {
+        elog(WARNING, "duplicate TO argument, use first one");
+        return node;
+    }
+
+    slot->replacelen = replacelen;
+    slot->replaceTo = palloc( replacelen );
+    memcpy(slot->replaceTo, replaceTo, replacelen);
+
+    return node;
+}
+
+/*
+ * initSuffixTree - create suffix tree from file. Function converts
+ * UTF8-encoded file into current encoding.
+ *
+ * filename is the rules-file name given in the dictionary options; it is
+ * resolved with get_tsearch_config_filename(filename, "rules").  Each line
+ * is expected to hold two whitespace-separated tokens: the byte sequence
+ * to replace and its replacement.  Lines that do not parse into exactly
+ * two tokens are ignored.  A line that raises
+ * ERRCODE_UNTRANSLATABLE_CHARACTER while being read is skipped and reading
+ * resumes with the following line; any other error is re-thrown.
+ *
+ * Returns the root of the tree, or NULL if no rule was loaded.
+ */
+static SuffixChar*
+initSuffixTree(char *filename)
+{
+    /*
+     * rootSuffixTree and skip are modified inside PG_TRY and read after a
+     * possible longjmp into PG_CATCH, so they must be volatile to keep
+     * well-defined values.
+     */
+    SuffixChar *volatile rootSuffixTree = NULL;
+    MemoryContext ccxt = CurrentMemoryContext;
+    tsearch_readline_state trst;
+    volatile bool skip;
+
+    filename = get_tsearch_config_filename(filename, "rules");
+    if (!tsearch_readline_begin(&trst, filename))
+        ereport(ERROR,
+                (errcode(ERRCODE_CONFIG_FILE_ERROR),
+                 errmsg("could not open unaccent file \"%s\": %m",
+                        filename)));
+
+    do
+    {
+        /* stays true only if this pass is cut short by a skippable error */
+        skip = true;
+
+        PG_TRY();
+        {
+            char src[4096];
+            char trg[4096];
+            char *line;
+
+            /*
+             * pg_do_encoding_conversion() (called by tsearch_readline())
+             * will emit exception if it finds untranslatable characters in
+             * current locale.  We just skip such lines and carry on with
+             * the next one.
+             */
+            while ((line = tsearch_readline(&trst)) != NULL)
+            {
+                /* field widths keep an overlong token from overrunning src/trg */
+                if ( sscanf(line, "%4095s\t%4095s\n", src, trg) == 2 )
+                    rootSuffixTree = placeChar(rootSuffixTree,
+                                               (unsigned char*)src, (int) strlen(src),
+                                               trg, (int) strlen(trg));
+
+                /* release the line whether or not it held a usable rule */
+                pfree(line);
+            }
+
+            /* reached end of file without error: we are done */
+            skip = false;
+        }
+        PG_CATCH();
+        {
+            ErrorData *errdata;
+            MemoryContext ecxt;
+
+            ecxt = MemoryContextSwitchTo(ccxt);
+            errdata = CopyErrorData();
+            if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
+            {
+                FlushErrorState();
+                FreeErrorData(errdata);
+            }
+            else
+            {
+                MemoryContextSwitchTo(ecxt);
+                PG_RE_THROW();
+            }
+        }
+        PG_END_TRY();
+    }
+    while(skip);
+
+    tsearch_readline_end(&trst);
+
+    return rootSuffixTree;
+}
+
+/*
+ * findReplaceTo - look up a multibyte character in the tree.
+ *
+ * Walks one tree level per byte of src.  Returns the entry reached by the
+ * last of the srclen bytes (its replaceTo may still be NULL), or NULL if
+ * the walk runs out of nodes before all bytes have been consumed.
+ */
+static SuffixChar *
+findReplaceTo( SuffixChar *node, unsigned char *src, int srclen )
+{
+    SuffixChar *entry;
+    int remaining;
+
+    for ( remaining = srclen; node != NULL; remaining-- )
+    {
+        entry = &node[*src++];
+        if ( remaining == 1 )
+            return entry;
+
+        node = entry->nextChar;
+    }
+
+    return NULL;
+}
+
+/*
+ * unaccent_init - dictionary init method.
+ *
+ * Argument 0 is the list of dictionary options (DefElem nodes).  The only
+ * accepted option is "Rules" (matched case-insensitively), which must be
+ * given exactly once; its value names the rules file loaded by
+ * initSuffixTree().  Returns the root of the resulting tree.
+ */
+PG_FUNCTION_INFO_V1(unaccent_init);
+Datum unaccent_init(PG_FUNCTION_ARGS);
+Datum
+unaccent_init(PG_FUNCTION_ARGS)
+{
+    List *dictoptions = (List *) PG_GETARG_POINTER(0);
+    SuffixChar *rootSuffixTree;
+    bool fileloaded = false;
+    ListCell *cell;
+
+    foreach(cell, dictoptions)
+    {
+        DefElem *opt = (DefElem *) lfirst(cell);
+
+        /* anything other than Rules is rejected outright */
+        if (pg_strcasecmp("Rules", opt->defname) != 0)
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("unrecognized Unaccent parameter: \"%s\"",
+                            opt->defname)));
+
+        /* Rules may be specified only once */
+        if (fileloaded)
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("multiple Rules parameters")));
+
+        rootSuffixTree = initSuffixTree(defGetString(opt));
+        fileloaded = true;
+    }
+
+    if (!fileloaded)
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("missing Rules parameter")));
+
+    PG_RETURN_POINTER(rootSuffixTree);
+}
+
+/*
+ * unaccent_lexize - dictionary lexize method.
+ *
+ * Argument 0 is the tree built by unaccent_init, argument 1 points to the
+ * input bytes and argument 2 is their length.  The input is scanned one
+ * multibyte character at a time; every character that has a replacement
+ * in the tree is substituted.
+ *
+ * Returns NULL when nothing was replaced.  Otherwise returns a two-element
+ * TSLexeme array (second element zeroed) whose first element carries the
+ * rewritten, NUL-terminated string and the TSL_FILTER flag.
+ */
+PG_FUNCTION_INFO_V1(unaccent_lexize);
+Datum unaccent_lexize(PG_FUNCTION_ARGS);
+Datum
+unaccent_lexize(PG_FUNCTION_ARGS)
+{
+    SuffixChar *rootSuffixTree = (SuffixChar*)PG_GETARG_POINTER(0);
+    char *srcchar = (char *) PG_GETARG_POINTER(1);
+    int32 len = PG_GETARG_INT32(2);
+    char *srcstart = srcchar;
+    char *trgbuf = NULL;            /* output buffer, allocated on first replacement */
+    int trgused = 0;                /* bytes written to trgbuf so far */
+    int trgsize = 0;                /* allocated size of trgbuf */
+    TSLexeme *res = NULL;
+
+    while( srcchar - srcstart < len )
+    {
+        int charlen = pg_mblen(srcchar);
+        SuffixChar *node = findReplaceTo( rootSuffixTree, (unsigned char *) srcchar, charlen );
+        const char *piece = NULL;   /* bytes to append for this character */
+        int piecelen = 0;
+
+        if ( node && node->replaceTo )
+        {
+            if ( !res )
+            {
+                int prefixlen = (int) (srcchar - srcstart);
+
+                /* allocate res only if it's needed */
+                res = palloc0(sizeof(TSLexeme) * 2);
+                res->flags = TSL_FILTER;
+
+                /* initial estimate only; the buffer is enlarged below when needed */
+                trgsize = len * pg_database_encoding_max_length() + 1 /* \0 */;
+                trgbuf = palloc( trgsize );
+
+                /* carry over the unchanged characters seen so far */
+                if ( prefixlen > 0 )
+                    memcpy(trgbuf, srcstart, prefixlen);
+                trgused = prefixlen;
+            }
+            piece = node->replaceTo;
+            piecelen = node->replacelen;
+        }
+        else if ( res )
+        {
+            piece = srcchar;
+            piecelen = charlen;
+        }
+
+        if ( piecelen > 0 )
+        {
+            /*
+             * A replacement may be longer than the character it replaces
+             * (and longer than the encoding's maximum character length),
+             * so make sure there is room for it plus the trailing \0.
+             */
+            if ( trgused + piecelen + 1 > trgsize )
+            {
+                int newsize = trgsize * 2;
+
+                if ( newsize < trgused + piecelen + 1 )
+                    newsize = trgused + piecelen + 1;
+                trgbuf = repalloc( trgbuf, newsize );
+                trgsize = newsize;
+            }
+            memcpy( trgbuf + trgused, piece, piecelen );
+            trgused += piecelen;
+        }
+
+        srcchar += charlen;
+    }
+
+    if ( res )
+    {
+        trgbuf[trgused] = '\0';
+        res->lexeme = trgbuf;
+    }
+
+    PG_RETURN_POINTER(res);
+}
+
+/*
+ * unaccent_dict - function-like wrapper for the dictionary.
+ *
+ * Called with one argument, the text is processed with the dictionary
+ * named "unaccent"; called with more, argument 0 is the dictionary OID and
+ * argument 1 the text.  The dictionary's lexize method is invoked on the
+ * text; if it yields a lexeme, that is returned as text, otherwise a copy
+ * of the input text is returned.
+ */
+PG_FUNCTION_INFO_V1(unaccent_dict);
+Datum unaccent_dict(PG_FUNCTION_ARGS);
+Datum
+unaccent_dict(PG_FUNCTION_ARGS)
+{
+    int strArg = (PG_NARGS() == 1) ? 0 : 1;     /* position of the text argument */
+    Oid dictOid;
+    text *str;
+    TSDictionaryCacheEntry *dict;
+    TSLexeme *res;
+
+    if (strArg == 0)
+        dictOid = TSDictionaryGetDictid(stringToQualifiedNameList("unaccent"), false);
+    else
+        dictOid = PG_GETARG_OID(0);
+
+    str = PG_GETARG_TEXT_P(strArg);
+    dict = lookup_ts_dictionary_cache(dictOid);
+
+    res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
+                                                     PointerGetDatum(dict->dictData),
+                                                     PointerGetDatum(VARDATA(str)),
+                                                     Int32GetDatum(VARSIZE(str) - VARHDRSZ),
+                                                     PointerGetDatum(NULL)));
+
+    PG_FREE_IF_COPY(str, strArg);
+
+    if ( res != NULL && res->lexeme != NULL )
+    {
+        /* the dictionary rewrote the string: hand back its version */
+        text *txt = cstring_to_text(res->lexeme);
+
+        pfree(res->lexeme);
+        pfree(res);
+
+        PG_RETURN_TEXT_P(txt);
+    }
+
+    /* nothing was changed: return a copy of the original argument */
+    if ( res != NULL )
+        pfree(res);
+
+    PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
+}