diff options
Diffstat (limited to 'contrib/unaccent/unaccent.c')
-rw-r--r-- | contrib/unaccent/unaccent.c | 318 |
1 files changed, 318 insertions, 0 deletions
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c new file mode 100644 index 00000000000..7b5086b9587 --- /dev/null +++ b/contrib/unaccent/unaccent.c @@ -0,0 +1,318 @@ +/*------------------------------------------------------------------------- + * + * unaccent.c + * Text search unaccent dictionary + * + * Copyright (c) 2009, PostgreSQL Global Development Group + * + * IDENTIFICATION + * $PostgreSQL: pgsql/contrib/unaccent/unaccent.c,v 1.1 2009/08/18 10:34:39 teodor Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" +#include "catalog/namespace.h" +#include "commands/defrem.h" +#include "mb/pg_wchar.h" +#include "tsearch/ts_cache.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_public.h" +#include "utils/builtins.h" + +PG_MODULE_MAGIC; + +/* + * Unaccent dictionary uses uncompressed suffix tree to find a + * character to replace. Each node of tree is an array of + * SuffixChar struct with length = 256 (n-th element of array + * corresponds to byte) + */ +typedef struct SuffixChar { + struct SuffixChar *nextChar; + char *replaceTo; + int replacelen; +} SuffixChar; + +/* + * placeChar - put str into tree's structure, byte by byte. + */ +static SuffixChar* +placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen) +{ + SuffixChar *curnode; + + if ( !node ) + { + node = palloc(sizeof(SuffixChar) * 256); + memset(node, 0, sizeof(SuffixChar) * 256); + } + + curnode = node + *str; + + if ( lenstr == 1 ) + { + if ( curnode->replaceTo ) + elog(WARNING, "duplicate TO argument, use first one"); + else + { + curnode->replacelen = replacelen; + curnode->replaceTo = palloc( replacelen ); + memcpy(curnode->replaceTo, replaceTo, replacelen); + } + } + else + { + curnode->nextChar = placeChar( curnode->nextChar, str+1, lenstr-1, replaceTo, replacelen); + } + + return node; +} + +/* + * initSuffixTree - create suffix tree from file. Function converts + * UTF8-encoded file into current encoding. + */ +static SuffixChar* +initSuffixTree(char *filename) +{ + SuffixChar *rootSuffixTree = NULL; + MemoryContext ccxt = CurrentMemoryContext; + tsearch_readline_state trst; + bool skip; + + filename = get_tsearch_config_filename(filename, "rules"); + if (!tsearch_readline_begin(&trst, filename)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("could not open unaccent file \"%s\": %m", + filename))); + + do + { + char src[4096]; + char trg[4096]; + int srclen; + int trglen; + char *line = NULL; + + skip = true; + + PG_TRY(); + { + /* + * pg_do_encoding_conversion() (called by tsearch_readline()) + * will emit exception if it finds untranslatable characters in current locale. + * We just skip such characters. + */ + while ((line = tsearch_readline(&trst)) != NULL) + { + if ( sscanf(line, "%s\t%s\n", src, trg)!=2 ) + continue; + + srclen = strlen(src); + trglen = strlen(trg); + + rootSuffixTree = placeChar(rootSuffixTree, + (unsigned char*)src, srclen, + trg, trglen); + skip = false; + pfree(line); + } + } + PG_CATCH(); + { + ErrorData *errdata; + MemoryContext ecxt; + + ecxt = MemoryContextSwitchTo(ccxt); + errdata = CopyErrorData(); + if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER) + { + FlushErrorState(); + } + else + { + MemoryContextSwitchTo(ecxt); + PG_RE_THROW(); + } + } + PG_END_TRY(); + } + while(skip); + + tsearch_readline_end(&trst); + + return rootSuffixTree; +} + +/* + * findReplaceTo - find multibyte character in tree + */ +static SuffixChar * +findReplaceTo( SuffixChar *node, unsigned char *src, int srclen ) +{ + while( node ) + { + node = node + *src; + if ( srclen == 1 ) + return node; + + src++; + srclen--; + node = node->nextChar; + } + + return NULL; +} + +PG_FUNCTION_INFO_V1(unaccent_init); +Datum unaccent_init(PG_FUNCTION_ARGS); +Datum +unaccent_init(PG_FUNCTION_ARGS) +{ + List *dictoptions = (List *) PG_GETARG_POINTER(0); + SuffixChar *rootSuffixTree; + bool fileloaded = false; + ListCell *l; + + foreach(l, dictoptions) + { + DefElem *defel = (DefElem *) lfirst(l); + + if (pg_strcasecmp("Rules", defel->defname) == 0) + { + if (fileloaded) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("multiple Rules parameters"))); + rootSuffixTree = initSuffixTree(defGetString(defel)); + fileloaded = true; + } + else + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized Unaccent parameter: \"%s\"", + defel->defname))); + } + } + + if (!fileloaded) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("missing Rules parameter"))); + } + + PG_RETURN_POINTER(rootSuffixTree); +} + +PG_FUNCTION_INFO_V1(unaccent_lexize); +Datum unaccent_lexize(PG_FUNCTION_ARGS); +Datum +unaccent_lexize(PG_FUNCTION_ARGS) +{ + SuffixChar *rootSuffixTree = (SuffixChar*)PG_GETARG_POINTER(0); + char *srcchar = (char *) PG_GETARG_POINTER(1); + int32 len = PG_GETARG_INT32(2); + char *srcstart, *trgchar; + int charlen; + TSLexeme *res = NULL; + SuffixChar *node; + + srcstart = srcchar; + while( srcchar - srcstart < len ) + { + charlen = pg_mblen(srcchar); + + node = findReplaceTo( rootSuffixTree, (unsigned char *) srcchar, charlen ); + if ( node && node->replaceTo ) + { + if ( !res ) + { + /* allocate res only it it's needed */ + res = palloc0(sizeof(TSLexeme) * 2); + res->lexeme = trgchar = palloc( len * pg_database_encoding_max_length() + 1 /* \0 */ ); + res->flags = TSL_FILTER; + if ( srcchar != srcstart ) + { + memcpy(trgchar, srcstart, srcchar - srcstart); + trgchar += (srcchar - srcstart); + } + } + memcpy( trgchar, node->replaceTo, node->replacelen ); + trgchar += node->replacelen; + } + else if ( res ) + { + memcpy( trgchar, srcchar, charlen ); + trgchar += charlen; + } + + srcchar += charlen; + } + + if ( res ) + *trgchar = '\0'; + + PG_RETURN_POINTER(res); +} + +/* + * Function-like wrapper for dictionary + */ +PG_FUNCTION_INFO_V1(unaccent_dict); +Datum unaccent_dict(PG_FUNCTION_ARGS); +Datum +unaccent_dict(PG_FUNCTION_ARGS) +{ + text *str; + int strArg; + Oid dictOid; + TSDictionaryCacheEntry *dict; + TSLexeme *res; + + if (PG_NARGS() == 1) + { + dictOid = TSDictionaryGetDictid(stringToQualifiedNameList("unaccent"), false); + strArg = 0; + } + else + { + dictOid = PG_GETARG_OID(0); + strArg = 1; + } + str = PG_GETARG_TEXT_P(strArg); + + dict = lookup_ts_dictionary_cache(dictOid); + + res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize), + PointerGetDatum(dict->dictData), + PointerGetDatum(VARDATA(str)), + Int32GetDatum(VARSIZE(str) - VARHDRSZ), + PointerGetDatum(NULL))); + + PG_FREE_IF_COPY(str, strArg); + + if ( res == NULL ) + { + PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg)); + } + else if ( res->lexeme == NULL ) + { + pfree(res); + PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg)); + } + else + { + text *txt = cstring_to_text(res->lexeme); + + pfree(res->lexeme); + pfree(res); + + PG_RETURN_TEXT_P(txt); + } +} |