diff options
Diffstat (limited to 'contrib/tsearch2/wordparser/parser.c')
-rw-r--r-- | contrib/tsearch2/wordparser/parser.c | 134 |
1 files changed, 104 insertions, 30 deletions
diff --git a/contrib/tsearch2/wordparser/parser.c b/contrib/tsearch2/wordparser/parser.c
index fced41ec5e8..3706a0efb72 100644
--- a/contrib/tsearch2/wordparser/parser.c
+++ b/contrib/tsearch2/wordparser/parser.c
@@ -1,4 +1,4 @@
-/* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.11 2006/10/04 00:29:47 momjian Exp $ */
+/* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.12 2007/01/15 15:16:28 teodor Exp $ */
 
 #include "postgres.h"
 
@@ -40,16 +40,13 @@ TParserInit(char *str, int len)
 
 #ifdef TS_USE_WIDE
 
     /*
-     * Use wide char code only when max encoding length > 1 and ctype != C.
-     * Some operating systems fail with multi-byte encodings and a C locale.
-     * Also, for a C locale there is no need to process as multibyte. From
-     * backend/utils/adt/oracle_compat.c Teodor
+     * Use wide char code only when max encoding length > 1.
      */
-    if (prs->charmaxlen > 1 && !lc_ctype_is_c())
+    if (prs->charmaxlen > 1)
     {
         prs->usewide = true;
-        prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
+        prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
         prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
     }
     else
@@ -83,25 +80,99 @@ TParserClose(TParser * prs)
 
 /*
  * defining support function, equvalent is* macroses, but
- * working with any possible encodings and locales
+ * working with any possible encodings and locales. Note,
+ * that with multibyte encoding and C-locale isw* function may fail
+ * or give wrong result. Note 2: multibyte encoding and C-locale
+ * often are used for Asian languages.
  */
 
 #ifdef TS_USE_WIDE
 
-#define p_iswhat(type) \
-static int \
-p_is##type(TParser *prs) { \
-    Assert( prs->state ); \
-    return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
-        is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
-} \
- \
-static int \
-p_isnot##type(TParser *prs) { \
-    return !p_is##type(prs); \
+#define p_iswhat(type) \
+static int \
+p_is##type(TParser *prs) { \
+    Assert( prs->state ); \
+    if ( prs->usewide ) \
+    { \
+        if ( lc_ctype_is_c() ) \
+            return is##type( 0xff & *( prs->wstr + prs->state->poschar) ); \
+ \
+        return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) ); \
+    } \
+ \
+    return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
+} \
+ \
+static int \
+p_isnot##type(TParser *prs) { \
+    return !p_is##type(prs); \
 }
 
 
+static int
+p_isalnum(TParser *prs)
+{
+    Assert( prs->state );
+
+    if (prs->usewide)
+    {
+        if (lc_ctype_is_c())
+        {
+            unsigned int c = *(unsigned int*)(prs->wstr + prs->state->poschar);
+
+            /*
+             * any non-ascii symbol with multibyte encoding
+             * with C-locale is an alpha character
+             */
+            if ( c > 0x7f )
+                return 1;
+
+            return isalnum(0xff & c);
+        }
+
+        return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
+    }
+    return isalnum( *(unsigned char*)( prs->str + prs->state->posbyte ));
+}
+
+static int
+p_isnotalnum(TParser *prs)
+{
+    return !p_isalnum(prs);
+}
+
+static int
+p_isalpha(TParser *prs)
+{
+    Assert( prs->state );
+
+    if (prs->usewide)
+    {
+        if (lc_ctype_is_c())
+        {
+            unsigned int c = *(prs->wstr + prs->state->poschar);
+
+            /*
+             * any non-ascii symbol with multibyte encoding
+             * with C-locale is an alpha character
+             */
+            if ( c > 0x7f )
+                return 1;
+
+            return isalpha(0xff & c);
+        }
+
+        return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
+    }
+
+    return isalpha( *(unsigned char*)( prs->str + prs->state->posbyte ));
+}
+
+static int
+p_isnotalpha(TParser *prs)
+{
+    return !p_isalpha(prs);
+}
 
 /* p_iseq should be used only for ascii symbols */
 
@@ -111,18 +182,19 @@ p_iseq(TParser * prs, char c)
     Assert(prs->state);
     return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
 }
+
 #else /* TS_USE_WIDE */
 
-#define p_iswhat(type) \
-static int \
-p_is##type(TParser *prs) { \
-    Assert( prs->state ); \
-    return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
-} \
- \
-static int \
-p_isnot##type(TParser *prs) { \
-    return !p_is##type(prs); \
+#define p_iswhat(type) \
+static int \
+p_is##type(TParser *prs) { \
+    Assert( prs->state ); \
+    return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
+} \
+ \
+static int \
+p_isnot##type(TParser *prs) { \
+    return !p_is##type(prs); \
 }
 
 
@@ -132,10 +204,12 @@ p_iseq(TParser * prs, char c)
     Assert(prs->state);
     return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
 }
-#endif /* TS_USE_WIDE */
 
 p_iswhat(alnum)
 p_iswhat(alpha)
+
+#endif /* TS_USE_WIDE */
+
 p_iswhat(digit)
 p_iswhat(lower)
 p_iswhat(print)