about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/backend/tsearch/ts_locale.c | 185
-rw-r--r-- src/backend/tsearch/ts_utils.c | 4
-rw-r--r-- src/backend/tsearch/wparser_def.c | 6
-rw-r--r-- src/include/tsearch/ts_locale.h | 63
-rw-r--r-- src/include/tsearch/ts_public.h | 4
5 files changed, 150 insertions, 112 deletions
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c
index 361152e6bec..784cc17edd2 100644
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -1,13 +1,13 @@
/*-------------------------------------------------------------------------
*
* ts_locale.c
- * locale compatiblility layer for tsearch
+ * locale compatibility layer for tsearch
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.2 2007/08/25 00:03:59 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.3 2007/11/09 22:37:35 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -16,41 +16,56 @@
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
-#ifdef TS_USE_WIDE
-#ifdef WIN32
+#ifdef TS_USE_WIDE
+/*
+ * wchar2char --- convert wide characters to multibyte format
+ *
+ * This has the same API as the standard wcstombs() function; in particular,
+ * tolen is the maximum number of bytes to store at *to, and *from should be
+ * zero-terminated. The output will be zero-terminated iff there is room.
+ */
size_t
-wchar2char(char *to, const wchar_t *from, size_t len)
+wchar2char(char *to, const wchar_t *from, size_t tolen)
{
- if (len == 0)
+ if (tolen == 0)
return 0;
+#ifdef WIN32
if (GetDatabaseEncoding() == PG_UTF8)
{
int r;
- r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+ r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
NULL, NULL);
- if (r == 0)
- ereport(ERROR,
- (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
- errmsg("UTF-16 to UTF-8 translation failed: %lu",
- GetLastError())));
- Assert(r <= len);
+ if (r <= 0)
+ return (size_t) -1;
+
+ Assert(r <= tolen);
- return r;
+ /* Microsoft counts the zero terminator in the result */
+ return r-1;
}
+#endif /* WIN32 */
- return wcstombs(to, from, len);
+ return wcstombs(to, from, tolen);
}
-#endif /* WIN32 */
+/*
+ * char2wchar --- convert multibyte characters to wide characters
+ *
+ * This has almost the API of mbstowcs(), except that *from need not be
+ * null-terminated; instead, the number of input bytes is specified as
+ * fromlen. Also, we ereport() rather than returning -1 for invalid
+ * input encoding. tolen is the maximum number of wchar_t's to store at *to.
+ * The output will be zero-terminated iff there is room.
+ */
size_t
-char2wchar(wchar_t *to, const char *from, size_t len)
+char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen)
{
- if (len == 0)
+ if (tolen == 0)
return 0;
#ifdef WIN32
@@ -58,71 +73,117 @@ char2wchar(wchar_t *to, const char *from, size_t len)
{
int r;
- r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+ r = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen);
- if (!r)
+ if (r <= 0)
{
- pg_verifymbstr(from, len, false);
+ pg_verifymbstr(from, fromlen, false);
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("invalid multibyte character for locale"),
errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
}
- Assert(r <= len);
+ Assert(r <= tolen);
- return r;
+ /* NOTE(review): MultiByteToWideChar counts a zero terminator only when the input includes one; the fromlen bytes need not, so confirm r-1 is right */
+ return r-1;
}
- else
#endif /* WIN32 */
+
if (lc_ctype_is_c())
{
/*
* pg_mb2wchar_with_len always adds trailing '\0', so 'to' should be
* allocated with sufficient space
*/
- return pg_mb2wchar_with_len(from, (pg_wchar *) to, len);
+ return pg_mb2wchar_with_len(from, (pg_wchar *) to, fromlen);
}
else
{
/*
- * mbstowcs require ending '\0'
+ * mbstowcs requires a '\0'-terminated input, so convert a terminated copy
*/
- char *str = pnstrdup(from, len);
- size_t tolen;
+ char *str = pnstrdup(from, fromlen);
+ size_t result;
+
+ result = mbstowcs(to, str, tolen);
- tolen = mbstowcs(to, str, len);
pfree(str);
- return tolen;
+ if (result == (size_t) -1)
+ {
+ pg_verifymbstr(from, fromlen, false);
+ ereport(ERROR,
+ (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ errmsg("invalid multibyte character for locale"),
+ errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
+ }
+
+ if (result < tolen)
+ to[result] = 0;
+
+ return result;
}
}
+
int
-_t_isalpha(const char *ptr)
+t_isdigit(const char *ptr)
{
+ int clen = pg_mblen(ptr);
wchar_t character[2];
- if (lc_ctype_is_c())
+ if (clen == 1 || lc_ctype_is_c())
+ return isdigit(TOUCHAR(ptr));
+
+ char2wchar(character, 2, ptr, clen);
+
+ return iswdigit((wint_t) character[0]);
+}
+
+int
+t_isspace(const char *ptr)
+{
+ int clen = pg_mblen(ptr);
+ wchar_t character[2];
+
+ if (clen == 1 || lc_ctype_is_c())
+ return isspace(TOUCHAR(ptr));
+
+ char2wchar(character, 2, ptr, clen);
+
+ return iswspace((wint_t) character[0]);
+}
+
+int
+t_isalpha(const char *ptr)
+{
+ int clen = pg_mblen(ptr);
+ wchar_t character[2];
+
+ if (clen == 1 || lc_ctype_is_c())
return isalpha(TOUCHAR(ptr));
- char2wchar(character, ptr, 1);
+ char2wchar(character, 2, ptr, clen);
- return iswalpha((wint_t) *character);
+ return iswalpha((wint_t) character[0]);
}
int
-_t_isprint(const char *ptr)
+t_isprint(const char *ptr)
{
+ int clen = pg_mblen(ptr);
wchar_t character[2];
- if (lc_ctype_is_c())
+ if (clen == 1 || lc_ctype_is_c())
return isprint(TOUCHAR(ptr));
- char2wchar(character, ptr, 1);
+ char2wchar(character, 2, ptr, clen);
- return iswprint((wint_t) *character);
+ return iswprint((wint_t) character[0]);
}
+
#endif /* TS_USE_WIDE */
@@ -168,19 +229,27 @@ t_readline(FILE *fp)
return recoded;
}
+/*
+ * lowerstr --- fold null-terminated string to lower case
+ *
+ * Returned string is palloc'd
+ */
char *
-lowerstr(char *str)
+lowerstr(const char *str)
{
return lowerstr_with_len(str, strlen(str));
}
/*
+ * lowerstr_with_len --- fold string to lower case
+ *
+ * Input string need not be null-terminated.
+ *
* Returned string is palloc'd
*/
char *
-lowerstr_with_len(char *str, int len)
+lowerstr_with_len(const char *str, int len)
{
- char *ptr = str;
char *out;
if (len == 0)
@@ -202,23 +271,13 @@ lowerstr_with_len(char *str, int len)
/*
* alloc number of wchar_t for worst case, len contains number of
- * bytes <= number of characters and alloc 1 wchar_t for 0, because
- * wchar2char(wcstombs in really) wants zero-terminated string
+ * bytes >= number of characters; add 1 wchar_t for the terminating 0,
+ * because wchar2char wants a zero-terminated string
*/
wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
- /*
- * str SHOULD be cstring, so wlen contains number of converted
- * character
- */
- wlen = char2wchar(wstr, str, len);
- if (wlen < 0)
- ereport(ERROR,
- (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
- errmsg("translation failed from server encoding to wchar_t")));
-
+ wlen = char2wchar(wstr, len+1, str, len);
Assert(wlen <= len);
- wstr[wlen] = 0;
while (*wptr)
{
@@ -229,31 +288,29 @@ lowerstr_with_len(char *str, int len)
/*
* Alloc result string for worst case + '\0'
*/
- len = sizeof(char) * pg_database_encoding_max_length() *(wlen + 1);
+ len = pg_database_encoding_max_length() * wlen + 1;
out = (char *) palloc(len);
- /*
- * wlen now is number of bytes which is always >= number of characters
- */
wlen = wchar2char(out, wstr, len);
+
pfree(wstr);
if (wlen < 0)
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
- errmsg("translation failed from wchar_t to server encoding %d", errno)));
- Assert(wlen <= len);
- out[wlen] = '\0';
+ errmsg("translation from wchar_t to server encoding failed: %m")));
+ Assert(wlen < len);
}
else
-#endif
+#endif /* TS_USE_WIDE */
{
+ const char *ptr = str;
char *outptr;
outptr = out = (char *) palloc(sizeof(char) * (len + 1));
- while (*ptr && ptr - str < len)
+ while ((ptr - str) < len && *ptr)
{
- *outptr++ = tolower(*(unsigned char *) ptr);
+ *outptr++ = tolower(TOUCHAR(ptr));
ptr++;
}
*outptr = '\0';
diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c
index 781146886a3..6c989474202 100644
--- a/src/backend/tsearch/ts_utils.c
+++ b/src/backend/tsearch/ts_utils.c
@@ -7,7 +7,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.4 2007/09/04 02:16:56 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.5 2007/11/09 22:37:35 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -75,7 +75,7 @@ comparestr(const void *a, const void *b)
* or palloc a new version.
*/
void
-readstoplist(const char *fname, StopList *s, char *(*wordop) (char *))
+readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *))
{
char **stop = NULL;
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
index 086ac951558..b79056ca688 100644
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -7,7 +7,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.7 2007/10/27 19:03:45 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.8 2007/11/09 22:37:35 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -294,12 +294,12 @@ TParserInit(char *str, int len)
/*
* Use wide char code only when max encoding length > 1.
*/
-
if (prs->charmaxlen > 1)
{
prs->usewide = true;
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
- prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
+ prs->lenwstr = char2wchar(prs->wstr, prs->lenstr + 1,
+ prs->str, prs->lenstr);
}
else
#endif
diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h
index dcae2af93a4..cea3830a0f1 100644
--- a/src/include/tsearch/ts_locale.h
+++ b/src/include/tsearch/ts_locale.h
@@ -1,15 +1,14 @@
/*-------------------------------------------------------------------------
*
* ts_locale.h
- * helper utilities for tsearch
+ * locale compatibility layer for tsearch
*
* Copyright (c) 1998-2007, PostgreSQL Global Development Group
*
- * $PostgreSQL: pgsql/src/include/tsearch/ts_locale.h,v 1.2 2007/08/25 00:03:59 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/tsearch/ts_locale.h,v 1.3 2007/11/09 22:37:35 tgl Exp $
*
*-------------------------------------------------------------------------
*/
-
#ifndef __TSLOCALE_H__
#define __TSLOCALE_H__
@@ -34,55 +33,37 @@
#define TS_USE_WIDE
#endif
-#define TOUCHAR(x) (*((unsigned char*)(x)))
+#define TOUCHAR(x) (*((const unsigned char *) (x)))
#ifdef TS_USE_WIDE
-extern size_t char2wchar(wchar_t *to, const char *from, size_t len);
-
-#ifdef WIN32
-
-extern size_t wchar2char(char *to, const wchar_t *from, size_t len);
-#else /* WIN32 */
-
-/* correct wcstombs */
-#define wchar2char wcstombs
+extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen);
+extern size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen);
-#endif /* WIN32 */
+extern int t_isdigit(const char *ptr);
+extern int t_isspace(const char *ptr);
+extern int t_isalpha(const char *ptr);
+extern int t_isprint(const char *ptr);
-#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
-#define t_isspace(x) ( pg_mblen(x)==1 && isspace( TOUCHAR(x) ) )
-extern int _t_isalpha(const char *ptr);
+/* The second argument of t_iseq() must be a plain ASCII character */
+#define t_iseq(x,c) (TOUCHAR(x) == (unsigned char) (c))
-#define t_isalpha(x) ( (pg_mblen(x)==1) ? isalpha( TOUCHAR(x) ) : _t_isalpha(x) )
-extern int _t_isprint(const char *ptr);
+#define COPYCHAR(d,s) memcpy(d, s, pg_mblen(s))
-#define t_isprint(x) ( (pg_mblen(x)==1) ? isprint( TOUCHAR(x) ) : _t_isprint(x) )
-/*
- * t_iseq() should be called only for ASCII symbols
- */
-#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+#else /* not TS_USE_WIDE */
-#define COPYCHAR(d,s) do { \
- int lll = pg_mblen( s ); \
- \
- while( lll-- ) \
- TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
-} while(0)
+#define t_isdigit(x) isdigit(TOUCHAR(x))
+#define t_isspace(x) isspace(TOUCHAR(x))
+#define t_isalpha(x) isalpha(TOUCHAR(x))
+#define t_isprint(x) isprint(TOUCHAR(x))
+#define t_iseq(x,c) (TOUCHAR(x) == (unsigned char) (c))
-#else /* not def TS_USE_WIDE */
+#define COPYCHAR(d,s) (*((unsigned char *) (d)) = TOUCHAR(s))
-#define t_isdigit(x) isdigit( TOUCHAR(x) )
-#define t_isspace(x) isspace( TOUCHAR(x) )
-#define t_isalpha(x) isalpha( TOUCHAR(x) )
-#define t_isprint(x) isprint( TOUCHAR(x) )
-#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)) )
-
-#define COPYCHAR(d,s) TOUCHAR(d) = TOUCHAR(s)
-#endif
+#endif /* TS_USE_WIDE */
-extern char *lowerstr(char *str);
-extern char *lowerstr_with_len(char *str, int len);
+extern char *lowerstr(const char *str);
+extern char *lowerstr_with_len(const char *str, int len);
extern char *t_readline(FILE *fp);
#endif /* __TSLOCALE_H__ */
diff --git a/src/include/tsearch/ts_public.h b/src/include/tsearch/ts_public.h
index ab19de7924f..92736c4e1bc 100644
--- a/src/include/tsearch/ts_public.h
+++ b/src/include/tsearch/ts_public.h
@@ -6,7 +6,7 @@
*
* Copyright (c) 1998-2007, PostgreSQL Global Development Group
*
- * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.4 2007/09/07 15:09:56 teodor Exp $
+ * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.5 2007/11/09 22:37:35 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -74,7 +74,7 @@ typedef struct
} StopList;
extern void readstoplist(const char *fname, StopList *s,
- char *(*wordop) (char *));
+ char *(*wordop) (const char *));
extern bool searchstoplist(StopList *s, char *key);
/*