diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/snowball/Makefile | 15 | ||||
-rw-r--r-- | src/backend/snowball/snowball.sql.in | 13 | ||||
-rw-r--r-- | src/backend/tsearch/wparser_def.c | 431 | ||||
-rw-r--r-- | src/include/catalog/catversion.h | 4 | ||||
-rw-r--r-- | src/test/regress/expected/tsdicts.out | 10 | ||||
-rw-r--r-- | src/test/regress/expected/tsearch.out | 50 | ||||
-rw-r--r-- | src/test/regress/sql/tsdicts.sql | 11 | ||||
-rw-r--r-- | src/tools/msvc/Install.pm | 8 |
8 files changed, 273 insertions, 269 deletions
diff --git a/src/backend/snowball/Makefile b/src/backend/snowball/Makefile index e09d332e787..ba0c60db28a 100644 --- a/src/backend/snowball/Makefile +++ b/src/backend/snowball/Makefile @@ -2,7 +2,7 @@ # # Makefile for src/backend/snowball # -# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.3 2007/08/27 10:29:49 mha Exp $ +# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.4 2007/10/23 20:46:12 tgl Exp $ # #------------------------------------------------------------------------- @@ -46,8 +46,9 @@ OBJS= dict_snowball.o api.o utilities.o \ stem_UTF_8_swedish.o \ stem_UTF_8_turkish.o -# second column is name of latin dictionary, if different -# Note order dependency: use of some other language as latin dictionary +# first column is language name and also name of dictionary for not-all-ASCII +# words, second is name of dictionary for all-ASCII words +# Note order dependency: use of some other language as ASCII dictionary # must come after creation of that language LANGUAGES= \ danish danish \ @@ -95,8 +96,8 @@ ifeq ($(enable_shared), yes) while [ "$$#" -gt 0 ] ; \ do \ lang=$$1; shift; \ - nonlatdictname=$$lang; \ - latdictname=$$1; shift; \ + nonascdictname=$$lang; \ + ascdictname=$$1; shift; \ if [ -s $(srcdir)/stopwords/$${lang}.stop ] ; then \ stop=", StopWords=$${lang}" ; \ else \ @@ -106,8 +107,8 @@ ifeq ($(enable_shared), yes) sed -e "s#_LANGNAME_#$$lang#g" | \ sed -e "s#_DICTNAME_#$${lang}_stem#g" | \ sed -e "s#_CFGNAME_#$$lang#g" | \ - sed -e "s#_LATDICTNAME_#$${latdictname}_stem#g" | \ - sed -e "s#_NONLATDICTNAME_#$${nonlatdictname}_stem#g" | \ + sed -e "s#_ASCDICTNAME_#$${ascdictname}_stem#g" | \ + sed -e "s#_NONASCDICTNAME_#$${nonascdictname}_stem#g" | \ sed -e "s#_STOPWORDS_#$$stop#g" ; \ done >> $@ else diff --git a/src/backend/snowball/snowball.sql.in b/src/backend/snowball/snowball.sql.in index 8b6328a0835..7a32c85edb2 100644 --- a/src/backend/snowball/snowball.sql.in +++ b/src/backend/snowball/snowball.sql.in @@ -1,4 +1,4 @@ --- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.4 2007/09/03 02:30:43 tgl Exp $$ +-- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.5 2007/10/23 20:46:12 tgl Exp $$ -- text search configuration for _LANGNAME_ language CREATE TEXT SEARCH DICTIONARY _DICTNAME_ @@ -12,14 +12,15 @@ CREATE TEXT SEARCH CONFIGURATION _CFGNAME_ COMMENT ON TEXT SEARCH CONFIGURATION _CFGNAME_ IS 'configuration for _LANGNAME_ language'; ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING - FOR email, url, host, sfloat, version, uri, file, float, int, uint + FOR email, url, host, sfloat, version, uri, file, float, int, uint, + numword, hword_numpart, numhword WITH simple; ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING - FOR lhword, lpart_hword, lword - WITH _LATDICTNAME_; + FOR asciiword, hword_asciipart, asciihword + WITH _ASCDICTNAME_; ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING - FOR hword, nlhword, nlpart_hword, nlword, word, part_hword - WITH _NONLATDICTNAME_; + FOR word, hword_part, hword + WITH _NONASCDICTNAME_; diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 5f65cbc9fb2..e6df88d9c76 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -1,13 +1,13 @@ /*------------------------------------------------------------------------- * * wparser_def.c - * Standard word parser + * Default text search parser * * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.3 2007/09/07 15:09:55 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.4 2007/10/23 20:46:12 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -22,79 +22,53 @@ #include "utils/builtins.h" -/* rememder !!!! */ -#define LASTNUM 23 +/* Output token categories */ -#define LATWORD 1 -#define CYRWORD 2 -#define UWORD 3 -#define EMAIL 4 -#define FURL 5 -#define HOST 6 -#define SCIENTIFIC 7 +#define ASCIIWORD 1 +#define WORD_T 2 +#define NUMWORD 3 +#define EMAIL 4 +#define URL_T 5 +#define HOST 6 +#define SCIENTIFIC 7 #define VERSIONNUMBER 8 -#define PARTHYPHENWORD 9 -#define CYRPARTHYPHENWORD 10 -#define LATPARTHYPHENWORD 11 -#define SPACE 12 -#define TAG 13 +#define NUMPARTHWORD 9 +#define PARTHWORD 10 +#define ASCIIPARTHWORD 11 +#define SPACE 12 +#define TAG_T 13 #define PROTOCOL 14 -#define HYPHENWORD 15 -#define LATHYPHENWORD 16 -#define CYRHYPHENWORD 17 -#define URI 18 -#define FILEPATH 19 -#define DECIMAL 20 -#define SIGNEDINT 21 -#define UNSIGNEDINT 22 -#define HTMLENTITY 23 - -static const char *lex_descr[] = { +#define NUMHWORD 15 +#define ASCIIHWORD 16 +#define HWORD 17 +#define URI 18 +#define FILEPATH 19 +#define DECIMAL 20 +#define SIGNEDINT 21 +#define UNSIGNEDINT 22 +#define HTMLENTITY 23 + +#define LASTNUM 23 + +static const char * const tok_alias[] = { "", - "Latin word", - "Non-latin word", - "Word", - "Email", - "URL", - "Host", - "Scientific notation", - "VERSION", - "Part of hyphenated word", - "Non-latin part of hyphenated word", - "Latin part of hyphenated word", - "Space symbols", - "HTML Tag", - "Protocol head", - "Hyphenated word", - "Latin hyphenated word", - "Non-latin hyphenated word", - "URI", - "File or path name", - "Decimal notation", - "Signed integer", - "Unsigned integer", - "HTML Entity" -}; - -static const char *tok_alias[] = { - "", - "lword", - "nlword", + "asciiword", "word", + "numword", "email", "url", "host", "sfloat", "version", - "part_hword", - "nlpart_hword", - "lpart_hword", + "hword_numpart", + "hword_part", + "hword_asciipart", "blank", "tag", "protocol", + "numhword", + "asciihword", "hword", - "lhword", - "nlhword", "uri", "file", "float", @@ -103,12 +77,42 @@ static const char *tok_alias[] = { "entity" }; +static const char * const lex_descr[] = { + "", + "Word, all ASCII", + "Word, all letters", + "Word, letters and digits", + "Email address", + "URL", + "Host", + "Scientific notation", + "Version number", + "Hyphenated word part, letters and digits", + "Hyphenated word part, all letters", + "Hyphenated word part, all ASCII", + "Space symbols", + "HTML tag", + "Protocol head", + "Hyphenated word, letters and digits", + "Hyphenated word, all ASCII", + "Hyphenated word, all letters", + "URI", + "File or path name", + "Decimal notation", + "Signed integer", + "Unsigned integer", + "HTML entity" +}; + + +/* Parser states */ + typedef enum { TPS_Base = 0, - TPS_InUWord, - TPS_InLatWord, - TPS_InCyrWord, + TPS_InNumWord, + TPS_InAsciiWord, + TPS_InWord, TPS_InUnsignedInt, TPS_InSignedIntFirst, TPS_InSignedInt, @@ -167,20 +171,20 @@ typedef enum TPS_InProtocolFirst, TPS_InProtocolSecond, TPS_InProtocolEnd, - TPS_InHyphenLatWordFirst, - TPS_InHyphenLatWord, - TPS_InHyphenCyrWordFirst, - TPS_InHyphenCyrWord, - TPS_InHyphenUWordFirst, - TPS_InHyphenUWord, + TPS_InHyphenAsciiWordFirst, + TPS_InHyphenAsciiWord, + TPS_InHyphenWordFirst, + TPS_InHyphenWord, + TPS_InHyphenNumWordFirst, + TPS_InHyphenNumWord, TPS_InHyphenValueFirst, TPS_InHyphenValue, TPS_InHyphenValueExact, TPS_InParseHyphen, TPS_InParseHyphenHyphen, - TPS_InHyphenCyrWordPart, - TPS_InHyphenLatWordPart, - TPS_InHyphenUWordPart, + TPS_InHyphenWordPart, + TPS_InHyphenAsciiWordPart, + TPS_InHyphenNumWordPart, TPS_InHyphenUnsignedInt, TPS_InHDecimalPartFirst, TPS_InHDecimalPart, @@ -192,7 +196,6 @@ typedef enum /* forward declaration */ struct TParser; - typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions * except p_iseq */ typedef void (*TParserSpecial) (struct TParser *); /* special handler for @@ -208,6 +211,16 @@ typedef struct TParserSpecial special; } TParserStateActionItem; +/* Flag bits in TParserStateActionItem.flags */ +#define A_NEXT 0x0000 +#define A_BINGO 0x0001 +#define A_POP 0x0002 +#define A_PUSH 0x0004 +#define A_RERUN 0x0008 +#define A_CLEAR 0x0010 +#define A_MERGE 0x0020 +#define A_CLRALL 0x0040 + typedef struct { TParserState state; @@ -255,6 +268,11 @@ typedef struct TParser } TParser; + +/* forward decls here */ +static bool TParserGet(TParser * prs); + + static TParserPosition * newTParserPosition(TParserPosition * prev) { @@ -303,8 +321,6 @@ TParserInit(char *str, int len) return prs; } -static bool TParserGet(TParser * prs); - static void TParserClose(TParser * prs) { @@ -325,10 +341,10 @@ TParserClose(TParser * prs) } /* - * defining support function, equvalent is* macroses, but + * Character-type support functions, equivalent to is* macros, but * working with any possible encodings and locales. Note, * that with multibyte encoding and C-locale isw* function may fail - * or give wrong result. Note 2: multibyte encoding and C-local + * or give wrong result. Note 2: multibyte encoding and C-locale * often are used for Asian languages */ @@ -487,17 +503,13 @@ p_isascii(TParser * prs) } static int -p_islatin(TParser * prs) +p_isasclet(TParser * prs) { - return (p_isalpha(prs) && p_isascii(prs)) ? 1 : 0; + return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0; } -static int -p_isnonlatin(TParser * prs) -{ - return (p_isalpha(prs) && !p_isascii(prs)) ? 1 : 0; -} +/* deliberately suppress unused-function complaints for the above */ void _make_compiler_happy(void); void _make_compiler_happy(void) @@ -638,21 +650,12 @@ p_isURI(TParser * prs) * Table of state/action of parser */ -#define A_NEXT 0x0000 -#define A_BINGO 0x0001 -#define A_POP 0x0002 -#define A_PUSH 0x0004 -#define A_RERUN 0x0008 -#define A_CLEAR 0x0010 -#define A_MERGE 0x0020 -#define A_CLRALL 0x0040 - static TParserStateActionItem actionTPS_Base[] = { {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL}, {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL}, {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InLatWord, 0, NULL}, - {p_isnonlatin, 0, A_NEXT, TPS_InCyrWord, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL}, + {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL}, {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL}, @@ -664,37 +667,38 @@ static TParserStateActionItem actionTPS_Base[] = { }; -static TParserStateActionItem actionTPS_InUWord[] = { - {p_isEOF, 0, A_BINGO, TPS_Base, UWORD, NULL}, - {p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL}, +static TParserStateActionItem actionTPS_InNumWord[] = { + {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL}, + {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL}, {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL}, - {p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL}, - {NULL, 0, A_BINGO, TPS_Base, UWORD, NULL} + {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL}, + {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL} }; -static TParserStateActionItem actionTPS_InLatWord[] = { - {p_isEOF, 0, A_BINGO, TPS_Base, LATWORD, NULL}, - {p_islatin, 0, A_NEXT, TPS_Null, 0, NULL}, +static TParserStateActionItem actionTPS_InAsciiWord[] = { + {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}, + {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL}, - {p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst, 0, NULL}, + {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL}, {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL}, {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL}, - {p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL}, - {NULL, 0, A_BINGO, TPS_Base, LATWORD, NULL} + {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL}, + {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL}, + {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL} }; -static TParserStateActionItem actionTPS_InCyrWord[] = { - {p_isEOF, 0, A_BINGO, TPS_Base, CYRWORD, NULL}, - {p_isnonlatin, 0, A_NEXT, TPS_Null, 0, NULL}, - {p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL}, - {p_iseqC, '-', A_PUSH, TPS_InHyphenCyrWordFirst, 0, NULL}, - {NULL, 0, A_BINGO, TPS_Base, CYRWORD, NULL} +static TParserStateActionItem actionTPS_InWord[] = { + {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL}, + {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL}, + {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL}, + {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL} }; static TParserStateActionItem actionTPS_InUnsignedInt[] = { @@ -704,8 +708,8 @@ static TParserStateActionItem actionTPS_InUnsignedInt[] = { {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL}, {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL}, {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL}, - {p_islatin, 0, A_PUSH, TPS_InHost, 0, NULL}, - {p_isalpha, 0, A_NEXT, TPS_InUWord, 0, NULL}, + {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL}, + {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL} }; @@ -816,13 +820,13 @@ static TParserStateActionItem actionTPS_InMantissa[] = { static TParserStateActionItem actionTPS_InHTMLEntityFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_iseqC, '#', A_NEXT, TPS_InHTMLEntityNumFirst, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; static TParserStateActionItem actionTPS_InHTMLEntity[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL}, {p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; @@ -849,7 +853,7 @@ static TParserStateActionItem actionTPS_InTagFirst[] = { {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL}, {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL}, {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL}, - {p_islatin, 0, A_PUSH, TPS_InTagName, 0, NULL}, + {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; @@ -863,7 +867,7 @@ static TParserStateActionItem actionTPS_InXMLBegin[] = { static TParserStateActionItem actionTPS_InTagCloseFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InTagName, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; @@ -873,7 +877,7 @@ static TParserStateActionItem actionTPS_InTagName[] = { {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL}, {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags}, {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags}, - {p_islatin, 0, A_NEXT, TPS_Null, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; @@ -888,7 +892,7 @@ static TParserStateActionItem actionTPS_InTag[] = { {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags}, {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL}, {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_Null, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL}, {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL}, {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL}, @@ -924,7 +928,7 @@ static TParserStateActionItem actionTPS_InTagBackSleshed[] = { }; static TParserStateActionItem actionTPS_InTagEnd[] = { - {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG, NULL} + {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL} }; static TParserStateActionItem actionTPS_InCommentFirst[] = { @@ -962,19 +966,19 @@ static TParserStateActionItem actionTPS_InCloseCommentLast[] = { }; static TParserStateActionItem actionTPS_InCommentEnd[] = { - {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG, NULL} + {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL} }; static TParserStateActionItem actionTPS_InHostFirstDomain[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; static TParserStateActionItem actionTPS_InHostDomainSecond[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL}, {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL}, @@ -984,7 +988,7 @@ static TParserStateActionItem actionTPS_InHostDomainSecond[] = { static TParserStateActionItem actionTPS_InHostDomain[] = { {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}, - {p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL}, {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL}, {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL}, @@ -1013,14 +1017,14 @@ static TParserStateActionItem actionTPS_InPort[] = { static TParserStateActionItem actionTPS_InHostFirstAN[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; static TParserStateActionItem actionTPS_InHost[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL}, {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL}, @@ -1034,7 +1038,7 @@ static TParserStateActionItem actionTPS_InEmail[] = { static TParserStateActionItem actionTPS_InFileFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL}, {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL}, @@ -1045,7 +1049,7 @@ static TParserStateActionItem actionTPS_InFileFirst[] = { static TParserStateActionItem actionTPS_InFileTwiddle[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL}, @@ -1054,7 +1058,7 @@ static TParserStateActionItem actionTPS_InFileTwiddle[] = { static TParserStateActionItem actionTPS_InPathFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL}, @@ -1079,7 +1083,7 @@ static TParserStateActionItem actionTPS_InPathSecond[] = { static TParserStateActionItem actionTPS_InFile[] = { {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL}, - {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL}, {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL}, @@ -1091,7 +1095,7 @@ static TParserStateActionItem actionTPS_InFile[] = { static TParserStateActionItem actionTPS_InFileNext[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_islatin, 0, A_CLEAR, TPS_InFile, 0, NULL}, + {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL}, {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL}, {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} @@ -1119,7 +1123,7 @@ static TParserStateActionItem actionTPS_InURI[] = { static TParserStateActionItem actionTPS_InFURL[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_isURI, 0, A_BINGO | A_CLRALL, TPS_Base, FURL, SpecialFURL}, + {p_isURI, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; @@ -1139,54 +1143,52 @@ static TParserStateActionItem actionTPS_InProtocolEnd[] = { {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL} }; -static TParserStateActionItem actionTPS_InHyphenLatWordFirst[] = { +static TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InHyphenLatWord, 0, NULL}, - {p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL}, + {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static TParserStateActionItem actionTPS_InHyphenLatWord[] = { - {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, LATHYPHENWORD, SpecialHyphen}, - {p_islatin, 0, A_NEXT, TPS_InHyphenLatWord, 0, NULL}, - {p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, - {p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst, 0, NULL}, - {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, LATHYPHENWORD, SpecialHyphen} +static TParserStateActionItem actionTPS_InHyphenAsciiWord[] = { + {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}, + {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL}, + {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL}, + {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL}, + {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen} }; -static TParserStateActionItem actionTPS_InHyphenCyrWordFirst[] = { +static TParserStateActionItem actionTPS_InHyphenWordFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWord, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, + {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static TParserStateActionItem actionTPS_InHyphenCyrWord[] = { - {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, CYRHYPHENWORD, SpecialHyphen}, - {p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWord, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, - {p_iseqC, '-', A_PUSH, TPS_InHyphenCyrWordFirst, 0, NULL}, - {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, CYRHYPHENWORD, SpecialHyphen} +static TParserStateActionItem actionTPS_InHyphenWord[] = { + {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}, + {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL}, + {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL}, + {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen} }; -static TParserStateActionItem actionTPS_InHyphenUWordFirst[] = { +static TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL}, - {p_isalnum, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, + {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static TParserStateActionItem actionTPS_InHyphenUWord[] = { - {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}, - {p_isalnum, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, - {p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL}, - {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen} +static TParserStateActionItem actionTPS_InHyphenNumWord[] = { + {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}, + {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL}, + {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL}, + {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen} }; static TParserStateActionItem actionTPS_InHyphenValueFirst[] = { @@ -1196,26 +1198,26 @@ static TParserStateActionItem actionTPS_InHyphenValueFirst[] = { }; static TParserStateActionItem actionTPS_InHyphenValue[] = { - {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}, + {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}, {p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL}, - {p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL}, - {p_isalpha, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, - {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen} + {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL}, + {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL}, + {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen} }; static TParserStateActionItem actionTPS_InHyphenValueExact[] = { - {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}, + {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}, {p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL}, - {p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL}, - {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen} + {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL}, + {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen} }; static TParserStateActionItem actionTPS_InParseHyphen[] = { {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InHyphenLatWordPart, 0, NULL}, - {p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWordPart, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL}, + {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL}, {NULL, 0, A_RERUN, TPS_Base, 0, NULL} @@ -1227,32 +1229,31 @@ static TParserStateActionItem actionTPS_InParseHyphenHyphen[] = { {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static TParserStateActionItem actionTPS_InHyphenCyrWordPart[] = { - {p_isEOF, 0, A_BINGO, TPS_Base, CYRPARTHYPHENWORD, NULL}, - {p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWordPart, 0, NULL}, - {p_islatin, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL}, - {NULL, 0, A_BINGO, TPS_InParseHyphen, CYRPARTHYPHENWORD, NULL} +static TParserStateActionItem actionTPS_InHyphenWordPart[] = { + {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL}, + {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL}, + {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL} }; -static TParserStateActionItem actionTPS_InHyphenLatWordPart[] = { - {p_isEOF, 0, A_BINGO, TPS_Base, LATPARTHYPHENWORD, NULL}, - {p_islatin, 0, A_NEXT, TPS_InHyphenLatWordPart, 0, NULL}, - {p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL}, - {NULL, 0, A_BINGO, TPS_InParseHyphen, LATPARTHYPHENWORD, NULL} +static TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = { + {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL}, + {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL}, + {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL} }; -static TParserStateActionItem actionTPS_InHyphenUWordPart[] = { - {p_isEOF, 0, A_BINGO, TPS_Base, PARTHYPHENWORD, NULL}, - {p_isalnum, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL}, - {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHYPHENWORD, NULL} +static TParserStateActionItem actionTPS_InHyphenNumWordPart[] = { + {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL}, + {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL}, + {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL} }; static TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = { {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL}, - {p_isalpha, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL}, + {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InHDecimalPartFirst, 0, NULL}, {NULL, 0, A_BINGO, TPS_InParseHyphen, UNSIGNEDINT, NULL} }; @@ -1284,14 +1285,14 @@ static TParserStateActionItem actionTPS_InHVersionPart[] = { }; /* - * order should be the same as in typedef enum {} TParserState!! + * order must be the same as in typedef enum {} TParserState!! */ static const TParserStateAction Actions[] = { {TPS_Base, actionTPS_Base}, - {TPS_InUWord, actionTPS_InUWord}, - {TPS_InLatWord, actionTPS_InLatWord}, - {TPS_InCyrWord, actionTPS_InCyrWord}, + {TPS_InNumWord, actionTPS_InNumWord}, + {TPS_InAsciiWord, actionTPS_InAsciiWord}, + {TPS_InWord, actionTPS_InWord}, {TPS_InUnsignedInt, actionTPS_InUnsignedInt}, {TPS_InSignedIntFirst, actionTPS_InSignedIntFirst}, {TPS_InSignedInt, actionTPS_InSignedInt}, @@ -1350,20 +1351,20 @@ static const TParserStateAction Actions[] = { {TPS_InProtocolFirst, actionTPS_InProtocolFirst}, {TPS_InProtocolSecond, actionTPS_InProtocolSecond}, {TPS_InProtocolEnd, actionTPS_InProtocolEnd}, - {TPS_InHyphenLatWordFirst, actionTPS_InHyphenLatWordFirst}, - {TPS_InHyphenLatWord, actionTPS_InHyphenLatWord}, - {TPS_InHyphenCyrWordFirst, actionTPS_InHyphenCyrWordFirst}, - {TPS_InHyphenCyrWord, actionTPS_InHyphenCyrWord}, - {TPS_InHyphenUWordFirst, actionTPS_InHyphenUWordFirst}, - {TPS_InHyphenUWord, actionTPS_InHyphenUWord}, + {TPS_InHyphenAsciiWordFirst, actionTPS_InHyphenAsciiWordFirst}, + {TPS_InHyphenAsciiWord, actionTPS_InHyphenAsciiWord}, + {TPS_InHyphenWordFirst, actionTPS_InHyphenWordFirst}, + {TPS_InHyphenWord, actionTPS_InHyphenWord}, + {TPS_InHyphenNumWordFirst, actionTPS_InHyphenNumWordFirst}, + {TPS_InHyphenNumWord, actionTPS_InHyphenNumWord}, {TPS_InHyphenValueFirst, actionTPS_InHyphenValueFirst}, {TPS_InHyphenValue, actionTPS_InHyphenValue}, {TPS_InHyphenValueExact, actionTPS_InHyphenValueExact}, {TPS_InParseHyphen, actionTPS_InParseHyphen}, {TPS_InParseHyphenHyphen, actionTPS_InParseHyphenHyphen}, - {TPS_InHyphenCyrWordPart, actionTPS_InHyphenCyrWordPart}, - {TPS_InHyphenLatWordPart, actionTPS_InHyphenLatWordPart}, - {TPS_InHyphenUWordPart, actionTPS_InHyphenUWordPart}, + {TPS_InHyphenWordPart, actionTPS_InHyphenWordPart}, + {TPS_InHyphenAsciiWordPart, actionTPS_InHyphenAsciiWordPart}, + {TPS_InHyphenNumWordPart, actionTPS_InHyphenNumWordPart}, {TPS_InHyphenUnsignedInt, actionTPS_InHyphenUnsignedInt}, {TPS_InHDecimalPartFirst, actionTPS_InHDecimalPartFirst}, {TPS_InHDecimalPart, actionTPS_InHDecimalPart}, @@ -1378,10 +1379,11 @@ TParserGet(TParser * prs) { TParserStateActionItem *item = NULL; + Assert(prs->state); + if (prs->state->posbyte >= prs->lenstr) return false; - Assert(prs->state); prs->lexeme = prs->str + prs->state->posbyte; prs->state->pushedAtAction = NULL; @@ -1488,10 +1490,12 @@ TParserGet(TParser * prs) prs->state->state = item->tostate; /* check for go away */ - if ((item->flags & A_BINGO) || (prs->state->posbyte >= prs->lenstr && (item->flags & A_RERUN) == 0)) + if ((item->flags & A_BINGO) || + (prs->state->posbyte >= prs->lenstr && + (item->flags & A_RERUN) == 0)) break; - /* go to begining of loop if we should rerun or we just restore state */ + /* go to beginning of loop if we should rerun or we just restore state */ if (item->flags & (A_RERUN | A_POP)) continue; @@ -1557,16 +1561,15 @@ prsd_end(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } -#define LEAVETOKEN(x) ( (x)==12 ) -#define COMPLEXTOKEN(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 ) -#define ENDPUNCTOKEN(x) ( (x)==12 ) - +#define LEAVETOKEN(x) ( (x)==SPACE ) +#define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD ) +#define ENDPUNCTOKEN(x) ( (x)==SPACE ) -#define TS_IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 ) -#define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 ) -#define HTMLHLIDIGNORE(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 ) -#define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) ) -#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || TS_IDIGNORE(x) ) +#define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==HTMLENTITY ) +#define HLIDIGNORE(x) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD ) +#define HTMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD ) +#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) ) +#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) ) typedef struct { diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 1fa5428a967..cbbd6737992 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -37,7 +37,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.435 2007/10/22 20:13:37 tgl Exp $ + * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.436 2007/10/23 20:46:12 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 200710221 +#define CATALOG_VERSION_NO 200710231 #endif diff --git a/src/test/regress/expected/tsdicts.out b/src/test/regress/expected/tsdicts.out index a1c13e70870..3520baceac7 100644 --- a/src/test/regress/expected/tsdicts.out +++ b/src/test/regress/expected/tsdicts.out @@ -209,8 +209,8 @@ SELECT ts_lexize('synonym', 'Gogle'); (1 row) -- Create and simple test thesaurus dictionary --- More test in configuration checks because of ts_lexize --- can not give more tat one word as it may wish thesaurus. +-- More tests in configuration checks because ts_lexize() +-- cannot pass more than one word to thesaurus. CREATE TEXT SEARCH DICTIONARY thesaurus ( Template=thesaurus, DictFile=thesaurus_sample, @@ -227,7 +227,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst ( COPY=english ); ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR - hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word + word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart WITH ispell, english_stem; SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot'); to_tsvector @@ -276,7 +276,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst ( COPY=english ); ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR - lword, lpart_hword, lhword + asciiword, hword_asciipart, asciihword WITH synonym, english_stem; SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre'); to_tsvector @@ -296,7 +296,7 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst ( COPY=synonym_tst ); ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR - lword, lpart_hword, lhword + asciiword, hword_asciipart, asciihword WITH synonym, thesaurus, english_stem; SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one'); to_tsvector diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index 6eb453194da..3d55715be38 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -208,31 +208,31 @@ SELECT ts_lexize('english_stem', 'identity'); (1 row) SELECT * FROM ts_token_type('default'); - tokid | alias | description --------+--------------+----------------------------------- - 1 | lword | Latin word - 2 | nlword | Non-latin word - 3 | word | Word - 4 | email | Email - 5 | url | URL - 6 | host | Host - 7 | sfloat | Scientific notation - 8 | version | VERSION - 9 | part_hword | Part of hyphenated word - 10 | nlpart_hword | Non-latin part of hyphenated word - 11 | lpart_hword | Latin part of hyphenated word - 12 | blank | Space symbols - 13 | tag | HTML Tag - 14 | protocol | Protocol head - 15 | hword | Hyphenated word - 16 | lhword | Latin hyphenated word - 17 | nlhword | Non-latin hyphenated word - 18 | uri | URI - 19 | file | File or path name - 20 | float | Decimal notation - 21 | int | Signed integer - 22 | uint | Unsigned integer - 23 | entity | HTML Entity + tokid | alias | description +-------+-----------------+------------------------------------------ + 1 | asciiword | Word, all ASCII + 2 | word | Word, all letters + 3 | numword | Word, letters and digits + 4 | email | Email address + 5 | url | URL + 6 | host | Host + 7 | sfloat | Scientific notation + 8 | version | Version number + 9 | hword_numpart | Hyphenated word part, letters and digits + 10 | hword_part | Hyphenated word part, all letters + 11 | hword_asciipart | Hyphenated word part, all ASCII + 12 | blank | Space symbols + 13 | tag | HTML tag + 14 | protocol | Protocol head + 15 | numhword | Hyphenated word, letters and digits + 16 | asciihword | Hyphenated word, all ASCII + 17 | hword | Hyphenated word, all letters + 18 | uri | URI + 19 | file | File or path name + 20 | float | Decimal notation + 21 | int | Signed integer + 22 | uint | Unsigned integer + 23 | entity | HTML entity (23 rows) SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> diff --git a/src/test/regress/sql/tsdicts.sql b/src/test/regress/sql/tsdicts.sql index 2e6cf791d87..f36e63a3110 100644 --- a/src/test/regress/sql/tsdicts.sql +++ b/src/test/regress/sql/tsdicts.sql @@ -58,8 +58,8 @@ SELECT ts_lexize('synonym', 'PoStGrEs'); SELECT ts_lexize('synonym', 'Gogle'); -- Create and simple test thesaurus dictionary --- More test in configuration checks because of ts_lexize --- can not give more tat one word as it may wish thesaurus. +-- More tests in configuration checks because ts_lexize() +-- cannot pass more than one word to thesaurus. CREATE TEXT SEARCH DICTIONARY thesaurus ( Template=thesaurus, DictFile=thesaurus_sample, @@ -74,7 +74,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst ( ); ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR - hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word + word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart WITH ispell, english_stem; SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot'); @@ -99,7 +99,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst ( ); ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR - lword, lpart_hword, lhword + asciiword, hword_asciipart, asciihword WITH synonym, english_stem; SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre'); @@ -112,10 +112,9 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst ( ); ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR - lword, lpart_hword, lhword + asciiword, hword_asciipart, asciihword WITH synonym, thesaurus, english_stem; SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one'); SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usually called supernovae (abbrevation SN)'); SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets'); - diff --git a/src/tools/msvc/Install.pm b/src/tools/msvc/Install.pm index 57815a2dc87..798810343ef 100644 --- a/src/tools/msvc/Install.pm +++ b/src/tools/msvc/Install.pm @@ -3,7 +3,7 @@ package Install; # # Package that provides 'make install' functionality for msvc builds # -# $PostgreSQL: pgsql/src/tools/msvc/Install.pm,v 1.24 2007/10/16 16:00:00 tgl Exp $ +# $PostgreSQL: pgsql/src/tools/msvc/Install.pm,v 1.25 2007/10/23 20:46:12 tgl Exp $ # use strict; use warnings; @@ -258,7 +258,7 @@ sub GenerateTsearchFiles while ($#pieces > 0) { my $lang = shift @pieces || last; - my $latlang = shift @pieces || last; + my $asclang = shift @pieces || last; my $txt = $tmpl; my $stop = ''; @@ -269,8 +269,8 @@ sub GenerateTsearchFiles $txt =~ s#_LANGNAME_#${lang}#gs; $txt =~ s#_DICTNAME_#${lang}_stem#gs; $txt =~ s#_CFGNAME_#${lang}#gs; - $txt =~ s#_LATDICTNAME_#${latlang}_stem#gs; - $txt =~ s#_NONLATDICTNAME_#${lang}_stem#gs; + $txt =~ s#_ASCDICTNAME_#${asclang}_stem#gs; + $txt =~ s#_NONASCDICTNAME_#${lang}_stem#gs; $txt =~ s#_STOPWORDS_#$stop#gs; print $F $txt; print "."; |