diff options
Diffstat (limited to 'contrib/tsearch2/wordparser/parser.l')
-rw-r--r-- | contrib/tsearch2/wordparser/parser.l | 346 |
1 files changed, 0 insertions, 346 deletions
%{
/*
 * parser.l
 *	  flex scanner for the tsearch2 word parser
 *	  (contrib/tsearch2/wordparser/parser.l).
 *
 * Each call of the generated scanner returns one token type (TAG, EMAIL,
 * HOST, LATWORD, ...) and publishes the matched text through the globals
 * "token" and "tokenlen".  Depending on the rule, "token" points at
 *	- the scanner's own match text (tsearch2_yytext),
 *	- the tag accumulator ts.str (TAG tokens), or
 *	- the private copy "s" (FURL and the *HYPHENWORD tokens).
 */
#include "postgres.h"

#include "deflex.h"
#include "parser.h"
#include "common.h"

/*
 * Avoid exit() on fatal scanner errors: every fprintf() in the generated
 * scanner is redirected to ts_error(ERROR, ...).
 * NOTE(review): ts_error is declared outside this file; presumably it does
 * not return at ERROR level, so the exit() that follows is never reached.
 */
#undef fprintf
#define fprintf(file, fmt, msg)  ts_error(ERROR, fmt, msg)

char	   *token = NULL;		/* pointer to the current token text */
int			tokenlen;			/* length of the current token, in bytes */
static char *s = NULL;			/* private copy, used to return a WHOLE
								 * hyphenated word or URL */

YY_BUFFER_STATE buf = NULL;		/* buffer being parsed; needed for parsing
								 * from a string */

/*
 * Growable accumulator for the text of a tag, comment or script element.
 * Invariant kept by addTag(): str == NULL if and only if tlen == 0.
 */
typedef struct
{
	int			tlen;			/* allocated size of str, in bytes */
	int			clen;			/* bytes currently used, excluding the '\0' */
	char	   *str;			/* malloc'ed, always '\0'-terminated when set */
} TagStorage;

static TagStorage ts = {0, 0, NULL};

/*
 * Append the current match (tsearch2_yytext, tsearch2_yyleng bytes) to the
 * tag accumulator, growing it as needed, and keep it '\0'-terminated.
 *
 * The buffer and its recorded size are only updated after realloc()
 * succeeded, so an out-of-memory error neither leaks the old buffer nor
 * leaves ts in an inconsistent state.
 */
static void
addTag(void)
{
	int			needed = ts.clen + tsearch2_yyleng + 1;

	if (needed > ts.tlen)
	{
		int			newlen = ts.tlen;
		char	   *newstr;

		/*
		 * Double the size while that still fits below "needed"; otherwise
		 * jump straight to "needed".  This cannot loop forever when the
		 * current size is 0 and cannot overflow while doubling.
		 */
		while (newlen < needed)
			newlen = (newlen > 0 && newlen <= needed / 2) ? newlen * 2 : needed;

		/* realloc(NULL, n) behaves like malloc(n): covers the first use */
		newstr = realloc(ts.str, newlen);
		if (!newstr)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
		ts.str = newstr;
		ts.tlen = newlen;
	}

	memcpy(ts.str + ts.clen, tsearch2_yytext, tsearch2_yyleng);
	ts.clen += tsearch2_yyleng;
	ts.str[ts.clen] = '\0';
}

/*
 * Begin a new tag: discard whatever was accumulated before (the buffer
 * itself is kept for reuse) and store the current match as its first part.
 */
static void
startTag(void)
{
	ts.clen = 0;
	addTag();
}

/*
 * Copy the current match into the private buffer "s" and publish the copy
 * through token/tokenlen.
 *
 * Rules that return a whole composite token call this BEFORE yyless(0):
 * yyless(0) hands the complete match back to the scanner so that its parts
 * can be returned one by one afterwards, hence the text and its length have
 * to be saved first.
 */
static void
saveWholeToken(void)
{
	free(s);					/* free(NULL) is a no-op */
	s = strdup(tsearch2_yytext);
	if (!s)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	token = s;
	tokenlen = tsearch2_yyleng;
}

%}

%option 8bit
%option never-interactive
%option nodefault
%option nounput
%option noyywrap

/* parser's state for parsing hyphenated-word */
%x DELIM
/* parser's states for parsing URL */
%x URL
%x SERVER

/* parser's states for parsing TAGS */
%x INTAG
%x QINTAG
%x INCOMMENT
%x INSCRIPT

/* cyrillic koi8 char */
CYRALNUM	[0-9\200-\377]
CYRALPHA	[\200-\377]
ALPHA		[a-zA-Z\200-\377]
ALNUM		[0-9a-zA-Z\200-\377]


HOSTNAME	([-_[:alnum:]]+\.)+[[:alpha:]]+
URI			[-_[:alnum:]/%,\.;=&?#]+

%%

"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); }

<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
	/* the whole script element, up to the closing tag, is one TAG */
	BEGIN INITIAL;
	addTag();
	token = ts.str;
	tokenlen = ts.clen;
	return TAG;
}

"<!--" { BEGIN INCOMMENT; startTag(); }

<INCOMMENT>"-->" {
	BEGIN INITIAL;
	addTag();
	token = ts.str;
	tokenlen = ts.clen;
	return TAG;
}


"<"[\![:alpha:]] { BEGIN INTAG; startTag(); }

"</"[[:alpha:]] { BEGIN INTAG; startTag(); }

<INTAG>"\"" { BEGIN QINTAG; addTag(); }

<QINTAG>"\\\"" { addTag(); }

<QINTAG>"\"" { BEGIN INTAG; addTag(); }

<INTAG>">" {
	BEGIN INITIAL;
	addTag();
	token = ts.str;
	tokenlen = ts.clen;
	return TAG;
}

<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { addTag(); }

\&(quot|amp|nbsp|lt|gt)\; {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return HTMLENTITY;
}

\&\#[0-9][0-9]?[0-9]?\; {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return HTMLENTITY;
}

[-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return EMAIL;
}

[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return SCIENTIFIC;
}

[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return VERSIONNUMBER;
}

[+-]?[0-9]+\.[0-9]+ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return DECIMAL;
}

[+-][0-9]+ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return SIGNEDINT;
}

<DELIM,INITIAL>[0-9]+ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return UNSIGNEDINT;
}

http"://" {
	BEGIN URL;
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return HTTP;
}

ftp"://" {
	/* the ftp scheme is reported with the same token type as http */
	BEGIN URL;
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return HTTP;
}

<URL,INITIAL>{HOSTNAME}[/:]{URI} {
	/*
	 * Return the full URL once, then push the whole match back so that
	 * the SERVER state can return its HOST and URI parts separately.
	 */
	BEGIN SERVER;
	saveWholeToken();
	yyless( 0 );
	return FURL;
}

<SERVER,URL,INITIAL>{HOSTNAME} {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return HOST;
}

<SERVER>[/:]{URI} {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return URI;
}

[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return FILEPATH;
}

({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ {
	/*
	 * Return the whole hyphenated word once, then push it back so that
	 * the DELIM state can return its parts one by one.
	 */
	BEGIN DELIM;
	saveWholeToken();
	yyless( 0 );
	return CYRHYPHENWORD;
}

([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ {
	BEGIN DELIM;
	saveWholeToken();
	yyless( 0 );
	return LATHYPHENWORD;
}

({ALNUM}+-)+{ALNUM}+ /* composite-word */ {
	BEGIN DELIM;
	saveWholeToken();
	yyless( 0 );
	return HYPHENWORD;
}

<DELIM>[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return VERSIONNUMBER;
}

<DELIM>\+?[0-9]+\.[0-9]+ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return DECIMAL;
}

<DELIM>{CYRALPHA}+ /* one word in composite-word */ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return CYRPARTHYPHENWORD;
}

<DELIM>[[:alpha:]]+ /* one word in composite-word */ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return LATPARTHYPHENWORD;
}

<DELIM>{ALNUM}+ /* one word in composite-word */ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return PARTHYPHENWORD;
}

<DELIM>- {
	/* the hyphen between two parts is reported as SPACE */
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return SPACE;
}

<DELIM,SERVER,URL>.|\n /* return in basic state */ {
	/* nothing else matched here: rescan this character in INITIAL */
	BEGIN INITIAL;
	yyless( 0 );
}

{CYRALPHA}+ /* normal word */ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return CYRWORD;
}

[[:alpha:]]+ /* normal word */ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return LATWORD;
}

{ALNUM}+ /* normal word */ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return UWORD;
}

[ \r\n\t]+ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return SPACE;
}

. {
	/* any other single character is reported as SPACE as well */
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return SPACE;
}

%%

/*
 * Clean up after parsing from a string: release the private token copy and
 * the scanner buffer.  The tag accumulator (ts.str) is not released here;
 * it is reused by the next startTag().
 */
void
tsearch2_end_parse(void)
{
	free(s);					/* free(NULL) is a no-op */
	s = NULL;

	tsearch2_yy_delete_buffer( buf );
	buf = NULL;
}

/*
 * Start parsing the first "limit" bytes of "str".
 *
 * If a previous parse is still open it is closed first.  The scanner is
 * always put back into the INITIAL state, whatever state the previous parse
 * ended in.
 */
void
tsearch2_start_parse_str(char* str, int limit)
{
	if (buf)
		tsearch2_end_parse();

	buf = tsearch2_yy_scan_bytes( str, limit );
	tsearch2_yy_switch_to_buffer( buf );
	BEGIN INITIAL;
}