diff options
Diffstat (limited to 'src/backend/utils/adt/tsquery.c')
-rw-r--r-- | src/backend/utils/adt/tsquery.c | 390 |
1 files changed, 306 insertions, 84 deletions
diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c index 1ccbf790306..793c0e5dd1c 100644 --- a/src/backend/utils/adt/tsquery.c +++ b/src/backend/utils/adt/tsquery.c @@ -32,14 +32,53 @@ const int tsearch_op_priority[OP_COUNT] = 3 /* OP_PHRASE */ }; +/* + * parser's states + */ +typedef enum +{ + WAITOPERAND = 1, + WAITOPERATOR = 2, + WAITFIRSTOPERAND = 3 +} ts_parserstate; + +/* + * token types for parsing + */ +typedef enum +{ + PT_END = 0, + PT_ERR = 1, + PT_VAL = 2, + PT_OPR = 3, + PT_OPEN = 4, + PT_CLOSE = 5 +} ts_tokentype; + +/* + * get token from query string + * + * *operator is filled in with OP_* when return values is PT_OPR, + * but *weight could contain a distance value in case of phrase operator. + * *strval, *lenval and *weight are filled in when return value is PT_VAL + * + */ +typedef ts_tokentype (*ts_tokenizer)(TSQueryParserState state, int8 *operator, + int *lenval, char **strval, + int16 *weight, bool *prefix); + struct TSQueryParserStateData { - /* State for gettoken_query */ + /* Tokenizer used for parsing tsquery */ + ts_tokenizer gettoken; + + /* State of tokenizer function */ char *buffer; /* entire string we are scanning */ char *buf; /* current scan point */ - int state; int count; /* nesting count, incremented by (, * decremented by ) */ + bool in_quotes; /* phrase in quotes "" */ + ts_parserstate state; /* polish (prefix) notation in list, filled in by push* functions */ List *polstr; @@ -57,12 +96,6 @@ struct TSQueryParserStateData TSVectorParseState valstate; }; -/* parser's states */ -#define WAITOPERAND 1 -#define WAITOPERATOR 2 -#define WAITFIRSTOPERAND 3 -#define WAITSINGLEOPERAND 4 - /* * subroutine to parse the modifiers (weight and prefix flag currently) * part, like ':AB*' of a query. @@ -118,18 +151,17 @@ get_modifiers(char *buf, int16 *weight, bool *prefix) * * The buffer should begin with '<' char */ -static char * -parse_phrase_operator(char *buf, int16 *distance) +static bool +parse_phrase_operator(TSQueryParserState pstate, int16 *distance) { enum { PHRASE_OPEN = 0, PHRASE_DIST, PHRASE_CLOSE, - PHRASE_ERR, PHRASE_FINISH } state = PHRASE_OPEN; - char *ptr = buf; + char *ptr = pstate->buf; char *endptr; long l = 1; /* default distance */ @@ -138,9 +170,13 @@ parse_phrase_operator(char *buf, int16 *distance) switch (state) { case PHRASE_OPEN: - Assert(t_iseq(ptr, '<')); - state = PHRASE_DIST; - ptr++; + if (t_iseq(ptr, '<')) + { + state = PHRASE_DIST; + ptr++; + } + else + return false; break; case PHRASE_DIST: @@ -148,18 +184,16 @@ parse_phrase_operator(char *buf, int16 *distance) { state = PHRASE_CLOSE; ptr++; - break; + continue; } + if (!t_isdigit(ptr)) - { - state = PHRASE_ERR; - break; - } + return false; errno = 0; l = strtol(ptr, &endptr, 10); if (ptr == endptr) - state = PHRASE_ERR; + return false; else if (errno == ERANGE || l < 0 || l > MAXENTRYPOS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -179,54 +213,77 @@ parse_phrase_operator(char *buf, int16 *distance) ptr++; } else - state = PHRASE_ERR; + return false; break; case PHRASE_FINISH: *distance = (int16) l; - return ptr; - - case PHRASE_ERR: - default: - goto err; + pstate->buf = ptr; + return true; } } -err: - *distance = -1; - return buf; + return false; } /* - * token types for parsing + * Parse OR operator used in websearch_to_tsquery(), returns true if we + * believe that "OR" literal could be an operator OR */ -typedef enum +static bool +parse_or_operator(TSQueryParserState pstate) { - PT_END = 0, - PT_ERR = 1, - PT_VAL = 2, - PT_OPR = 3, - PT_OPEN = 4, - PT_CLOSE = 5 -} ts_tokentype; + char *ptr = pstate->buf; + + if (pstate->in_quotes) + return false; + + /* it should begin with "OR" literal */ + if (pg_strncasecmp(ptr, "or", 2) != 0) + return false; + + ptr += 2; + + /* + * it shouldn't be a part of any word but somewhere later it should be some + * operand + */ + if (*ptr == '\0') /* no operand */ + return false; + + /* it shouldn't be a part of any word */ + if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalpha(ptr) || t_isdigit(ptr)) + return false; + + for(;;) + { + ptr += pg_mblen(ptr); + + if (*ptr == '\0') /* got end of string without operand */ + return false; + + /* + * Suppose, we found an operand, but could be a not correct operand. So + * we still treat OR literal as operation with possibly incorrect + * operand and will not search it as lexeme + */ + if (!t_isspace(ptr)) + break; + } + + pstate->buf += 2; + return true; +} -/* - * get token from query string - * - * *operator is filled in with OP_* when return values is PT_OPR, - * but *weight could contain a distance value in case of phrase operator. - * *strval, *lenval and *weight are filled in when return value is PT_VAL - * - */ static ts_tokentype -gettoken_query(TSQueryParserState state, - int8 *operator, - int *lenval, char **strval, int16 *weight, bool *prefix) +gettoken_query_standard(TSQueryParserState state, int8 *operator, + int *lenval, char **strval, + int16 *weight, bool *prefix) { *weight = 0; *prefix = false; - while (1) + while (true) { switch (state->state) { @@ -234,17 +291,16 @@ gettoken_query(TSQueryParserState state, case WAITOPERAND: if (t_iseq(state->buf, '!')) { - (state->buf)++; /* can safely ++, t_iseq guarantee that - * pg_mblen()==1 */ - *operator = OP_NOT; + state->buf++; state->state = WAITOPERAND; + *operator = OP_NOT; return PT_OPR; } else if (t_iseq(state->buf, '(')) { - state->count++; - (state->buf)++; + state->buf++; state->state = WAITOPERAND; + state->count++; return PT_OPEN; } else if (t_iseq(state->buf, ':')) @@ -256,19 +312,19 @@ gettoken_query(TSQueryParserState state, } else if (!t_isspace(state->buf)) { - /* - * We rely on the tsvector parser to parse the value for - * us - */ + /* We rely on the tsvector parser to parse the value for us */ reset_tsvector_parser(state->valstate, state->buf); - if (gettoken_tsvector(state->valstate, strval, lenval, NULL, NULL, &state->buf)) + if (gettoken_tsvector(state->valstate, strval, lenval, + NULL, NULL, &state->buf)) { state->buf = get_modifiers(state->buf, weight, prefix); state->state = WAITOPERATOR; return PT_VAL; } else if (state->state == WAITFIRSTOPERAND) + { return PT_END; + } else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -276,58 +332,206 @@ gettoken_query(TSQueryParserState state, state->buffer))); } break; + case WAITOPERATOR: if (t_iseq(state->buf, '&')) { + state->buf++; state->state = WAITOPERAND; *operator = OP_AND; - (state->buf)++; return PT_OPR; } else if (t_iseq(state->buf, '|')) { + state->buf++; state->state = WAITOPERAND; *operator = OP_OR; - (state->buf)++; return PT_OPR; } - else if (t_iseq(state->buf, '<')) + else if (parse_phrase_operator(state, weight)) { + /* weight var is used as storage for distance */ state->state = WAITOPERAND; *operator = OP_PHRASE; - /* weight var is used as storage for distance */ - state->buf = parse_phrase_operator(state->buf, weight); - if (*weight < 0) - return PT_ERR; return PT_OPR; } else if (t_iseq(state->buf, ')')) { - (state->buf)++; + state->buf++; state->count--; return (state->count < 0) ? PT_ERR : PT_CLOSE; } - else if (*(state->buf) == '\0') + else if (*state->buf == '\0') + { return (state->count) ? PT_ERR : PT_END; + } else if (!t_isspace(state->buf)) + { return PT_ERR; + } + break; + } + + state->buf += pg_mblen(state->buf); + } +} + +static ts_tokentype +gettoken_query_websearch(TSQueryParserState state, int8 *operator, + int *lenval, char **strval, + int16 *weight, bool *prefix) +{ + *weight = 0; + *prefix = false; + + while (true) + { + switch (state->state) + { + case WAITFIRSTOPERAND: + case WAITOPERAND: + if (t_iseq(state->buf, '-')) + { + state->buf++; + state->state = WAITOPERAND; + + if (state->in_quotes) + continue; + + *operator = OP_NOT; + return PT_OPR; + } + else if (t_iseq(state->buf, '"')) + { + state->buf++; + + if (!state->in_quotes) + { + state->state = WAITOPERAND; + + if (strchr(state->buf, '"')) + { + /* quoted text should be ordered <-> */ + state->in_quotes = true; + return PT_OPEN; + } + + /* web search tolerates missing quotes */ + continue; + } + else + { + /* we have to provide an operand */ + state->in_quotes = false; + state->state = WAITOPERATOR; + pushStop(state); + return PT_CLOSE; + } + } + else if (ISOPERATOR(state->buf)) + { + /* or else gettoken_tsvector() will raise an error */ + state->buf++; + state->state = WAITOPERAND; + continue; + } + else if (!t_isspace(state->buf)) + { + /* We rely on the tsvector parser to parse the value for us */ + reset_tsvector_parser(state->valstate, state->buf); + if (gettoken_tsvector(state->valstate, strval, lenval, + NULL, NULL, &state->buf)) + { + state->state = WAITOPERATOR; + return PT_VAL; + } + else if (state->state == WAITFIRSTOPERAND) + { + return PT_END; + } + else + { + /* finally, we have to provide an operand */ + pushStop(state); + return PT_END; + } + } break; - case WAITSINGLEOPERAND: - if (*(state->buf) == '\0') + + case WAITOPERATOR: + if (t_iseq(state->buf, '"')) + { + if (!state->in_quotes) + { + /* + * put implicit AND after an operand + * and handle this quote in WAITOPERAND + */ + state->state = WAITOPERAND; + *operator = OP_AND; + return PT_OPR; + } + else + { + state->buf++; + + /* just close quotes */ + state->in_quotes = false; + return PT_CLOSE; + } + } + else if (parse_or_operator(state)) + { + state->state = WAITOPERAND; + *operator = OP_OR; + return PT_OPR; + } + else if (*state->buf == '\0') + { return PT_END; - *strval = state->buf; - *lenval = strlen(state->buf); - state->buf += strlen(state->buf); - state->count++; - return PT_VAL; - default: - return PT_ERR; + } + else if (!t_isspace(state->buf)) + { + if (state->in_quotes) + { + /* put implicit <-> after an operand */ + *operator = OP_PHRASE; + *weight = 1; + } + else + { + /* put implicit AND after an operand */ + *operator = OP_AND; + } + + state->state = WAITOPERAND; + return PT_OPR; + } break; } + state->buf += pg_mblen(state->buf); } } +static ts_tokentype +gettoken_query_plain(TSQueryParserState state, int8 *operator, + int *lenval, char **strval, + int16 *weight, bool *prefix) +{ + *weight = 0; + *prefix = false; + + if (*state->buf == '\0') + return PT_END; + + *strval = state->buf; + *lenval = strlen(state->buf); + state->buf += *lenval; + state->count++; + return PT_VAL; +} + /* * Push an operator to state->polstr */ @@ -489,7 +693,9 @@ makepol(TSQueryParserState state, /* since this function recurses, it could be driven to stack overflow */ check_stack_depth(); - while ((type = gettoken_query(state, &operator, &lenval, &strval, &weight, &prefix)) != PT_END) + while ((type = state->gettoken(state, &operator, + &lenval, &strval, + &weight, &prefix)) != PT_END) { switch (type) { @@ -605,7 +811,7 @@ TSQuery parse_tsquery(char *buf, PushFunction pushval, Datum opaque, - bool isplain) + int flags) { struct TSQueryParserStateData state; int i; @@ -614,16 +820,32 @@ parse_tsquery(char *buf, QueryItem *ptr; ListCell *cell; bool needcleanup; + int tsv_flags = P_TSV_OPR_IS_DELIM | P_TSV_IS_TSQUERY; + + /* plain should not be used with web */ + Assert((flags & (P_TSQ_PLAIN | P_TSQ_WEB)) != (P_TSQ_PLAIN | P_TSQ_WEB)); + + /* select suitable tokenizer */ + if (flags & P_TSQ_PLAIN) + state.gettoken = gettoken_query_plain; + else if (flags & P_TSQ_WEB) + { + state.gettoken = gettoken_query_websearch; + tsv_flags |= P_TSV_IS_WEB; + } + else + state.gettoken = gettoken_query_standard; /* init state */ state.buffer = buf; state.buf = buf; - state.state = (isplain) ? WAITSINGLEOPERAND : WAITFIRSTOPERAND; state.count = 0; + state.in_quotes = false; + state.state = WAITFIRSTOPERAND; state.polstr = NIL; /* init value parser's state */ - state.valstate = init_tsvector_parser(state.buffer, true, true); + state.valstate = init_tsvector_parser(state.buffer, tsv_flags); /* init list of operand */ state.sumlen = 0; @@ -716,7 +938,7 @@ tsqueryin(PG_FUNCTION_ARGS) { char *in = PG_GETARG_CSTRING(0); - PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, PointerGetDatum(NULL), false)); + PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, PointerGetDatum(NULL), 0)); } /* |