diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2023-01-19 16:21:44 -0500 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2023-01-19 16:21:44 -0500 |
commit | 5a617d75d3b31414f378dd764a11db1a08fa79bb (patch) | |
tree | 0c366969873581f99ca2c99b50c6d21eab11c6bd /src/backend/utils/adt/tsvector_op.c | |
parent | 44e9e34266efd42901bf7b12552f2033972d70b7 (diff) | |
download | postgresql-5a617d75d3b31414f378dd764a11db1a08fa79bb.tar.gz postgresql-5a617d75d3b31414f378dd764a11db1a08fa79bb.zip |
Fix ts_headline() to handle ORs and phrase queries more honestly.
This patch largely reverts what I did in commits c9b0c678d and
78e73e875. The maximum cover length limit that I added in 78e73e875
(to band-aid over c9b0c678d's performance issues) creates too many
user-visible behavior discrepancies, as complained of for example in
bug #17691. The real problem with hlCover() is not what I thought
at the time, but more that it seems to have been designed with only
AND tsquery semantics in mind. It doesn't work quite right for OR,
and even less so for NOT or phrase queries. However, we can improve
that situation by building a variant of TS_execute() that returns a
list of match locations. We already get an ExecPhraseData struct
representing match locations for the primitive case of a simple match,
as well as one for a phrase match; we just need to add some logic to
combine these for AND and OR operators. The result is a list of
ExecPhraseDatas, which hlCover can regard as having simple AND
semantics, so that its old algorithm works correctly.
There's still a lot not to like about ts_headline's behavior, but
I think the remaining issues have to do with the heuristics used
in mark_hl_words and mark_hl_fragments (which, likewise, were not
revisited when phrase search was added). Improving those is a task
for another day.
Patch by me; thanks to Alvaro Herrera for review.
Discussion: https://postgr.es/m/840.1669405935@sss.pgh.pa.us
Diffstat (limited to 'src/backend/utils/adt/tsvector_op.c')
-rw-r--r-- | src/backend/utils/adt/tsvector_op.c | 181 |
1 files changed, 178 insertions, 3 deletions
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index c594c18a3ae..a38db4697d3 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -71,6 +71,10 @@ typedef struct static TSTernaryValue TS_execute_recurse(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond); +static bool TS_execute_locations_recurse(QueryItem *curitem, + void *arg, + TSExecuteCallback chkcond, + List **locations); static int tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len); static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column); @@ -1571,10 +1575,10 @@ TS_phrase_output(ExecPhraseData *data, * In addition to the same arguments used for TS_execute, the caller may pass * a preinitialized-to-zeroes ExecPhraseData struct, to be filled with lexeme * match position info on success. data == NULL if no position data need be - * returned. (In practice, outside callers pass NULL, and only the internal - * recursion cases pass a data pointer.) + * returned. * Note: the function assumes data != NULL for operators other than OP_PHRASE. - * This is OK because an outside call always starts from an OP_PHRASE node. + * This is OK because an outside call always starts from an OP_PHRASE node, + * and all internal recursion cases pass data != NULL. * * The detailed semantics of the match data, given that the function returned * TS_YES (successful match), are: @@ -1972,6 +1976,177 @@ TS_execute_recurse(QueryItem *curitem, void *arg, uint32 flags, } /* + * Evaluate tsquery and report locations of matching terms. + * + * This is like TS_execute except that it returns match locations not just + * success/failure status. The callback function is required to provide + * position data (we report failure if it doesn't). + * + * On successful match, the result is a List of ExecPhraseData structs, one + * for each AND'ed term or phrase operator in the query. Each struct includes + * a sorted array of lexeme positions matching that term. (Recall that for + * phrase operators, the match includes width+1 lexemes, and the recorded + * position is that of the rightmost lexeme.) + * + * OR subexpressions are handled by union'ing their match locations into a + * single List element, which is valid since any of those locations contains + * a match. However, when some of the OR'ed terms are phrase operators, we + * report the maximum width of any of the OR'ed terms, making such cases + * slightly imprecise in the conservative direction. (For example, if the + * tsquery is "(A <-> B) | C", an occurrence of C in the data would be + * reported as though it includes the lexeme to the left of C.) + * + * Locations of NOT subexpressions are not reported. (Obviously, there can + * be no successful NOT matches at top level, or the match would have failed. + * So this amounts to ignoring NOTs underneath ORs.) + * + * The result is NIL if no match, or if position data was not returned. + * + * Arguments are the same as for TS_execute, although flags is currently + * vestigial since none of the defined bits are sensible here. + */ +List * +TS_execute_locations(QueryItem *curitem, void *arg, + uint32 flags, + TSExecuteCallback chkcond) +{ + List *result; + + /* No flags supported, as yet */ + Assert(flags == TS_EXEC_EMPTY); + if (TS_execute_locations_recurse(curitem, arg, chkcond, &result)) + return result; + return NIL; +} + +/* + * TS_execute_locations recursion for operators above any phrase operator. + * OP_PHRASE subexpressions can be passed off to TS_phrase_execute. + */ +static bool +TS_execute_locations_recurse(QueryItem *curitem, void *arg, + TSExecuteCallback chkcond, + List **locations) +{ + bool lmatch, + rmatch; + List *llocations, + *rlocations; + ExecPhraseData *data; + + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + /* ... and let's check for query cancel while we're at it */ + CHECK_FOR_INTERRUPTS(); + + /* Default locations result is empty */ + *locations = NIL; + + if (curitem->type == QI_VAL) + { + data = palloc0_object(ExecPhraseData); + if (chkcond(arg, (QueryOperand *) curitem, data) == TS_YES) + { + *locations = list_make1(data); + return true; + } + pfree(data); + return false; + } + + switch (curitem->qoperator.oper) + { + case OP_NOT: + if (!TS_execute_locations_recurse(curitem + 1, arg, chkcond, + &llocations)) + return true; /* we don't pass back any locations */ + return false; + + case OP_AND: + if (!TS_execute_locations_recurse(curitem + curitem->qoperator.left, + arg, chkcond, + &llocations)) + return false; + if (!TS_execute_locations_recurse(curitem + 1, + arg, chkcond, + &rlocations)) + return false; + *locations = list_concat(llocations, rlocations); + return true; + + case OP_OR: + lmatch = TS_execute_locations_recurse(curitem + curitem->qoperator.left, + arg, chkcond, + &llocations); + rmatch = TS_execute_locations_recurse(curitem + 1, + arg, chkcond, + &rlocations); + if (lmatch || rmatch) + { + /* + * We generate an AND'able location struct from each + * combination of sub-matches, following the disjunctive law + * (A & B) | (C & D) = (A | C) & (A | D) & (B | C) & (B | D). + * + * However, if either input didn't produce locations (i.e., it + * failed or was a NOT), we must just return the other list. + */ + if (llocations == NIL) + *locations = rlocations; + else if (rlocations == NIL) + *locations = llocations; + else + { + ListCell *ll; + + foreach(ll, llocations) + { + ExecPhraseData *ldata = (ExecPhraseData *) lfirst(ll); + ListCell *lr; + + foreach(lr, rlocations) + { + ExecPhraseData *rdata = (ExecPhraseData *) lfirst(lr); + + data = palloc0_object(ExecPhraseData); + (void) TS_phrase_output(data, ldata, rdata, + TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY, + 0, 0, + ldata->npos + rdata->npos); + /* Report the larger width, as explained above. */ + data->width = Max(ldata->width, rdata->width); + *locations = lappend(*locations, data); + } + } + } + + return true; + } + return false; + + case OP_PHRASE: + /* We can hand this off to TS_phrase_execute */ + data = palloc0_object(ExecPhraseData); + if (TS_phrase_execute(curitem, arg, TS_EXEC_EMPTY, chkcond, + data) == TS_YES) + { + if (!data->negate) + *locations = list_make1(data); + return true; + } + pfree(data); + return false; + + default: + elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); + } + + /* not reachable, but keep compiler quiet */ + return false; +} + +/* * Detect whether a tsquery boolean expression requires any positive matches * to values shown in the tsquery. * |