aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Tom Lane <tgl@sss.pgh.pa.us> 2022-11-21 17:07:07 -0500
committer Tom Lane <tgl@sss.pgh.pa.us> 2022-11-21 17:07:07 -0500
commit 0353db996e37c6b923863ee648f4d2f153bf619d (patch)
tree 9c8bc8103407781a12dd2680b0b8668f5b176512
parent b8988cf1d0a74f2e394278e0d88c2c133ee252fb (diff)
download postgresql-0353db996e37c6b923863ee648f4d2f153bf619d.tar.gz
postgresql-0353db996e37c6b923863ee648f4d2f153bf619d.zip
Add comments and a missing CHECK_FOR_INTERRUPTS in ts_headline.
I just spent an annoying amount of time reverse-engineering the 100%-undocumented API between ts_headline and the text search parser's prsheadline function. Add some commentary about that while it's fresh in mind. Also remove some unused macros in wparser_def.c.

While at it, I noticed that when commit 78e73e875 added a CHECK_FOR_INTERRUPTS call in TS_execute_recurse, it missed doing so in the parallel function TS_phrase_execute, which surely needs one just as much.

Back-patch because of the missing CHECK_FOR_INTERRUPTS. Might as well back-patch the rest of this too.
-rw-r--r-- src/backend/tsearch/ts_parse.c 13
-rw-r--r-- src/backend/tsearch/wparser_def.c 4
-rw-r--r-- src/backend/utils/adt/tsvector_op.c 3
-rw-r--r-- src/include/tsearch/ts_public.h 61
4 files changed, 60 insertions, 21 deletions
diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c
index 27b2cca2dfb..a87b442046a 100644
--- a/src/backend/tsearch/ts_parse.c
+++ b/src/backend/tsearch/ts_parse.c
@@ -433,6 +433,8 @@ parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
/*
* Headline framework
*/
+
+/* Add a word to prs->words[] */
static void
hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
{
@@ -449,6 +451,14 @@ hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
prs->curwords++;
}
+/*
+ * Add pos and matching-query-item data to the just-added word.
+ * Here, buf/buflen represent a processed lexeme, not raw token text.
+ *
+ * If the query contains more than one matching item, we replicate
+ * the last-added word so that each item can be pointed to. The
+ * duplicate entries are marked with repeated = 1.
+ */
static void
hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
{
@@ -589,6 +599,9 @@ hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int bu
FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
+/*
+ * Generate the headline, as a text object, from HeadlineParsedText.
+ */
text *
generateHeadline(HeadlineParsedText *prs)
{
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
index 826027844e7..2323a3b9086 100644
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -1914,10 +1914,6 @@ prsd_end(PG_FUNCTION_ARGS)
*/
/* token type classification macros */
-#define LEAVETOKEN(x) ( (x)==SPACE )
-#define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
-#define ENDPUNCTOKEN(x) ( (x)==SPACE )
-
#define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
#define HLIDREPLACE(x) ( (x)==TAG_T )
#define HLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index addc3491518..2ccd3bdbb0e 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -1622,6 +1622,9 @@ TS_phrase_execute(QueryItem *curitem, void *arg, uint32 flags,
/* since this function recurses, it could be driven to stack overflow */
check_stack_depth();
+ /* ... and let's check for query cancel while we're at it */
+ CHECK_FOR_INTERRUPTS();
+
if (curitem->type == QI_VAL)
return chkcond(arg, (QueryOperand *) curitem, data);
diff --git a/src/include/tsearch/ts_public.h b/src/include/tsearch/ts_public.h
index bde1e8abc13..fe2a16783de 100644
--- a/src/include/tsearch/ts_public.h
+++ b/src/include/tsearch/ts_public.h
@@ -30,33 +30,60 @@ typedef struct
} LexDescr;
/*
- * Interface to headline generator
+ * Interface to headline generator (tsparser's prsheadline function)
+ *
+ * HeadlineParsedText describes the text that is to be highlighted.
+ * Some fields are passed from the core code to the prsheadline function,
+ * while others are output from the prsheadline function.
+ *
+ * The principal data is words[], an array of HeadlineWordEntry,
+ * one entry per token, of length curwords.
+ * The fields of HeadlineWordEntry are:
+ *
+ * in, selected, replace, skip: these flags are initially zero
+ * and may be set by the prsheadline function. A consecutive group
+ * of tokens marked "in" form a "fragment" to be output.
+ * Such tokens may additionally be marked selected, replace, or skip
+ * to modify how they are shown. (If you set more than one of those
+ * bits, you get an unspecified one of those behaviors.)
+ *
+ * type, len, pos, word: filled by core code to describe the token.
+ *
+ * item: if the token matches any operand of the tsquery of interest,
+ * a pointer to such an operand. (If there are multiple matching
+ * operands, we generate extra copies of the HeadlineWordEntry to hold
+ * all the pointers. The extras are marked with repeated = 1 and should
+ * be ignored except for checking the item pointer.)
*/
typedef struct
{
- uint32 selected:1,
- in:1,
- replace:1,
- repeated:1,
- skip:1,
- unused:3,
- type:8,
- len:16;
- WordEntryPos pos;
- char *word;
- QueryOperand *item;
+ uint32 selected:1, /* token is to be highlighted */
+ in:1, /* token is part of headline */
+ replace:1, /* token is to be replaced with a space */
+ repeated:1, /* duplicate entry to hold item pointer */
+ skip:1, /* token is to be skipped (not output) */
+ unused:3, /* available bits */
+ type:8, /* parser's token category */
+ len:16; /* length of token */
+ WordEntryPos pos; /* position of token */
+ char *word; /* text of token (not null-terminated) */
+ QueryOperand *item; /* a matching query operand, or NULL if none */
} HeadlineWordEntry;
typedef struct
{
+ /* Fields filled by core code before calling prsheadline function: */
HeadlineWordEntry *words;
- int32 lenwords;
- int32 curwords;
- int32 vectorpos; /* positions a-la tsvector */
- char *startsel;
+ int32 lenwords; /* allocated length of words[] */
+ int32 curwords; /* current number of valid entries */
+ int32 vectorpos; /* used by ts_parse.c in filling pos fields */
+
+ /* The prsheadline function must fill these fields: */
+ /* Strings for marking selected tokens and separating fragments: */
+ char *startsel; /* palloc'd strings */
char *stopsel;
char *fragdelim;
- int16 startsellen;
+ int16 startsellen; /* lengths of strings */
int16 stopsellen;
int16 fragdelimlen;
} HeadlineParsedText;