aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Alexander Korotkov <akorotkov@postgresql.org> 2021-05-03 03:58:03 +0300
committer Alexander Korotkov <akorotkov@postgresql.org> 2021-05-03 04:18:19 +0300
commit eb086056fec44516efdd5db71244a079fed65c7f (patch)
tree 18b086f3c361e471380fd6f66c9bf6d7de81f5ac
parent 651d005e76bc0b9542615f609b4d0d946035dc58 (diff)
downloadpostgresql-eb086056fec44516efdd5db71244a079fed65c7f.tar.gz
postgresql-eb086056fec44516efdd5db71244a079fed65c7f.zip
Make websearch_to_tsquery() parse text in quotes as a single token
websearch_to_tsquery() splits text in quotes into tokens and connects them with the phrase operator on its own. However, that leads to surprising results when the token contains no words. For instance, websearch_to_tsquery('"aaa: bbb"') is 'aaa <2> bbb', because it is equivalent to to_tsquery(E'aaa <-> \':\' <-> bbb'). But websearch_to_tsquery('"aaa: bbb"') has to be 'aaa <-> bbb' in order to match to_tsvector('aaa: bbb'). Since 0c4f355c6a, we connect lexemes of complex tokens with phrase operators anyway. Thus, let's just make websearch_to_tsquery() parse text in quotes as a single token. Therefore, websearch_to_tsquery() should process the quoted text in the same way phraseto_tsquery() does. This solution is exactly what we need and also simplifies the code. This commit is an incompatible change, so we don't backpatch it. Reported-by: Valentin Gatien-Baron Discussion: https://postgr.es/m/CA%2B0DEqiZs7gdOd4ikmg%3D0UWG%2BSwWOLxPsk_JW-sx9WNOyrb0KQ%40mail.gmail.com Author: Alexander Korotkov Reviewed-by: Tom Lane, Zhihong Yu
-rw-r--r-- src/backend/utils/adt/tsquery.c 81
-rw-r--r-- src/test/regress/expected/tsearch.out 24
-rw-r--r-- src/test/regress/sql/tsearch.sql 1
3 files changed, 39 insertions, 67 deletions
diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c
index fe4470174f5..b2ca0d2f8a2 100644
--- a/src/backend/utils/adt/tsquery.c
+++ b/src/backend/utils/adt/tsquery.c
@@ -77,7 +77,6 @@ struct TSQueryParserStateData
char *buf; /* current scan point */
int count; /* nesting count, incremented by (,
* decremented by ) */
- bool in_quotes; /* phrase in quotes "" */
ts_parserstate state;
/* polish (prefix) notation in list, filled in by push* functions */
@@ -235,9 +234,6 @@ parse_or_operator(TSQueryParserState pstate)
{
char *ptr = pstate->buf;
- if (pstate->in_quotes)
- return false;
-
/* it should begin with "OR" literal */
if (pg_strncasecmp(ptr, "or", 2) != 0)
return false;
@@ -398,38 +394,29 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
state->buf++;
state->state = WAITOPERAND;
- if (state->in_quotes)
- continue;
-
*operator = OP_NOT;
return PT_OPR;
}
else if (t_iseq(state->buf, '"'))
{
+ /* Everything in quotes is processed as a single token */
+
+ /* skip opening quote */
state->buf++;
+ *strval = state->buf;
- if (!state->in_quotes)
- {
- state->state = WAITOPERAND;
+ /* iterate to the closing quote or end of the string */
+ while (*state->buf != '\0' && !t_iseq(state->buf, '"'))
+ state->buf++;
+ *lenval = state->buf - *strval;
- if (strchr(state->buf, '"'))
- {
- /* quoted text should be ordered <-> */
- state->in_quotes = true;
- return PT_OPEN;
- }
+ /* skip closing quote if not end of the string */
+ if (*state->buf != '\0')
+ state->buf++;
- /* web search tolerates missing quotes */
- continue;
- }
- else
- {
- /* we have to provide an operand */
- state->in_quotes = false;
- state->state = WAITOPERATOR;
- pushStop(state);
- return PT_CLOSE;
- }
+ state->state = WAITOPERATOR;
+ state->count++;
+ return PT_VAL;
}
else if (ISOPERATOR(state->buf))
{
@@ -467,24 +454,13 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
case WAITOPERATOR:
if (t_iseq(state->buf, '"'))
{
- if (!state->in_quotes)
- {
- /*
- * put implicit AND after an operand and handle this
- * quote in WAITOPERAND
- */
- state->state = WAITOPERAND;
- *operator = OP_AND;
- return PT_OPR;
- }
- else
- {
- state->buf++;
-
- /* just close quotes */
- state->in_quotes = false;
- return PT_CLOSE;
- }
+ /*
+ * put implicit AND after an operand and handle this quote
+ * in WAITOPERAND
+ */
+ state->state = WAITOPERAND;
+ *operator = OP_AND;
+ return PT_OPR;
}
else if (parse_or_operator(state))
{
@@ -498,18 +474,8 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
}
else if (!t_isspace(state->buf))
{
- if (state->in_quotes)
- {
- /* put implicit <-> after an operand */
- *operator = OP_PHRASE;
- *weight = 1;
- }
- else
- {
- /* put implicit AND after an operand */
- *operator = OP_AND;
- }
-
+ /* put implicit AND after an operand */
+ *operator = OP_AND;
state->state = WAITOPERAND;
return PT_OPR;
}
@@ -846,7 +812,6 @@ parse_tsquery(char *buf,
state.buffer = buf;
state.buf = buf;
state.count = 0;
- state.in_quotes = false;
state.state = WAITFIRSTOPERAND;
state.polstr = NIL;
diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out
index 4ae62320c9f..45b92a63388 100644
--- a/src/test/regress/expected/tsearch.out
+++ b/src/test/regress/expected/tsearch.out
@@ -2678,9 +2678,9 @@ select websearch_to_tsquery('simple', 'abc OR_abc');
-- test quotes
select websearch_to_tsquery('english', '"pg_class pg');
- websearch_to_tsquery
--------------------------
- 'pg' <-> 'class' & 'pg'
+ websearch_to_tsquery
+---------------------------
+ 'pg' <-> 'class' <-> 'pg'
(1 row)
select websearch_to_tsquery('english', 'pg_class pg"');
@@ -2695,6 +2695,12 @@ select websearch_to_tsquery('english', '"pg_class pg"');
'pg' <-> 'class' <-> 'pg'
(1 row)
+select websearch_to_tsquery('english', '"pg_class : pg"');
+ websearch_to_tsquery
+---------------------------
+ 'pg' <-> 'class' <-> 'pg'
+(1 row)
+
select websearch_to_tsquery('english', 'abc "pg_class pg"');
websearch_to_tsquery
-----------------------------------
@@ -2708,15 +2714,15 @@ select websearch_to_tsquery('english', '"pg_class pg" def');
(1 row)
select websearch_to_tsquery('english', 'abc "pg pg_class pg" def');
- websearch_to_tsquery
---------------------------------------------------------
- 'abc' & 'pg' <-> ( 'pg' <-> 'class' ) <-> 'pg' & 'def'
+ websearch_to_tsquery
+----------------------------------------------------
+ 'abc' & 'pg' <-> 'pg' <-> 'class' <-> 'pg' & 'def'
(1 row)
select websearch_to_tsquery('english', ' or "pg pg_class pg" or ');
- websearch_to_tsquery
-----------------------------------------
- 'pg' <-> ( 'pg' <-> 'class' ) <-> 'pg'
+ websearch_to_tsquery
+------------------------------------
+ 'pg' <-> 'pg' <-> 'class' <-> 'pg'
(1 row)
select websearch_to_tsquery('english', '""pg pg_class pg""');
diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql
index b02ed73f6a8..d929210998a 100644
--- a/src/test/regress/sql/tsearch.sql
+++ b/src/test/regress/sql/tsearch.sql
@@ -759,6 +759,7 @@ select websearch_to_tsquery('simple', 'abc OR_abc');
select websearch_to_tsquery('english', '"pg_class pg');
select websearch_to_tsquery('english', 'pg_class pg"');
select websearch_to_tsquery('english', '"pg_class pg"');
+select websearch_to_tsquery('english', '"pg_class : pg"');
select websearch_to_tsquery('english', 'abc "pg_class pg"');
select websearch_to_tsquery('english', '"pg_class pg" def');
select websearch_to_tsquery('english', 'abc "pg pg_class pg" def');