diff options
-rw-r--r-- | src/backend/optimizer/util/pathnode.c | 174 | ||||
-rw-r--r-- | src/test/regress/expected/subselect.out | 57 | ||||
-rw-r--r-- | src/test/regress/sql/subselect.sql | 33 |
3 files changed, 219 insertions, 45 deletions
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 64c802805a7..2f10fbe1939 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.111 2004/12/31 22:00:23 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.111.4.1 2005/07/15 17:09:49 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -34,7 +34,8 @@ #include "utils/syscache.h" -static bool is_distinct_query(Query *query); +static List *translate_sub_tlist(List *tlist, int relid); +static bool query_is_distinct_for(Query *query, List *colnos); static bool hash_safe_tlist(List *tlist); @@ -642,14 +643,41 @@ create_unique_path(Query *root, RelOptInfo *rel, Path *subpath) pathnode->subpath = subpath; /* - * If the input is a subquery whose output must be unique already, we - * don't need to do anything. + * Try to identify the targetlist that will actually be unique-ified. + * In current usage, this routine is only used for sub-selects of IN + * clauses, so we should be able to find the tlist in in_info_list. + */ + sub_targetlist = NIL; + foreach(l, root->in_info_list) + { + InClauseInfo *ininfo = (InClauseInfo *) lfirst(l); + + if (bms_equal(ininfo->righthand, rel->relids)) + { + sub_targetlist = ininfo->sub_targetlist; + break; + } + } + + /* + * If the input is a subquery whose output must be unique already, + * then we don't need to do anything. The test for uniqueness has + * to consider exactly which columns we are extracting; for example + * "SELECT DISTINCT x,y" doesn't guarantee that x alone is distinct. + * So we cannot check for this optimization unless we found our own + * targetlist above, and it consists only of simple Vars referencing + * subquery outputs. (Possibly we could do something with expressions + * in the subquery outputs, too, but for now keep it simple.) */ - if (rel->rtekind == RTE_SUBQUERY) + if (sub_targetlist && rel->rtekind == RTE_SUBQUERY) { RangeTblEntry *rte = rt_fetch(rel->relid, root->rtable); + List *sub_tlist_colnos; - if (is_distinct_query(rte->subquery)) + sub_tlist_colnos = translate_sub_tlist(sub_targetlist, rel->relid); + + if (sub_tlist_colnos && + query_is_distinct_for(rte->subquery, sub_tlist_colnos)) { pathnode->umethod = UNIQUE_PATH_NOOP; pathnode->rows = rel->rows; @@ -664,23 +692,6 @@ create_unique_path(Query *root, RelOptInfo *rel, Path *subpath) } /* - * Try to identify the targetlist that will actually be unique-ified. - * In current usage, this routine is only used for sub-selects of IN - * clauses, so we should be able to find the tlist in in_info_list. - */ - sub_targetlist = NIL; - foreach(l, root->in_info_list) - { - InClauseInfo *ininfo = (InClauseInfo *) lfirst(l); - - if (bms_equal(ininfo->righthand, rel->relids)) - { - sub_targetlist = ininfo->sub_targetlist; - break; - } - } - - /* * If we know the targetlist, try to estimate number of result rows; * otherwise punt. */ @@ -755,50 +766,123 @@ create_unique_path(Query *root, RelOptInfo *rel, Path *subpath) } /* - * is_distinct_query - does query never return duplicate rows? + * translate_sub_tlist - get subquery column numbers represented by tlist + * + * The given targetlist should contain only Vars referencing the given relid. + * Extract their varattnos (ie, the column numbers of the subquery) and return + * as an integer List. + * + * If any of the tlist items is not a simple Var, we cannot determine whether + * the subquery's uniqueness condition (if any) matches ours, so punt and + * return NIL. */ -static bool -is_distinct_query(Query *query) +static List * +translate_sub_tlist(List *tlist, int relid) { - /* DISTINCT (but not DISTINCT ON) guarantees uniqueness */ - if (has_distinct_clause(query)) - return true; + List *result = NIL; + ListCell *l; - /* UNION, INTERSECT, EXCEPT guarantee uniqueness, except with ALL */ - if (query->setOperations) + foreach(l, tlist) { - SetOperationStmt *topop = (SetOperationStmt *) query->setOperations; + Var *var = (Var *) lfirst(l); - Assert(IsA(topop, SetOperationStmt)); - Assert(topop->op != SETOP_NONE); + if (!var || !IsA(var, Var) || + var->varno != relid) + return NIL; /* punt */ - if (!topop->all) + result = lappend_int(result, var->varattno); + } + return result; +} + +/* + * query_is_distinct_for - does query never return duplicates of the + * specified columns? + * + * colnos is an integer list of output column numbers (resno's). We are + * interested in whether rows consisting of just these columns are certain + * to be distinct. + */ +static bool +query_is_distinct_for(Query *query, List *colnos) +{ + ListCell *l; + + /* + * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the + * columns in the DISTINCT clause appear in colnos. + */ + if (query->distinctClause) + { + foreach(l, query->distinctClause) + { + SortClause *scl = (SortClause *) lfirst(l); + TargetEntry *tle = get_sortgroupclause_tle(scl, + query->targetList); + + if (!list_member_int(colnos, tle->resdom->resno)) + break; /* exit early if no match */ + } + if (l == NULL) /* had matches for all? */ return true; } /* - * GROUP BY guarantees uniqueness if all the grouped columns appear in - * the output. In our implementation this means checking they are non - * resjunk columns. + * Similarly, GROUP BY guarantees uniqueness if all the grouped columns + * appear in colnos. */ if (query->groupClause) { - ListCell *gl; - - foreach(gl, query->groupClause) + foreach(l, query->groupClause) { - GroupClause *grpcl = (GroupClause *) lfirst(gl); + GroupClause *grpcl = (GroupClause *) lfirst(l); TargetEntry *tle = get_sortgroupclause_tle(grpcl, query->targetList); - if (tle->resdom->resjunk) - break; + if (!list_member_int(colnos, tle->resdom->resno)) + break; /* exit early if no match */ } - if (!gl) /* got to the end? */ + if (l == NULL) /* had matches for all? */ + return true; + } + else + { + /* + * If we have no GROUP BY, but do have aggregates or HAVING, then + * the result is at most one row so it's surely unique. + */ + if (query->hasAggs || query->havingQual) return true; } /* + * UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output row, + * except with ALL + */ + if (query->setOperations) + { + SetOperationStmt *topop = (SetOperationStmt *) query->setOperations; + + Assert(IsA(topop, SetOperationStmt)); + Assert(topop->op != SETOP_NONE); + + if (!topop->all) + { + /* We're good if all the nonjunk output columns are in colnos */ + foreach(l, query->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + + if (!tle->resdom->resjunk && + !list_member_int(colnos, tle->resdom->resno)) + break; /* exit early if no match */ + } + if (l == NULL) /* had matches for all? */ + return true; + } + } + + /* * XXX Are there any other cases in which we can easily see the result * must be distinct? */ diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 07e727de482..d99656eac77 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -203,6 +203,63 @@ select count(distinct ss.ten) from (1 row) -- +-- Test cases to check for overenthusiastic optimization of +-- "IN (SELECT DISTINCT ...)" and related cases. Per example from +-- Luca Pireddu and Michael Fuhr. +-- +CREATE TEMP TABLE foo (id integer); +CREATE TEMP TABLE bar (id1 integer, id2 integer); +INSERT INTO foo VALUES (1); +INSERT INTO bar VALUES (1, 1); +INSERT INTO bar VALUES (2, 2); +INSERT INTO bar VALUES (3, 1); +-- These cases require an extra level of distinct-ing above subquery s +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s); + id +---- + 1 +(1 row) + +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s); + id +---- + 1 +(1 row) + +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id1, id2 FROM bar UNION + SELECT id1, id2 FROM bar) AS s); + id +---- + 1 +(1 row) + +-- These cases do not +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s); + id +---- + 1 +(1 row) + +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s); + id +---- + 1 +(1 row) + +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id2 FROM bar UNION + SELECT id2 FROM bar) AS s); + id +---- + 1 +(1 row) + +-- -- Test case to catch problems with multiply nested sub-SELECTs not getting -- recalculated properly. Per bug report from Didier Moens. -- diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 5cba9ca74d0..a07cc337596 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -96,6 +96,39 @@ select count(distinct ss.ten) from where unique1 IN (select distinct hundred from tenk1 b)) ss; -- +-- Test cases to check for overenthusiastic optimization of +-- "IN (SELECT DISTINCT ...)" and related cases. Per example from +-- Luca Pireddu and Michael Fuhr. +-- + +CREATE TEMP TABLE foo (id integer); +CREATE TEMP TABLE bar (id1 integer, id2 integer); + +INSERT INTO foo VALUES (1); + +INSERT INTO bar VALUES (1, 1); +INSERT INTO bar VALUES (2, 2); +INSERT INTO bar VALUES (3, 1); + +-- These cases require an extra level of distinct-ing above subquery s +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s); +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s); +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id1, id2 FROM bar UNION + SELECT id1, id2 FROM bar) AS s); + +-- These cases do not +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s); +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s); +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id2 FROM bar UNION + SELECT id2 FROM bar) AS s); + +-- -- Test case to catch problems with multiply nested sub-SELECTs not getting -- recalculated properly. Per bug report from Didier Moens. -- |