diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/optimizer/plan/analyzejoins.c | 295 | ||||
-rw-r--r-- | src/backend/optimizer/util/pathnode.c | 192 | ||||
-rw-r--r-- | src/include/optimizer/planmain.h | 2 | ||||
-rw-r--r-- | src/test/regress/expected/join.out | 79 | ||||
-rw-r--r-- | src/test/regress/sql/join.sql | 35 |
5 files changed, 417 insertions, 186 deletions
diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c index 129fc3dfae6..773f8a458e2 100644 --- a/src/backend/optimizer/plan/analyzejoins.c +++ b/src/backend/optimizer/plan/analyzejoins.c @@ -22,17 +22,21 @@ */ #include "postgres.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/clauses.h" #include "optimizer/joininfo.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" #include "optimizer/planmain.h" -#include "optimizer/var.h" +#include "optimizer/tlist.h" +#include "utils/lsyscache.h" /* local functions */ static bool join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo); static void remove_rel_from_query(PlannerInfo *root, int relid, Relids joinrelids); static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved); +static Oid distinct_col_search(int colno, List *colnos, List *opids); /* @@ -147,18 +151,15 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) { int innerrelid; RelOptInfo *innerrel; + Query *subquery = NULL; Relids joinrelids; List *clause_list = NIL; ListCell *l; int attroff; /* - * Currently, we only know how to remove left joins to a baserel with - * unique indexes. We can check most of these criteria pretty trivially - * to avoid doing useless extra work. But checking whether any of the - * indexes are unique would require iterating over the indexlist, so for - * now we just make sure there are indexes of some sort or other. If none - * of them are unique, join removal will still fail, just slightly later. + * Must be a non-delaying left join to a single baserel, else we aren't + * going to be able to do anything with it. */ if (sjinfo->jointype != JOIN_LEFT || sjinfo->delay_upper_joins || @@ -168,11 +169,39 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) innerrelid = bms_singleton_member(sjinfo->min_righthand); innerrel = find_base_rel(root, innerrelid); - if (innerrel->reloptkind != RELOPT_BASEREL || - innerrel->rtekind != RTE_RELATION || - innerrel->indexlist == NIL) + if (innerrel->reloptkind != RELOPT_BASEREL) return false; + /* + * Before we go to the effort of checking whether any innerrel variables + * are needed above the join, make a quick check to eliminate cases in + * which we will surely be unable to prove uniqueness of the innerrel. + */ + if (innerrel->rtekind == RTE_RELATION) + { + /* + * For a plain-relation innerrel, we only know how to prove uniqueness + * by reference to unique indexes. If there are no indexes then + * there's certainly no unique indexes so there's no point in going + * further. + */ + if (innerrel->indexlist == NIL) + return false; + } + else if (innerrel->rtekind == RTE_SUBQUERY) + { + subquery = root->simple_rte_array[innerrelid]->subquery; + + /* + * If the subquery has no qualities that support distinctness proofs + * then there's no point in going further. + */ + if (!query_supports_distinctness(subquery)) + return false; + } + else + return false; /* unsupported rtekind */ + /* Compute the relid set for the join we are considering */ joinrelids = bms_union(sjinfo->min_lefthand, sjinfo->min_righthand); @@ -272,12 +301,64 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) /* * relation_has_unique_index_for automatically adds any usable restriction - * clauses for the innerrel, so we needn't do that here. + * clauses for the innerrel, so we needn't do that here. (XXX we are not + * considering restriction clauses for subqueries; is that worth doing?) */ - /* Now examine the indexes to see if we have a matching unique index */ - if (relation_has_unique_index_for(root, innerrel, clause_list, NIL, NIL)) - return true; + if (innerrel->rtekind == RTE_RELATION) + { + /* Now examine the indexes to see if we have a matching unique index */ + if (relation_has_unique_index_for(root, innerrel, clause_list, NIL, NIL)) + return true; + } + else /* innerrel->rtekind == RTE_SUBQUERY */ + { + List *colnos = NIL; + List *opids = NIL; + + /* + * Build the argument lists for query_is_distinct_for: a list of + * output column numbers that the query needs to be distinct over, and + * a list of equality operators that the output columns need to be + * distinct according to. + */ + foreach(l, clause_list) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); + Oid op; + Var *var; + + /* + * Get the equality operator we need uniqueness according to. + * (This might be a cross-type operator and thus not exactly the + * same operator the subquery would consider; that's all right + * since query_is_distinct_for can resolve such cases.) The + * mergejoinability test above should have selected only OpExprs. + */ + Assert(IsA(rinfo->clause, OpExpr)); + op = ((OpExpr *) rinfo->clause)->opno; + + /* clause_sides_match_join identified the inner side for us */ + if (rinfo->outer_is_left) + var = (Var *) get_rightop(rinfo->clause); + else + var = (Var *) get_leftop(rinfo->clause); + + /* + * If inner side isn't a Var referencing a subquery output column, + * this clause doesn't help us. + */ + if (!var || !IsA(var, Var) || + var->varno != innerrelid || var->varlevelsup != 0) + continue; + + colnos = lappend_int(colnos, var->varattno); + opids = lappend_oid(opids, op); + } + + if (query_is_distinct_for(subquery, colnos, opids)) + return true; + } /* * Some day it would be nice to check for other methods of establishing @@ -481,3 +562,189 @@ remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved) return result; } + + +/* + * query_supports_distinctness - could the query possibly be proven distinct + * on some set of output columns? + * + * This is effectively a pre-checking function for query_is_distinct_for(). + * It must return TRUE if query_is_distinct_for() could possibly return TRUE + * with this query, but it should not expend a lot of cycles. The idea is + * that callers can avoid doing possibly-expensive processing to compute + * query_is_distinct_for()'s argument lists if the call could not possibly + * succeed. + */ +bool +query_supports_distinctness(Query *query) +{ + if (query->distinctClause != NIL || + query->groupClause != NIL || + query->hasAggs || + query->havingQual || + query->setOperations) + return true; + + return false; +} + +/* + * query_is_distinct_for - does query never return duplicates of the + * specified columns? + * + * query is a not-yet-planned subquery (in current usage, it's always from + * a subquery RTE, which the planner avoids scribbling on). + * + * colnos is an integer list of output column numbers (resno's). We are + * interested in whether rows consisting of just these columns are certain + * to be distinct. "Distinctness" is defined according to whether the + * corresponding upper-level equality operators listed in opids would think + * the values are distinct. (Note: the opids entries could be cross-type + * operators, and thus not exactly the equality operators that the subquery + * would use itself. We use equality_ops_are_compatible() to check + * compatibility. That looks at btree or hash opfamily membership, and so + * should give trustworthy answers for all operators that we might need + * to deal with here.) + */ +bool +query_is_distinct_for(Query *query, List *colnos, List *opids) +{ + ListCell *l; + Oid opid; + + Assert(list_length(colnos) == list_length(opids)); + + /* + * A set-returning function in the query's targetlist can result in + * returning duplicate rows, if the SRF is evaluated after the + * de-duplication step; so we play it safe and say "no" if there are any + * SRFs. (We could be certain that it's okay if SRFs appear only in the + * specified columns, since those must be evaluated before de-duplication; + * but it doesn't presently seem worth the complication to check that.) + */ + if (expression_returns_set((Node *) query->targetList)) + return false; + + /* + * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the + * columns in the DISTINCT clause appear in colnos and operator semantics + * match. + */ + if (query->distinctClause) + { + foreach(l, query->distinctClause) + { + SortGroupClause *sgc = (SortGroupClause *) lfirst(l); + TargetEntry *tle = get_sortgroupclause_tle(sgc, + query->targetList); + + opid = distinct_col_search(tle->resno, colnos, opids); + if (!OidIsValid(opid) || + !equality_ops_are_compatible(opid, sgc->eqop)) + break; /* exit early if no match */ + } + if (l == NULL) /* had matches for all? */ + return true; + } + + /* + * Similarly, GROUP BY guarantees uniqueness if all the grouped columns + * appear in colnos and operator semantics match. + */ + if (query->groupClause) + { + foreach(l, query->groupClause) + { + SortGroupClause *sgc = (SortGroupClause *) lfirst(l); + TargetEntry *tle = get_sortgroupclause_tle(sgc, + query->targetList); + + opid = distinct_col_search(tle->resno, colnos, opids); + if (!OidIsValid(opid) || + !equality_ops_are_compatible(opid, sgc->eqop)) + break; /* exit early if no match */ + } + if (l == NULL) /* had matches for all? */ + return true; + } + else + { + /* + * If we have no GROUP BY, but do have aggregates or HAVING, then the + * result is at most one row so it's surely unique, for any operators. + */ + if (query->hasAggs || query->havingQual) + return true; + } + + /* + * UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output row, + * except with ALL. + */ + if (query->setOperations) + { + SetOperationStmt *topop = (SetOperationStmt *) query->setOperations; + + Assert(IsA(topop, SetOperationStmt)); + Assert(topop->op != SETOP_NONE); + + if (!topop->all) + { + ListCell *lg; + + /* We're good if all the nonjunk output columns are in colnos */ + lg = list_head(topop->groupClauses); + foreach(l, query->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + SortGroupClause *sgc; + + if (tle->resjunk) + continue; /* ignore resjunk columns */ + + /* non-resjunk columns should have grouping clauses */ + Assert(lg != NULL); + sgc = (SortGroupClause *) lfirst(lg); + lg = lnext(lg); + + opid = distinct_col_search(tle->resno, colnos, opids); + if (!OidIsValid(opid) || + !equality_ops_are_compatible(opid, sgc->eqop)) + break; /* exit early if no match */ + } + if (l == NULL) /* had matches for all? */ + return true; + } + } + + /* + * XXX Are there any other cases in which we can easily see the result + * must be distinct? + * + * If you do add more smarts to this function, be sure to update + * query_supports_distinctness() to match. + */ + + return false; +} + +/* + * distinct_col_search - subroutine for query_is_distinct_for + * + * If colno is in colnos, return the corresponding element of opids, + * else return InvalidOid. (Ordinarily colnos would not contain duplicates, + * but if it does, we arbitrarily select the first match.) + */ +static Oid +distinct_col_search(int colno, List *colnos, List *opids) +{ + ListCell *lc1, + *lc2; + + forboth(lc1, colnos, lc2, opids) + { + if (colno == lfirst_int(lc1)) + return lfirst_oid(lc2); + } + return InvalidOid; +} diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index d129f8d65ea..319e8b2c379 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -22,8 +22,9 @@ #include "optimizer/cost.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" +#include "optimizer/planmain.h" #include "optimizer/restrictinfo.h" -#include "optimizer/tlist.h" +#include "optimizer/var.h" #include "parser/parsetree.h" #include "utils/lsyscache.h" #include "utils/selfuncs.h" @@ -38,8 +39,6 @@ typedef enum } PathCostComparison; static List *translate_sub_tlist(List *tlist, int relid); -static bool query_is_distinct_for(Query *query, List *colnos, List *opids); -static Oid distinct_col_search(int colno, List *colnos, List *opids); /***************************************************************************** @@ -1312,25 +1311,29 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, if (rel->rtekind == RTE_SUBQUERY) { RangeTblEntry *rte = planner_rt_fetch(rel->relid, root); - List *sub_tlist_colnos; - sub_tlist_colnos = translate_sub_tlist(uniq_exprs, rel->relid); - - if (sub_tlist_colnos && - query_is_distinct_for(rte->subquery, - sub_tlist_colnos, in_operators)) + if (query_supports_distinctness(rte->subquery)) { - pathnode->umethod = UNIQUE_PATH_NOOP; - pathnode->path.rows = rel->rows; - pathnode->path.startup_cost = subpath->startup_cost; - pathnode->path.total_cost = subpath->total_cost; - pathnode->path.pathkeys = subpath->pathkeys; + List *sub_tlist_colnos; + + sub_tlist_colnos = translate_sub_tlist(uniq_exprs, rel->relid); + + if (sub_tlist_colnos && + query_is_distinct_for(rte->subquery, + sub_tlist_colnos, in_operators)) + { + pathnode->umethod = UNIQUE_PATH_NOOP; + pathnode->path.rows = rel->rows; + pathnode->path.startup_cost = subpath->startup_cost; + pathnode->path.total_cost = subpath->total_cost; + pathnode->path.pathkeys = subpath->pathkeys; - rel->cheapest_unique_path = (Path *) pathnode; + rel->cheapest_unique_path = (Path *) pathnode; - MemoryContextSwitchTo(oldcontext); + MemoryContextSwitchTo(oldcontext); - return pathnode; + return pathnode; + } } } @@ -1451,161 +1454,6 @@ translate_sub_tlist(List *tlist, int relid) } /* - * query_is_distinct_for - does query never return duplicates of the - * specified columns? - * - * colnos is an integer list of output column numbers (resno's). We are - * interested in whether rows consisting of just these columns are certain - * to be distinct. "Distinctness" is defined according to whether the - * corresponding upper-level equality operators listed in opids would think - * the values are distinct. (Note: the opids entries could be cross-type - * operators, and thus not exactly the equality operators that the subquery - * would use itself. We use equality_ops_are_compatible() to check - * compatibility. That looks at btree or hash opfamily membership, and so - * should give trustworthy answers for all operators that we might need - * to deal with here.) - */ -static bool -query_is_distinct_for(Query *query, List *colnos, List *opids) -{ - ListCell *l; - Oid opid; - - Assert(list_length(colnos) == list_length(opids)); - - /* - * A set-returning function in the query's targetlist can result in - * returning duplicate rows, if the SRF is evaluated after the - * de-duplication step; so we play it safe and say "no" if there are any - * SRFs. (We could be certain that it's okay if SRFs appear only in the - * specified columns, since those must be evaluated before de-duplication; - * but it doesn't presently seem worth the complication to check that.) - */ - if (expression_returns_set((Node *) query->targetList)) - return false; - - /* - * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the - * columns in the DISTINCT clause appear in colnos and operator semantics - * match. - */ - if (query->distinctClause) - { - foreach(l, query->distinctClause) - { - SortGroupClause *sgc = (SortGroupClause *) lfirst(l); - TargetEntry *tle = get_sortgroupclause_tle(sgc, - query->targetList); - - opid = distinct_col_search(tle->resno, colnos, opids); - if (!OidIsValid(opid) || - !equality_ops_are_compatible(opid, sgc->eqop)) - break; /* exit early if no match */ - } - if (l == NULL) /* had matches for all? */ - return true; - } - - /* - * Similarly, GROUP BY guarantees uniqueness if all the grouped columns - * appear in colnos and operator semantics match. - */ - if (query->groupClause) - { - foreach(l, query->groupClause) - { - SortGroupClause *sgc = (SortGroupClause *) lfirst(l); - TargetEntry *tle = get_sortgroupclause_tle(sgc, - query->targetList); - - opid = distinct_col_search(tle->resno, colnos, opids); - if (!OidIsValid(opid) || - !equality_ops_are_compatible(opid, sgc->eqop)) - break; /* exit early if no match */ - } - if (l == NULL) /* had matches for all? */ - return true; - } - else - { - /* - * If we have no GROUP BY, but do have aggregates or HAVING, then the - * result is at most one row so it's surely unique, for any operators. - */ - if (query->hasAggs || query->havingQual) - return true; - } - - /* - * UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output row, - * except with ALL. - */ - if (query->setOperations) - { - SetOperationStmt *topop = (SetOperationStmt *) query->setOperations; - - Assert(IsA(topop, SetOperationStmt)); - Assert(topop->op != SETOP_NONE); - - if (!topop->all) - { - ListCell *lg; - - /* We're good if all the nonjunk output columns are in colnos */ - lg = list_head(topop->groupClauses); - foreach(l, query->targetList) - { - TargetEntry *tle = (TargetEntry *) lfirst(l); - SortGroupClause *sgc; - - if (tle->resjunk) - continue; /* ignore resjunk columns */ - - /* non-resjunk columns should have grouping clauses */ - Assert(lg != NULL); - sgc = (SortGroupClause *) lfirst(lg); - lg = lnext(lg); - - opid = distinct_col_search(tle->resno, colnos, opids); - if (!OidIsValid(opid) || - !equality_ops_are_compatible(opid, sgc->eqop)) - break; /* exit early if no match */ - } - if (l == NULL) /* had matches for all? */ - return true; - } - } - - /* - * XXX Are there any other cases in which we can easily see the result - * must be distinct? - */ - - return false; -} - -/* - * distinct_col_search - subroutine for query_is_distinct_for - * - * If colno is in colnos, return the corresponding element of opids, - * else return InvalidOid. (We expect colnos does not contain duplicates, - * so the result is well-defined.) - */ -static Oid -distinct_col_search(int colno, List *colnos, List *opids) -{ - ListCell *lc1, - *lc2; - - forboth(lc1, colnos, lc2, opids) - { - if (colno == lfirst_int(lc1)) - return lfirst_oid(lc2); - } - return InvalidOid; -} - -/* * create_subqueryscan_path * Creates a path corresponding to a sequential scan of a subquery, * returning the pathnode. diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 8bdb7dbeb08..450425046a5 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -122,6 +122,8 @@ extern RestrictInfo *build_implied_join_equality(Oid opno, * prototypes for plan/analyzejoins.c */ extern List *remove_useless_joins(PlannerInfo *root, List *joinlist); +extern bool query_supports_distinctness(Query *query); +extern bool query_is_distinct_for(Query *query, List *colnos, List *opids); /* * prototypes for plan/setrefs.c diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index c62a63f6740..1cb1c51a6ee 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -3131,9 +3131,11 @@ begin; CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int); CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int); CREATE TEMP TABLE c (id int PRIMARY KEY); +CREATE TEMP TABLE d (a int, b int); INSERT INTO a VALUES (0, 0), (1, NULL); INSERT INTO b VALUES (0, 0), (1, NULL); INSERT INTO c VALUES (0), (1); +INSERT INTO d VALUES (1,3), (2,2), (3,1); -- all three cases should be optimizable into a simple seqscan explain (costs off) SELECT a.* FROM a LEFT JOIN b ON a.b_id = b.id; QUERY PLAN @@ -3169,6 +3171,83 @@ select id from a where id in ( -> Seq Scan on b (5 rows) +-- check that join removal works for a left join when joining a subquery +-- that is guaranteed to be unique by its GROUP BY clause +explain (costs off) +select d.* from d left join (select * from b group by b.id, b.c_id) s + on d.a = s.id and d.b = s.c_id; + QUERY PLAN +--------------- + Seq Scan on d +(1 row) + +-- similarly, but keying off a DISTINCT clause +explain (costs off) +select d.* from d left join (select distinct * from b) s + on d.a = s.id and d.b = s.c_id; + QUERY PLAN +--------------- + Seq Scan on d +(1 row) + +-- join removal is not possible when the GROUP BY contains a column that is +-- not in the join condition +explain (costs off) +select d.* from d left join (select * from b group by b.id, b.c_id) s + on d.a = s.id; + QUERY PLAN +--------------------------------------------- + Merge Left Join + Merge Cond: (d.a = s.id) + -> Sort + Sort Key: d.a + -> Seq Scan on d + -> Sort + Sort Key: s.id + -> Subquery Scan on s + -> HashAggregate + Group Key: b.id, b.c_id + -> Seq Scan on b +(11 rows) + +-- similarly, but keying off a DISTINCT clause +explain (costs off) +select d.* from d left join (select distinct * from b) s + on d.a = s.id; + QUERY PLAN +--------------------------------------------- + Merge Left Join + Merge Cond: (d.a = s.id) + -> Sort + Sort Key: d.a + -> Seq Scan on d + -> Sort + Sort Key: s.id + -> Subquery Scan on s + -> HashAggregate + Group Key: b.id, b.c_id + -> Seq Scan on b +(11 rows) + +-- check join removal works when uniqueness of the join condition is enforced +-- by a UNION +explain (costs off) +select d.* from d left join (select id from a union select id from b) s + on d.a = s.id; + QUERY PLAN +--------------- + Seq Scan on d +(1 row) + +-- check join removal with a cross-type comparison operator +explain (costs off) +select i8.* from int8_tbl i8 left join (select f1 from int4_tbl group by f1) i4 + on i8.q1 = i4.f1; + QUERY PLAN +------------------------- + Seq Scan on int8_tbl i8 +(1 row) + rollback; create temp table parent (k int primary key, pd int); create temp table child (k int unique, cd int); diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 1031f26b314..fa3e0686261 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -919,9 +919,11 @@ begin; CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int); CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int); CREATE TEMP TABLE c (id int PRIMARY KEY); +CREATE TEMP TABLE d (a int, b int); INSERT INTO a VALUES (0, 0), (1, NULL); INSERT INTO b VALUES (0, 0), (1, NULL); INSERT INTO c VALUES (0), (1); +INSERT INTO d VALUES (1,3), (2,2), (3,1); -- all three cases should be optimizable into a simple seqscan explain (costs off) SELECT a.* FROM a LEFT JOIN b ON a.b_id = b.id; @@ -936,6 +938,39 @@ select id from a where id in ( select b.id from b left join c on b.id = c.id ); +-- check that join removal works for a left join when joining a subquery +-- that is guaranteed to be unique by its GROUP BY clause +explain (costs off) +select d.* from d left join (select * from b group by b.id, b.c_id) s + on d.a = s.id and d.b = s.c_id; + +-- similarly, but keying off a DISTINCT clause +explain (costs off) +select d.* from d left join (select distinct * from b) s + on d.a = s.id and d.b = s.c_id; + +-- join removal is not possible when the GROUP BY contains a column that is +-- not in the join condition +explain (costs off) +select d.* from d left join (select * from b group by b.id, b.c_id) s + on d.a = s.id; + +-- similarly, but keying off a DISTINCT clause +explain (costs off) +select d.* from d left join (select distinct * from b) s + on d.a = s.id; + +-- check join removal works when uniqueness of the join condition is enforced +-- by a UNION +explain (costs off) +select d.* from d left join (select id from a union select id from b) s + on d.a = s.id; + +-- check join removal with a cross-type comparison operator +explain (costs off) +select i8.* from int8_tbl i8 left join (select f1 from int4_tbl group by f1) i4 + on i8.q1 = i4.f1; + rollback; create temp table parent (k int primary key, pd int); |