1 files changed, 281 insertions, 14 deletions
diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index 129fc3dfae6..773f8a458e2 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -22,17 +22,21 @@
  */
 #include "postgres.h"
 
+#include "nodes/nodeFuncs.h"
+#include "optimizer/clauses.h"
 #include "optimizer/joininfo.h"
 #include "optimizer/pathnode.h"
 #include "optimizer/paths.h"
 #include "optimizer/planmain.h"
-#include "optimizer/var.h"
+#include "optimizer/tlist.h"
+#include "utils/lsyscache.h"
 
 /* local functions */
 static bool join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo);
 static void remove_rel_from_query(PlannerInfo *root, int relid,
 					  Relids joinrelids);
 static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved);
+static Oid	distinct_col_search(int colno, List *colnos, List *opids);
 
 
 /*
@@ -147,18 +151,15 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo)
 {
 	int			innerrelid;
 	RelOptInfo *innerrel;
+	Query	   *subquery = NULL;
 	Relids		joinrelids;
 	List	   *clause_list = NIL;
 	ListCell   *l;
 	int			attroff;
 
 	/*
-	 * Currently, we only know how to remove left joins to a baserel with
-	 * unique indexes.  We can check most of these criteria pretty trivially
-	 * to avoid doing useless extra work.  But checking whether any of the
-	 * indexes are unique would require iterating over the indexlist, so for
-	 * now we just make sure there are indexes of some sort or other.  If none
-	 * of them are unique, join removal will still fail, just slightly later.
+	 * Must be a non-delaying left join to a single baserel, else we aren't
+	 * going to be able to do anything with it.
 	 */
 	if (sjinfo->jointype != JOIN_LEFT ||
 		sjinfo->delay_upper_joins ||
@@ -168,11 +169,39 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo)
 	innerrelid = bms_singleton_member(sjinfo->min_righthand);
 	innerrel = find_base_rel(root, innerrelid);
 
-	if (innerrel->reloptkind != RELOPT_BASEREL ||
-		innerrel->rtekind != RTE_RELATION ||
-		innerrel->indexlist == NIL)
+	if (innerrel->reloptkind != RELOPT_BASEREL)
 		return false;
 
+	/*
+	 * Before we go to the effort of checking whether any innerrel variables
+	 * are needed above the join, make a quick check to eliminate cases in
+	 * which we will surely be unable to prove uniqueness of the innerrel.
+	 */
+	if (innerrel->rtekind == RTE_RELATION)
+	{
+		/*
+		 * For a plain-relation innerrel, we only know how to prove uniqueness
+		 * by reference to unique indexes.  If there are no indexes then
+		 * there's certainly no unique indexes so there's no point in going
+		 * further.
+		 */
+		if (innerrel->indexlist == NIL)
+			return false;
+	}
+	else if (innerrel->rtekind == RTE_SUBQUERY)
+	{
+		subquery = root->simple_rte_array[innerrelid]->subquery;
+
+		/*
+		 * If the subquery has no qualities that support distinctness proofs
+		 * then there's no point in going further.
+		 */
+		if (!query_supports_distinctness(subquery))
+			return false;
+	}
+	else
+		return false;			/* unsupported rtekind */
+
 	/* Compute the relid set for the join we are considering */
 	joinrelids = bms_union(sjinfo->min_lefthand, sjinfo->min_righthand);
 
@@ -272,12 +301,64 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo)
 
 	/*
 	 * relation_has_unique_index_for automatically adds any usable restriction
-	 * clauses for the innerrel, so we needn't do that here.
+	 * clauses for the innerrel, so we needn't do that here.  (XXX we are not
+	 * considering restriction clauses for subqueries; is that worth doing?)
 	 */
 
-	/* Now examine the indexes to see if we have a matching unique index */
-	if (relation_has_unique_index_for(root, innerrel, clause_list, NIL, NIL))
-		return true;
+	if (innerrel->rtekind == RTE_RELATION)
+	{
+		/* Now examine the indexes to see if we have a matching unique index */
+		if (relation_has_unique_index_for(root, innerrel, clause_list, NIL, NIL))
+			return true;
+	}
+	else	/* innerrel->rtekind == RTE_SUBQUERY */
+	{
+		List	   *colnos = NIL;
+		List	   *opids = NIL;
+
+		/*
+		 * Build the argument lists for query_is_distinct_for: a list of
+		 * output column numbers that the query needs to be distinct over, and
+		 * a list of equality operators that the output columns need to be
+		 * distinct according to.
+		 */
+		foreach(l, clause_list)
+		{
+			RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
+			Oid			op;
+			Var		   *var;
+
+			/*
+			 * Get the equality operator we need uniqueness according to.
+			 * (This might be a cross-type operator and thus not exactly the
+			 * same operator the subquery would consider; that's all right
+			 * since query_is_distinct_for can resolve such cases.)  The
+			 * mergejoinability test above should have selected only OpExprs.
+			 */
+			Assert(IsA(rinfo->clause, OpExpr));
+			op = ((OpExpr *) rinfo->clause)->opno;
+
+			/* clause_sides_match_join identified the inner side for us */
+			if (rinfo->outer_is_left)
+				var = (Var *) get_rightop(rinfo->clause);
+			else
+				var = (Var *) get_leftop(rinfo->clause);
+
+			/*
+			 * If inner side isn't a Var referencing a subquery output column,
+			 * this clause doesn't help us.
+			 */
+			if (!var || !IsA(var, Var) ||
+				var->varno != innerrelid || var->varlevelsup != 0)
+				continue;
+
+			colnos = lappend_int(colnos, var->varattno);
+			opids = lappend_oid(opids, op);
+		}
+
+		if (query_is_distinct_for(subquery, colnos, opids))
+			return true;
+	}
 
 	/*
 	 * Some day it would be nice to check for other methods of establishing
@@ -481,3 +562,189 @@ remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved)
 
 	return result;
 }
+
+
+/*
+ * query_supports_distinctness - could the query possibly be proven distinct
+ *		on some set of output columns?
+ *
+ * This is effectively a pre-checking function for query_is_distinct_for().
+ * It must return TRUE if query_is_distinct_for() could possibly return TRUE
+ * with this query, but it should not expend a lot of cycles.  The idea is
+ * that callers can avoid doing possibly-expensive processing to compute
+ * query_is_distinct_for()'s argument lists if the call could not possibly
+ * succeed.
+ */
+bool
+query_supports_distinctness(Query *query)
+{
+	if (query->distinctClause != NIL ||
+		query->groupClause != NIL ||
+		query->hasAggs ||
+		query->havingQual ||
+		query->setOperations)
+		return true;
+
+	return false;
+}
+
+/*
+ * query_is_distinct_for - does query never return duplicates of the
+ *		specified columns?
+ *
+ * query is a not-yet-planned subquery (in current usage, it's always from
+ * a subquery RTE, which the planner avoids scribbling on).
+ *
+ * colnos is an integer list of output column numbers (resno's).  We are
+ * interested in whether rows consisting of just these columns are certain
+ * to be distinct.  "Distinctness" is defined according to whether the
+ * corresponding upper-level equality operators listed in opids would think
+ * the values are distinct.  (Note: the opids entries could be cross-type
+ * operators, and thus not exactly the equality operators that the subquery
+ * would use itself.  We use equality_ops_are_compatible() to check
+ * compatibility.  That looks at btree or hash opfamily membership, and so
+ * should give trustworthy answers for all operators that we might need
+ * to deal with here.)
+ */
+bool
+query_is_distinct_for(Query *query, List *colnos, List *opids)
+{
+	ListCell   *l;
+	Oid			opid;
+
+	Assert(list_length(colnos) == list_length(opids));
+
+	/*
+	 * A set-returning function in the query's targetlist can result in
+	 * returning duplicate rows, if the SRF is evaluated after the
+	 * de-duplication step; so we play it safe and say "no" if there are any
+	 * SRFs.  (We could be certain that it's okay if SRFs appear only in the
+	 * specified columns, since those must be evaluated before de-duplication;
+	 * but it doesn't presently seem worth the complication to check that.)
+	 */
+	if (expression_returns_set((Node *) query->targetList))
+		return false;
+
+	/*
+	 * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the
+	 * columns in the DISTINCT clause appear in colnos and operator semantics
+	 * match.
+	 */
+	if (query->distinctClause)
+	{
+		foreach(l, query->distinctClause)
+		{
+			SortGroupClause *sgc = (SortGroupClause *) lfirst(l);
+			TargetEntry *tle = get_sortgroupclause_tle(sgc,
+													   query->targetList);
+
+			opid = distinct_col_search(tle->resno, colnos, opids);
+			if (!OidIsValid(opid) ||
+				!equality_ops_are_compatible(opid, sgc->eqop))
+				break;			/* exit early if no match */
+		}
+		if (l == NULL)			/* had matches for all? */
+			return true;
+	}
+
+	/*
+	 * Similarly, GROUP BY guarantees uniqueness if all the grouped columns
+	 * appear in colnos and operator semantics match.
+	 */
+	if (query->groupClause)
+	{
+		foreach(l, query->groupClause)
+		{
+			SortGroupClause *sgc = (SortGroupClause *) lfirst(l);
+			TargetEntry *tle = get_sortgroupclause_tle(sgc,
+													   query->targetList);
+
+			opid = distinct_col_search(tle->resno, colnos, opids);
+			if (!OidIsValid(opid) ||
+				!equality_ops_are_compatible(opid, sgc->eqop))
+				break;			/* exit early if no match */
+		}
+		if (l == NULL)			/* had matches for all? */
+			return true;
+	}
+	else
+	{
+		/*
+		 * If we have no GROUP BY, but do have aggregates or HAVING, then the
+		 * result is at most one row so it's surely unique, for any operators.
+		 */
+		if (query->hasAggs || query->havingQual)
+			return true;
+	}
+
+	/*
+	 * UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output row,
+	 * except with ALL.
+	 */
+	if (query->setOperations)
+	{
+		SetOperationStmt *topop = (SetOperationStmt *) query->setOperations;
+
+		Assert(IsA(topop, SetOperationStmt));
+		Assert(topop->op != SETOP_NONE);
+
+		if (!topop->all)
+		{
+			ListCell   *lg;
+
+			/* We're good if all the nonjunk output columns are in colnos */
+			lg = list_head(topop->groupClauses);
+			foreach(l, query->targetList)
+			{
+				TargetEntry *tle = (TargetEntry *) lfirst(l);
+				SortGroupClause *sgc;
+
+				if (tle->resjunk)
+					continue;	/* ignore resjunk columns */
+
+				/* non-resjunk columns should have grouping clauses */
+				Assert(lg != NULL);
+				sgc = (SortGroupClause *) lfirst(lg);
+				lg = lnext(lg);
+
+				opid = distinct_col_search(tle->resno, colnos, opids);
+				if (!OidIsValid(opid) ||
+					!equality_ops_are_compatible(opid, sgc->eqop))
+					break;		/* exit early if no match */
+			}
+			if (l == NULL)		/* had matches for all? */
+				return true;
+		}
+	}
+
+	/*
+	 * XXX Are there any other cases in which we can easily see the result
+	 * must be distinct?
+	 *
+	 * If you do add more smarts to this function, be sure to update
+	 * query_supports_distinctness() to match.
+	 */
+
+	return false;
+}
+
+/*
+ * distinct_col_search - subroutine for query_is_distinct_for
+ *
+ * If colno is in colnos, return the corresponding element of opids,
+ * else return InvalidOid.  (Ordinarily colnos would not contain duplicates,
+ * but if it does, we arbitrarily select the first match.)
+ */
+static Oid
+distinct_col_search(int colno, List *colnos, List *opids)
+{
+	ListCell   *lc1,
+			   *lc2;
+
+	forboth(lc1, colnos, lc2, opids)
+	{
+		if (colno == lfirst_int(lc1))
+			return lfirst_oid(lc2);
+	}
+	return InvalidOid;
+}