Fix overenthusiastic optimization of 'x IN (SELECT DISTINCT ...)' and related

cases: we can't just consider whether the subquery's output is unique on its own terms, we have to check whether the set of output columns we are going to use will be unique. Per complaint from Luca Pireddu and test case from Michael Fuhr.
author: Tom Lane <tgl@sss.pgh.pa.us> 2005-07-15 17:09:50 +0000
committer: Tom Lane <tgl@sss.pgh.pa.us> 2005-07-15 17:09:50 +0000
commit: 1e31942a3317b1a31c90c5e6ab6ef3b8760f1063 (patch)
tree: 52ca41a1d86b07e5b8480528cc7a390cad2a0719 /src/backend/optimizer/util/pathnode.c
parent: 35a0fc32f537bf6dbbdf53a8523ead01704ec8b4 (diff)
download: postgresql-1e31942a3317b1a31c90c5e6ab6ef3b8760f1063.tar.gz
postgresql-1e31942a3317b1a31c90c5e6ab6ef3b8760f1063.zip
1 files changed, 129 insertions, 45 deletions
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 64c802805a7..2f10fbe1939 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.111 2004/12/31 22:00:23 pgsql Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.111.4.1 2005/07/15 17:09:49 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -34,7 +34,8 @@
 #include "utils/syscache.h"
 
 
-static bool is_distinct_query(Query *query);
+static List *translate_sub_tlist(List *tlist, int relid);
+static bool query_is_distinct_for(Query *query, List *colnos);
 static bool hash_safe_tlist(List *tlist);
 
 
@@ -642,14 +643,41 @@ create_unique_path(Query *root, RelOptInfo *rel, Path *subpath)
 	pathnode->subpath = subpath;
 
 	/*
-	 * If the input is a subquery whose output must be unique already, we
-	 * don't need to do anything.
+	 * Try to identify the targetlist that will actually be unique-ified.
+	 * In current usage, this routine is only used for sub-selects of IN
+	 * clauses, so we should be able to find the tlist in in_info_list.
+	 */
+	sub_targetlist = NIL;
+	foreach(l, root->in_info_list)
+	{
+		InClauseInfo *ininfo = (InClauseInfo *) lfirst(l);
+
+		if (bms_equal(ininfo->righthand, rel->relids))
+		{
+			sub_targetlist = ininfo->sub_targetlist;
+			break;
+		}
+	}
+
+	/*
+	 * If the input is a subquery whose output must be unique already,
+	 * then we don't need to do anything.  The test for uniqueness has
+	 * to consider exactly which columns we are extracting; for example
+	 * "SELECT DISTINCT x,y" doesn't guarantee that x alone is distinct.
+	 * So we cannot check for this optimization unless we found our own
+	 * targetlist above, and it consists only of simple Vars referencing
+	 * subquery outputs.  (Possibly we could do something with expressions
+	 * in the subquery outputs, too, but for now keep it simple.)
 	 */
-	if (rel->rtekind == RTE_SUBQUERY)
+	if (sub_targetlist && rel->rtekind == RTE_SUBQUERY)
 	{
 		RangeTblEntry *rte = rt_fetch(rel->relid, root->rtable);
+		List   *sub_tlist_colnos;
 
-		if (is_distinct_query(rte->subquery))
+		sub_tlist_colnos = translate_sub_tlist(sub_targetlist, rel->relid);
+
+		if (sub_tlist_colnos &&
+			query_is_distinct_for(rte->subquery, sub_tlist_colnos))
 		{
 			pathnode->umethod = UNIQUE_PATH_NOOP;
 			pathnode->rows = rel->rows;
@@ -664,23 +692,6 @@ create_unique_path(Query *root, RelOptInfo *rel, Path *subpath)
 	}
 
 	/*
-	 * Try to identify the targetlist that will actually be unique-ified.
-	 * In current usage, this routine is only used for sub-selects of IN
-	 * clauses, so we should be able to find the tlist in in_info_list.
-	 */
-	sub_targetlist = NIL;
-	foreach(l, root->in_info_list)
-	{
-		InClauseInfo *ininfo = (InClauseInfo *) lfirst(l);
-
-		if (bms_equal(ininfo->righthand, rel->relids))
-		{
-			sub_targetlist = ininfo->sub_targetlist;
-			break;
-		}
-	}
-
-	/*
 	 * If we know the targetlist, try to estimate number of result rows;
 	 * otherwise punt.
 	 */
@@ -755,50 +766,123 @@ create_unique_path(Query *root, RelOptInfo *rel, Path *subpath)
 }
 
 /*
- * is_distinct_query - does query never return duplicate rows?
+ * translate_sub_tlist - get subquery column numbers represented by tlist
+ *
+ * The given targetlist should contain only Vars referencing the given relid.
+ * Extract their varattnos (ie, the column numbers of the subquery) and return
+ * as an integer List.
+ *
+ * If any of the tlist items is not a simple Var, we cannot determine whether
+ * the subquery's uniqueness condition (if any) matches ours, so punt and
+ * return NIL.
  */
-static bool
-is_distinct_query(Query *query)
+static List *
+translate_sub_tlist(List *tlist, int relid)
 {
-	/* DISTINCT (but not DISTINCT ON) guarantees uniqueness */
-	if (has_distinct_clause(query))
-		return true;
+	List	   *result = NIL;
+	ListCell   *l;
 
-	/* UNION, INTERSECT, EXCEPT guarantee uniqueness, except with ALL */
-	if (query->setOperations)
+	foreach(l, tlist)
 	{
-		SetOperationStmt *topop = (SetOperationStmt *) query->setOperations;
+		Var	   *var = (Var *) lfirst(l);
 
-		Assert(IsA(topop, SetOperationStmt));
-		Assert(topop->op != SETOP_NONE);
+		if (!var || !IsA(var, Var) ||
+			var->varno != relid)
+			return NIL;			/* punt */
 
-		if (!topop->all)
+		result = lappend_int(result, var->varattno);
+	}
+	return result;
+}
+
+/*
+ * query_is_distinct_for - does query never return duplicates of the
+ *		specified columns?
+ *
+ * colnos is an integer list of output column numbers (resno's).  We are
+ * interested in whether rows consisting of just these columns are certain
+ * to be distinct.
+ */
+static bool
+query_is_distinct_for(Query *query, List *colnos)
+{
+	ListCell   *l;
+
+	/*
+	 * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the
+	 * columns in the DISTINCT clause appear in colnos.
+	 */
+	if (query->distinctClause)
+	{
+		foreach(l, query->distinctClause)
+		{
+			SortClause *scl = (SortClause *) lfirst(l);
+			TargetEntry *tle = get_sortgroupclause_tle(scl,
+													   query->targetList);
+
+			if (!list_member_int(colnos, tle->resdom->resno))
+				break;			/* exit early if no match */
+		}
+		if (l == NULL)			/* had matches for all? */
 			return true;
 	}
 
 	/*
-	 * GROUP BY guarantees uniqueness if all the grouped columns appear in
-	 * the output.	In our implementation this means checking they are non
-	 * resjunk columns.
+	 * Similarly, GROUP BY guarantees uniqueness if all the grouped columns
+	 * appear in colnos.
 	 */
 	if (query->groupClause)
 	{
-		ListCell   *gl;
-
-		foreach(gl, query->groupClause)
+		foreach(l, query->groupClause)
 		{
-			GroupClause *grpcl = (GroupClause *) lfirst(gl);
+			GroupClause *grpcl = (GroupClause *) lfirst(l);
 			TargetEntry *tle = get_sortgroupclause_tle(grpcl,
 													   query->targetList);
 
-			if (tle->resdom->resjunk)
-				break;
+			if (!list_member_int(colnos, tle->resdom->resno))
+				break;			/* exit early if no match */
 		}
-		if (!gl)				/* got to the end? */
+		if (l == NULL)			/* had matches for all? */
+			return true;
+	}
+	else
+	{
+		/*
+		 * If we have no GROUP BY, but do have aggregates or HAVING, then
+		 * the result is at most one row so it's surely unique.
+		 */
+		if (query->hasAggs || query->havingQual)
 			return true;
 	}
 
 	/*
+	 * UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output row,
+	 * except with ALL
+	 */
+	if (query->setOperations)
+	{
+		SetOperationStmt *topop = (SetOperationStmt *) query->setOperations;
+
+		Assert(IsA(topop, SetOperationStmt));
+		Assert(topop->op != SETOP_NONE);
+
+		if (!topop->all)
+		{
+			/* We're good if all the nonjunk output columns are in colnos */
+			foreach(l, query->targetList)
+			{
+				TargetEntry *tle = (TargetEntry *) lfirst(l);
+
+				if (!tle->resdom->resjunk &&
+					!list_member_int(colnos, tle->resdom->resno))
+					break;		/* exit early if no match */
+			}
+			if (l == NULL)		/* had matches for all? */
+				return true;
+		}
+	}
+
+	/*
 	 * XXX Are there any other cases in which we can easily see the result
 	 * must be distinct?
 	 */
author	Tom Lane <tgl@sss.pgh.pa.us>	2005-07-15 17:09:50 +0000
committer	Tom Lane <tgl@sss.pgh.pa.us>	2005-07-15 17:09:50 +0000
commit	1e31942a3317b1a31c90c5e6ab6ef3b8760f1063 (patch)
tree	52ca41a1d86b07e5b8480528cc7a390cad2a0719 /src/backend/optimizer/util/pathnode.c
parent	35a0fc32f537bf6dbbdf53a8523ead01704ec8b4 (diff)
download	postgresql-1e31942a3317b1a31c90c5e6ab6ef3b8760f1063.tar.gz postgresql-1e31942a3317b1a31c90c5e6ab6ef3b8760f1063.zip