3 files changed, 219 insertions, 45 deletions
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 64c802805a7..2f10fbe1939 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.111 2004/12/31 22:00:23 pgsql Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.111.4.1 2005/07/15 17:09:49 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -34,7 +34,8 @@
 #include "utils/syscache.h"
 
 
-static bool is_distinct_query(Query *query);
+static List *translate_sub_tlist(List *tlist, int relid);
+static bool query_is_distinct_for(Query *query, List *colnos);
 static bool hash_safe_tlist(List *tlist);
 
 
@@ -642,14 +643,41 @@ create_unique_path(Query *root, RelOptInfo *rel, Path *subpath)
 	pathnode->subpath = subpath;
 
 	/*
-	 * If the input is a subquery whose output must be unique already, we
-	 * don't need to do anything.
+	 * Try to identify the targetlist that will actually be unique-ified.
+	 * In current usage, this routine is only used for sub-selects of IN
+	 * clauses, so we should be able to find the tlist in in_info_list.
+	 */
+	sub_targetlist = NIL;
+	foreach(l, root->in_info_list)
+	{
+		InClauseInfo *ininfo = (InClauseInfo *) lfirst(l);
+
+		if (bms_equal(ininfo->righthand, rel->relids))
+		{
+			sub_targetlist = ininfo->sub_targetlist;
+			break;
+		}
+	}
+
+	/*
+	 * If the input is a subquery whose output must be unique already,
+	 * then we don't need to do anything.  The test for uniqueness has
+	 * to consider exactly which columns we are extracting; for example
+	 * "SELECT DISTINCT x,y" doesn't guarantee that x alone is distinct.
+	 * So we cannot check for this optimization unless we found our own
+	 * targetlist above, and it consists only of simple Vars referencing
+	 * subquery outputs.  (Possibly we could do something with expressions
+	 * in the subquery outputs, too, but for now keep it simple.)
 	 */
-	if (rel->rtekind == RTE_SUBQUERY)
+	if (sub_targetlist && rel->rtekind == RTE_SUBQUERY)
 	{
 		RangeTblEntry *rte = rt_fetch(rel->relid, root->rtable);
+		List   *sub_tlist_colnos;
 
-		if (is_distinct_query(rte->subquery))
+		sub_tlist_colnos = translate_sub_tlist(sub_targetlist, rel->relid);
+
+		if (sub_tlist_colnos &&
+			query_is_distinct_for(rte->subquery, sub_tlist_colnos))
 		{
 			pathnode->umethod = UNIQUE_PATH_NOOP;
 			pathnode->rows = rel->rows;
@@ -664,23 +692,6 @@ create_unique_path(Query *root, RelOptInfo *rel, Path *subpath)
 	}
 
 	/*
-	 * Try to identify the targetlist that will actually be unique-ified.
-	 * In current usage, this routine is only used for sub-selects of IN
-	 * clauses, so we should be able to find the tlist in in_info_list.
-	 */
-	sub_targetlist = NIL;
-	foreach(l, root->in_info_list)
-	{
-		InClauseInfo *ininfo = (InClauseInfo *) lfirst(l);
-
-		if (bms_equal(ininfo->righthand, rel->relids))
-		{
-			sub_targetlist = ininfo->sub_targetlist;
-			break;
-		}
-	}
-
-	/*
 	 * If we know the targetlist, try to estimate number of result rows;
 	 * otherwise punt.
 	 */
@@ -755,50 +766,123 @@ create_unique_path(Query *root, RelOptInfo *rel, Path *subpath)
 }
 
 /*
- * is_distinct_query - does query never return duplicate rows?
+ * translate_sub_tlist - get subquery column numbers represented by tlist
+ *
+ * The given targetlist should contain only Vars referencing the given relid.
+ * Extract their varattnos (ie, the column numbers of the subquery) and return
+ * as an integer List.
+ *
+ * If any of the tlist items is not a simple Var, we cannot determine whether
+ * the subquery's uniqueness condition (if any) matches ours, so punt and
+ * return NIL.
  */
-static bool
-is_distinct_query(Query *query)
+static List *
+translate_sub_tlist(List *tlist, int relid)
 {
-	/* DISTINCT (but not DISTINCT ON) guarantees uniqueness */
-	if (has_distinct_clause(query))
-		return true;
+	List	   *result = NIL;
+	ListCell   *l;
 
-	/* UNION, INTERSECT, EXCEPT guarantee uniqueness, except with ALL */
-	if (query->setOperations)
+	foreach(l, tlist)
 	{
-		SetOperationStmt *topop = (SetOperationStmt *) query->setOperations;
+		Var	   *var = (Var *) lfirst(l);
 
-		Assert(IsA(topop, SetOperationStmt));
-		Assert(topop->op != SETOP_NONE);
+		if (!var || !IsA(var, Var) ||
+			var->varno != relid)
+			return NIL;			/* punt */
 
-		if (!topop->all)
+		result = lappend_int(result, var->varattno);
+	}
+	return result;
+}
+
+/*
+ * query_is_distinct_for - does query never return duplicates of the
+ *		specified columns?
+ *
+ * colnos is an integer list of output column numbers (resno's).  We are
+ * interested in whether rows consisting of just these columns are certain
+ * to be distinct.
+ */
+static bool
+query_is_distinct_for(Query *query, List *colnos)
+{
+	ListCell   *l;
+
+	/*
+	 * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the
+	 * columns in the DISTINCT clause appear in colnos.
+	 */
+	if (query->distinctClause)
+	{
+		foreach(l, query->distinctClause)
+		{
+			SortClause *scl = (SortClause *) lfirst(l);
+			TargetEntry *tle = get_sortgroupclause_tle(scl,
+													   query->targetList);
+
+			if (!list_member_int(colnos, tle->resdom->resno))
+				break;			/* exit early if no match */
+		}
+		if (l == NULL)			/* had matches for all? */
 			return true;
 	}
 
 	/*
-	 * GROUP BY guarantees uniqueness if all the grouped columns appear in
-	 * the output.	In our implementation this means checking they are non
-	 * resjunk columns.
+	 * Similarly, GROUP BY guarantees uniqueness if all the grouped columns
+	 * appear in colnos.
 	 */
 	if (query->groupClause)
 	{
-		ListCell   *gl;
-
-		foreach(gl, query->groupClause)
+		foreach(l, query->groupClause)
 		{
-			GroupClause *grpcl = (GroupClause *) lfirst(gl);
+			GroupClause *grpcl = (GroupClause *) lfirst(l);
 			TargetEntry *tle = get_sortgroupclause_tle(grpcl,
 													   query->targetList);
 
-			if (tle->resdom->resjunk)
-				break;
+			if (!list_member_int(colnos, tle->resdom->resno))
+				break;			/* exit early if no match */
 		}
-		if (!gl)				/* got to the end? */
+		if (l == NULL)			/* had matches for all? */
+			return true;
+	}
+	else
+	{
+		/*
+		 * If we have no GROUP BY, but do have aggregates or HAVING, then
+		 * the result is at most one row so it's surely unique.
+		 */
+		if (query->hasAggs || query->havingQual)
 			return true;
 	}
 
 	/*
+	 * UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output row,
+	 * except with ALL
+	 */
+	if (query->setOperations)
+	{
+		SetOperationStmt *topop = (SetOperationStmt *) query->setOperations;
+
+		Assert(IsA(topop, SetOperationStmt));
+		Assert(topop->op != SETOP_NONE);
+
+		if (!topop->all)
+		{
+			/* We're good if all the nonjunk output columns are in colnos */
+			foreach(l, query->targetList)
+			{
+				TargetEntry *tle = (TargetEntry *) lfirst(l);
+
+				if (!tle->resdom->resjunk &&
+					!list_member_int(colnos, tle->resdom->resno))
+					break;		/* exit early if no match */
+			}
+			if (l == NULL)		/* had matches for all? */
+				return true;
+		}
+	}
+
+	/*
 	 * XXX Are there any other cases in which we can easily see the result
 	 * must be distinct?
 	 */
diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out
index 07e727de482..d99656eac77 100644
--- a/src/test/regress/expected/subselect.out
+++ b/src/test/regress/expected/subselect.out
@@ -203,6 +203,63 @@ select count(distinct ss.ten) from
 (1 row)
 
 --
+-- Test cases to check for overenthusiastic optimization of
+-- "IN (SELECT DISTINCT ...)" and related cases.  Per example from
+-- Luca Pireddu and Michael Fuhr.
+--
+CREATE TEMP TABLE foo (id integer);
+CREATE TEMP TABLE bar (id1 integer, id2 integer);
+INSERT INTO foo VALUES (1);
+INSERT INTO bar VALUES (1, 1);
+INSERT INTO bar VALUES (2, 2);
+INSERT INTO bar VALUES (3, 1);
+-- These cases require an extra level of distinct-ing above subquery s
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s);
+ id 
+----
+  1
+(1 row)
+
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s);
+ id 
+----
+  1
+(1 row)
+
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id1, id2 FROM bar UNION
+                      SELECT id1, id2 FROM bar) AS s);
+ id 
+----
+  1
+(1 row)
+
+-- These cases do not
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s);
+ id 
+----
+  1
+(1 row)
+
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s);
+ id 
+----
+  1
+(1 row)
+
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id2 FROM bar UNION
+                      SELECT id2 FROM bar) AS s);
+ id 
+----
+  1
+(1 row)
+
+--
 -- Test case to catch problems with multiply nested sub-SELECTs not getting
 -- recalculated properly.  Per bug report from Didier Moens.
 --
diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql
index 5cba9ca74d0..a07cc337596 100644
--- a/src/test/regress/sql/subselect.sql
+++ b/src/test/regress/sql/subselect.sql
@@ -96,6 +96,39 @@ select count(distinct ss.ten) from
    where unique1 IN (select distinct hundred from tenk1 b)) ss;
 
 --
+-- Test cases to check for overenthusiastic optimization of
+-- "IN (SELECT DISTINCT ...)" and related cases.  Per example from
+-- Luca Pireddu and Michael Fuhr.
+--
+
+CREATE TEMP TABLE foo (id integer);
+CREATE TEMP TABLE bar (id1 integer, id2 integer);
+
+INSERT INTO foo VALUES (1);
+
+INSERT INTO bar VALUES (1, 1);
+INSERT INTO bar VALUES (2, 2);
+INSERT INTO bar VALUES (3, 1);
+
+-- These cases require an extra level of distinct-ing above subquery s
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s);
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s);
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id1, id2 FROM bar UNION
+                      SELECT id1, id2 FROM bar) AS s);
+
+-- These cases do not
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s);
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s);
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id2 FROM bar UNION
+                      SELECT id2 FROM bar) AS s);
+
+--
 -- Test case to catch problems with multiply nested sub-SELECTs not getting
 -- recalculated properly.  Per bug report from Didier Moens.
 --