Diffstat (limited to 'src/backend/utils/adt/selfuncs.c')
-rw-r--r--   src/backend/utils/adt/selfuncs.c   48
1 file changed, 35 insertions(+), 13 deletions(-)
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index aa3d9f8f327..50117340231 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.147.2.5 2007/01/03 22:39:56 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.147.2.6 2008/07/07 20:25:40 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1990,21 +1990,25 @@ mergejoinscansel(Query *root, Node *clause,
* case (all possible cross-product terms actually appear as groups) since
* very often the grouped-by Vars are highly correlated. Our current approach
* is as follows:
- * 1. Reduce the given expressions to a list of unique Vars used. For
+ * 1. Expressions yielding boolean are assumed to contribute two groups,
+ * independently of their content, and are ignored in the subsequent
+ * steps. This is mainly because tests like "col IS NULL" break the
+ * heuristic used in step 2 especially badly.
+ * 2. Reduce the given expressions to a list of unique Vars used. For
* example, GROUP BY a, a + b is treated the same as GROUP BY a, b.
* It is clearly correct not to count the same Var more than once.
* It is also reasonable to treat f(x) the same as x: f() cannot
* increase the number of distinct values (unless it is volatile,
* which we consider unlikely for grouping), but it probably won't
* reduce the number of distinct values much either.
- * 2. If the list contains Vars of different relations that are known equal
+ * 3. If the list contains Vars of different relations that are known equal
* due to equijoin clauses, then drop all but one of the Vars from each
* known-equal set, keeping the one with smallest estimated # of values
* (since the extra values of the others can't appear in joined rows).
* Note the reason we only consider Vars of different relations is that
* if we considered ones of the same rel, we'd be double-counting the
* restriction selectivity of the equality in the next step.
- * 3. For Vars within a single source rel, we multiply together the numbers
+ * 4. For Vars within a single source rel, we multiply together the numbers
* of values, clamp to the number of rows in the rel, and then multiply
* by the selectivity of the restriction clauses for that rel. The
* initial product is probably too high (it's the worst case) but since
@@ -2012,10 +2016,10 @@ mergejoinscansel(Query *root, Node *clause,
* by the restriction selectivity is effectively assuming that the
* restriction clauses are independent of the grouping, which is a crummy
* assumption, but it's hard to do better.
- * 4. If there are Vars from multiple rels, we repeat step 3 for each such
+ * 5. If there are Vars from multiple rels, we repeat step 4 for each such
* rel, and multiply the results together.
* Note that rels not containing grouped Vars are ignored completely, as are
- * join clauses other than the equijoin clauses used in step 2. Such rels
+ * join clauses other than the equijoin clauses used in step 3. Such rels
* cannot increase the number of groups, and we assume such clauses do not
* reduce the number either (somewhat bogus, but we don't have the info to
* do better).
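
[Editorial aside, not part of the patch: a minimal standalone-C sketch of the heuristic described in the comment block above. The per-expression and per-relation inputs (relid, ndistinct, rel_rows, restrict_sel) are hypothetical stand-ins for what the real code derives from the planner's statistics and restriction-clause selectivity estimates; steps 2 and 3 (Var deduplication and equijoin reduction) are assumed to have been performed already by the caller.]

    #include <stdio.h>
    #include <stdbool.h>

    /* Hypothetical per-grouping-expression input */
    typedef struct
    {
        int     relid;      /* source rel of the underlying Var */
        double  ndistinct;  /* estimated # of distinct values (unused if isbool) */
        bool    isbool;     /* expression yields boolean (step 1) */
    } GroupExprStat;

    /* Hypothetical per-relation input */
    typedef struct
    {
        int     relid;
        double  rel_rows;      /* raw row count of the rel */
        double  restrict_sel;  /* selectivity of the rel's restriction clauses */
    } RelStat;

    static double
    estimate_groups_sketch(const GroupExprStat *exprs, int nexprs,
                           const RelStat *rels, int nrels, double input_rows)
    {
        double      numdistinct = 1.0;

        /* Step 1: each boolean expression contributes a flat factor of 2 */
        for (int i = 0; i < nexprs; i++)
            if (exprs[i].isbool)
                numdistinct *= 2.0;

        /* Steps 4 and 5: per-rel product, clamp, scale, then combine rels */
        for (int r = 0; r < nrels; r++)
        {
            double      reldistinct = 1.0;
            bool        found = false;

            for (int i = 0; i < nexprs; i++)
            {
                if (!exprs[i].isbool && exprs[i].relid == rels[r].relid)
                {
                    reldistinct *= exprs[i].ndistinct;
                    found = true;
                }
            }
            if (!found)
                continue;           /* rels without grouped Vars are ignored */

            if (reldistinct > rels[r].rel_rows)
                reldistinct = rels[r].rel_rows;     /* clamp to rel size */
            reldistinct *= rels[r].restrict_sel;    /* scale by restrictions */

            numdistinct *= reldistinct;
        }

        /* Guard against out-of-range answers */
        if (numdistinct > input_rows)
            numdistinct = input_rows;
        if (numdistinct < 1.0)
            numdistinct = 1.0;
        return numdistinct;
    }

    int
    main(void)
    {
        /* e.g. GROUP BY a, (b IS NULL): a has 1000 distinct values in a
         * 50000-row rel with restriction selectivity 0.1; 5000 input rows */
        GroupExprStat exprs[] = {{1, 1000.0, false}, {1, 0.0, true}};
        RelStat       rels[] = {{1, 50000.0, 0.1}};

        printf("estimated groups: %g\n",
               estimate_groups_sketch(exprs, 2, rels, 1, 5000.0));
        return 0;
    }
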
@@ -2036,12 +2040,24 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows)
/* We should not be called unless query has GROUP BY (or DISTINCT) */
Assert(groupExprs != NIL);
- /* Step 1: get the unique Vars used */
+ /*
+ * Count groups derived from boolean grouping expressions. For other
+ * expressions, find the unique Vars used.
+ */
+ numdistinct = 1.0;
+
foreach(l, groupExprs)
{
Node *groupexpr = (Node *) lfirst(l);
List *varshere;
+ /* Short-circuit for expressions returning boolean */
+ if (exprType(groupexpr) == BOOLOID)
+ {
+ numdistinct *= 2.0;
+ continue;
+ }
+
varshere = pull_var_clause(groupexpr, false);
/*
@@ -2059,15 +2075,23 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows)
allvars = nconc(allvars, varshere);
}
- /* If now no Vars, we must have an all-constant GROUP BY list. */
+ /*
+ * If now no Vars, we must have an all-constant or all-boolean GROUP BY
+ * list.
+ */
if (allvars == NIL)
- return 1.0;
+ {
+ /* Guard against out-of-range answers */
+ if (numdistinct > input_rows)
+ numdistinct = input_rows;
+ return numdistinct;
+ }
/* Use set_union() to discard duplicates */
allvars = set_union(NIL, allvars);
/*
- * Step 2: acquire statistical estimate of number of distinct values
+ * Acquire statistical estimate of number of distinct values
* of each Var (total in its table, without regard for filtering).
* Also, detect known-equal Vars and discard the ones we don't want.
*/
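
[Editorial aside, not part of the patch: the out-of-range guard in the early-return path of the hunk above matters because the boolean factors are accumulated before anything is known about the input size. A tiny sketch under assumed numbers:]

    #include <stdio.h>

    int
    main(void)
    {
        /* e.g. GROUP BY (a IS NULL), (b > 0) with only 3 input rows */
        double      numdistinct = 2.0 * 2.0;    /* two boolean expressions */
        double      input_rows = 3.0;

        if (numdistinct > input_rows)
            numdistinct = input_rows;   /* cannot have more groups than rows */
        printf("%g\n", numdistinct);    /* prints 3, not 4 */
        return 0;
    }
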
@@ -2132,7 +2156,7 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows)
}
/*
- * Steps 3/4: group Vars by relation and estimate total numdistinct.
+ * Group Vars by relation and estimate total numdistinct.
*
* For each iteration of the outer loop, we process the frontmost Var in
* varinfos, plus all other Vars in the same relation. We remove
@@ -2140,8 +2164,6 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows)
* is the easiest way to group Vars of same rel together.
*/
Assert(varinfos != NIL);
- numdistinct = 1.0;
-
do
{
MyVarInfo *varinfo1 = (MyVarInfo *) lfirst(varinfos);