13 files changed, 396 insertions, 111 deletions
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 66630ae6612..408b9b2a757 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.329 2008/08/02 21:31:59 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.330 2008/08/05 02:43:17 tgl Exp $
  *
  * NOTES
  *	  Every node type that can appear in stored rules' parsetrees *must*
@@ -1334,6 +1334,7 @@ _outPlannerInfo(StringInfo str, PlannerInfo *node)
 	WRITE_NODE_FIELD(append_rel_list);
 	WRITE_NODE_FIELD(query_pathkeys);
 	WRITE_NODE_FIELD(group_pathkeys);
+	WRITE_NODE_FIELD(distinct_pathkeys);
 	WRITE_NODE_FIELD(sort_pathkeys);
 	WRITE_FLOAT_FIELD(total_table_pages, "%.0f");
 	WRITE_FLOAT_FIELD(tuple_fraction, "%.4f");
diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c
index 5e5da6cda4f..081a7c9cebd 100644
--- a/src/backend/optimizer/plan/planmain.c
+++ b/src/backend/optimizer/plan/planmain.c
@@ -14,7 +14,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.108 2008/08/03 19:10:52 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.109 2008/08/05 02:43:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -66,9 +66,9 @@
  * PlannerInfo field and not a passed parameter is that the low-level routines
  * in indxpath.c need to see it.)
  *
- * Note: the PlannerInfo node also includes group_pathkeys and sort_pathkeys,
- * which like query_pathkeys need to be canonicalized once the info is
- * available.
+ * Note: the PlannerInfo node also includes group_pathkeys, distinct_pathkeys,
+ * and sort_pathkeys, which like query_pathkeys need to be canonicalized once
+ * the info is available.
  *
  * tuple_fraction is interpreted as follows:
  *	  0: expect all tuples to be retrieved (normal case)
@@ -120,6 +120,8 @@ query_planner(PlannerInfo *root, List *tlist,
 													 root->query_pathkeys);
 		root->group_pathkeys = canonicalize_pathkeys(root,
 													 root->group_pathkeys);
+		root->distinct_pathkeys = canonicalize_pathkeys(root,
+													root->distinct_pathkeys);
 		root->sort_pathkeys = canonicalize_pathkeys(root,
 													root->sort_pathkeys);
 		return;
@@ -237,10 +239,12 @@ query_planner(PlannerInfo *root, List *tlist,
 	/*
 	 * We have completed merging equivalence sets, so it's now possible to
 	 * convert the requested query_pathkeys to canonical form.	Also
-	 * canonicalize the groupClause and sortClause pathkeys for use later.
+	 * canonicalize the groupClause, distinctClause and sortClause pathkeys
+	 * for use later.
 	 */
 	root->query_pathkeys = canonicalize_pathkeys(root, root->query_pathkeys);
 	root->group_pathkeys = canonicalize_pathkeys(root, root->group_pathkeys);
+	root->distinct_pathkeys = canonicalize_pathkeys(root, root->distinct_pathkeys);
 	root->sort_pathkeys = canonicalize_pathkeys(root, root->sort_pathkeys);
 
 	/*
@@ -286,9 +290,11 @@ query_planner(PlannerInfo *root, List *tlist,
 		/*
 		 * If both GROUP BY and ORDER BY are specified, we will need two
 		 * levels of sort --- and, therefore, certainly need to read all the
-		 * tuples --- unless ORDER BY is a subset of GROUP BY.
+		 * tuples --- unless ORDER BY is a subset of GROUP BY.  Likewise if
+		 * we have both DISTINCT and GROUP BY.
 		 */
-		if (!pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys))
+		if (!pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys) ||
+			!pathkeys_contained_in(root->distinct_pathkeys, root->group_pathkeys))
 			tuple_fraction = 0.0;
 	}
 	else if (parse->hasAggs || root->hasHavingQual)
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 40a9bffaf37..d8c6942e250 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.237 2008/08/03 19:10:52 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.238 2008/08/05 02:43:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -69,12 +69,17 @@ static double preprocess_limit(PlannerInfo *root,
 				 int64 *offset_est, int64 *count_est);
 static void preprocess_groupclause(PlannerInfo *root);
 static Oid *extract_grouping_ops(List *groupClause);
+static AttrNumber *extract_grouping_cols(List *groupClause, List *tlist);
 static bool grouping_is_sortable(List *groupClause);
 static bool grouping_is_hashable(List *groupClause);
 static bool choose_hashed_grouping(PlannerInfo *root,
 					   double tuple_fraction, double limit_tuples,
 					   Path *cheapest_path, Path *sorted_path,
 					   double dNumGroups, AggClauseCounts *agg_counts);
+static bool choose_hashed_distinct(PlannerInfo *root,
+					   Plan *input_plan, List *input_pathkeys,
+					   double tuple_fraction, double limit_tuples,
+					   double dNumDistinctRows);
 static List *make_subplanTargetList(PlannerInfo *root, List *tlist,
 					   AttrNumber **groupColIdx, bool *need_tlist_eval);
 static void locate_grouping_columns(PlannerInfo *root,
@@ -757,7 +762,6 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 	double		limit_tuples = -1.0;
 	Plan	   *result_plan;
 	List	   *current_pathkeys;
-	List	   *sort_pathkeys;
 	double		dNumGroups = 0;
 
 	/* Tweak caller-supplied tuple_fraction if have LIMIT/OFFSET */
@@ -829,16 +833,15 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		 * Calculate pathkeys that represent result ordering requirements
 		 */
 		Assert(parse->distinctClause == NIL);
-		sort_pathkeys = make_pathkeys_for_sortclauses(root,
-													  parse->sortClause,
-													  tlist,
-													  true);
+		root->sort_pathkeys = make_pathkeys_for_sortclauses(root,
+															parse->sortClause,
+															tlist,
+															true);
 	}
 	else
 	{
 		/* No set operations, do regular planning */
 		List	   *sub_tlist;
-		List	   *group_pathkeys;
 		AttrNumber *groupColIdx = NULL;
 		bool		need_tlist_eval = true;
 		QualCost	tlist_cost;
@@ -870,14 +873,12 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		/*
 		 * Calculate pathkeys that represent grouping/ordering requirements.
 		 * Stash them in PlannerInfo so that query_planner can canonicalize
-		 * them after EquivalenceClasses have been formed.
-		 *
-		 * Note: for the moment, DISTINCT is always implemented via sort/uniq,
-		 * and we set the sort_pathkeys to be the more rigorous of the
-		 * DISTINCT and ORDER BY requirements.  This should be changed
-		 * someday, but DISTINCT ON is a bit of a problem ...
+		 * them after EquivalenceClasses have been formed.  The sortClause
+		 * is certainly sort-able, but GROUP BY and DISTINCT might not be,
+		 * in which case we just leave their pathkeys empty.
 		 */
-		if (parse->groupClause && grouping_is_sortable(parse->groupClause))
+		if (parse->groupClause &&
+			grouping_is_sortable(parse->groupClause))
 			root->group_pathkeys =
 				make_pathkeys_for_sortclauses(root,
 											  parse->groupClause,
@@ -886,18 +887,21 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		else
 			root->group_pathkeys = NIL;
 
-		if (list_length(parse->distinctClause) > list_length(parse->sortClause))
-			root->sort_pathkeys =
+		if (parse->distinctClause &&
+			grouping_is_sortable(parse->distinctClause))
+			root->distinct_pathkeys =
 				make_pathkeys_for_sortclauses(root,
 											  parse->distinctClause,
 											  tlist,
 											  false);
 		else
-			root->sort_pathkeys =
-				make_pathkeys_for_sortclauses(root,
-											  parse->sortClause,
-											  tlist,
-											  false);
+			root->distinct_pathkeys = NIL;
+
+		root->sort_pathkeys =
+			make_pathkeys_for_sortclauses(root,
+										  parse->sortClause,
+										  tlist,
+										  false);
 
 		/*
 		 * Will need actual number of aggregates for estimating costs.
@@ -917,17 +921,27 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		}
 
 		/*
-		 * Figure out whether we need a sorted result from query_planner.
+		 * Figure out whether we want a sorted result from query_planner.
 		 *
 		 * If we have a sortable GROUP BY clause, then we want a result sorted
-		 * properly for grouping.  Otherwise, if there is an ORDER BY clause,
-		 * we want to sort by the ORDER BY clause. (Note: if we have both, and
-		 * ORDER BY is a superset of GROUP BY, it would be tempting to request
-		 * sort by ORDER BY --- but that might just leave us failing to
-		 * exploit an available sort order at all. Needs more thought...)
+		 * properly for grouping.  Otherwise, if there's a sortable DISTINCT
+		 * clause that's more rigorous than the ORDER BY clause, we try to
+		 * produce output that's sufficiently well sorted for the DISTINCT.
+		 * Otherwise, if there is an ORDER BY clause, we want to sort by the
+		 * ORDER BY clause.
+		 *
+		 * Note: if we have both ORDER BY and GROUP BY, and ORDER BY is a
+		 * superset of GROUP BY, it would be tempting to request sort by ORDER
+		 * BY --- but that might just leave us failing to exploit an available
+		 * sort order at all.  Needs more thought.  The choice for DISTINCT
+		 * versus ORDER BY is much easier, since we know that the parser
+		 * ensured that one is a superset of the other.
 		 */
 		if (root->group_pathkeys)
 			root->query_pathkeys = root->group_pathkeys;
+		else if (list_length(root->distinct_pathkeys) >
+				 list_length(root->sort_pathkeys))
+			root->query_pathkeys = root->distinct_pathkeys;
 		else if (root->sort_pathkeys)
 			root->query_pathkeys = root->sort_pathkeys;
 		else
@@ -942,9 +956,6 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		query_planner(root, sub_tlist, tuple_fraction, limit_tuples,
 					  &cheapest_path, &sorted_path, &dNumGroups);
 
-		group_pathkeys = root->group_pathkeys;
-		sort_pathkeys = root->sort_pathkeys;
-
 		/*
 		 * If grouping, decide whether to use sorted or hashed grouping.
 		 */
@@ -1024,7 +1035,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 
 			/* Detect if we'll need an explicit sort for grouping */
 			if (parse->groupClause && !use_hashed_grouping &&
-				!pathkeys_contained_in(group_pathkeys, current_pathkeys))
+				!pathkeys_contained_in(root->group_pathkeys, current_pathkeys))
 			{
 				need_sort_for_grouping = true;
 				/*
@@ -1135,7 +1146,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 													 parse->groupClause,
 													 groupColIdx,
 													 result_plan);
-						current_pathkeys = group_pathkeys;
+						current_pathkeys = root->group_pathkeys;
 					}
 					aggstrategy = AGG_SORTED;
 
@@ -1178,7 +1189,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 												 parse->groupClause,
 												 groupColIdx,
 												 result_plan);
-					current_pathkeys = group_pathkeys;
+					current_pathkeys = root->group_pathkeys;
 				}
 
 				result_plan = (Plan *) make_group(root,
@@ -1214,35 +1225,129 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 	}							/* end of if (setOperations) */
 
 	/*
-	 * If we were not able to make the plan come out in the right order, add
-	 * an explicit sort step.
+	 * If there is a DISTINCT clause, add the necessary node(s).
 	 */
-	if (sort_pathkeys)
+	if (parse->distinctClause)
 	{
-		if (!pathkeys_contained_in(sort_pathkeys, current_pathkeys))
+		double	dNumDistinctRows;
+		long	numDistinctRows;
+		bool	use_hashed_distinct;
+		bool	can_sort;
+		bool	can_hash;
+
+		/*
+		 * If there was grouping or aggregation, use the current number of
+		 * rows as the estimated number of DISTINCT rows (ie, assume the
+		 * result was already mostly unique).  If not, use the number of
+		 * distinct-groups calculated by query_planner.
+		 */
+		if (parse->groupClause || root->hasHavingQual || parse->hasAggs)
+			dNumDistinctRows = result_plan->plan_rows;
+		else
+			dNumDistinctRows = dNumGroups;
+
+		/* Also convert to long int --- but 'ware overflow! */
+		numDistinctRows = (long) Min(dNumDistinctRows, (double) LONG_MAX);
+
+		/*
+		 * If we have a sortable DISTINCT ON clause, we always use sorting.
+		 * This enforces the expected behavior of DISTINCT ON.
+		 */
+		can_sort = grouping_is_sortable(parse->distinctClause);
+		if (can_sort && parse->hasDistinctOn)
+			use_hashed_distinct = false;
+		else
 		{
-			result_plan = (Plan *) make_sort_from_pathkeys(root,
-														   result_plan,
-														   sort_pathkeys,
-														   limit_tuples);
-			current_pathkeys = sort_pathkeys;
+			can_hash = grouping_is_hashable(parse->distinctClause);
+			if (can_hash && can_sort)
+			{
+				/* we have a meaningful choice to make ... */
+				use_hashed_distinct =
+					choose_hashed_distinct(root,
+										   result_plan, current_pathkeys,
+										   tuple_fraction, limit_tuples,
+										   dNumDistinctRows);
+			}
+			else if (can_hash)
+				use_hashed_distinct = true;
+			else if (can_sort)
+				use_hashed_distinct = false;
+			else
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+						 errmsg("could not implement DISTINCT"),
+						 errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
+				use_hashed_distinct = false; /* keep compiler quiet */
+			}
+		}
+
+		if (use_hashed_distinct)
+		{
+			/* Hashed aggregate plan --- no sort needed */
+			result_plan = (Plan *) make_agg(root,
+											result_plan->targetlist,
+											NIL,
+											AGG_HASHED,
+											list_length(parse->distinctClause),
+											extract_grouping_cols(parse->distinctClause,
+																  result_plan->targetlist),
+											extract_grouping_ops(parse->distinctClause),
+											numDistinctRows,
+											0,
+											result_plan);
+			/* Hashed aggregation produces randomly-ordered results */
+			current_pathkeys = NIL;
+		}
+		else
+		{
+			/*
+			 * Use a Unique node to implement DISTINCT.  Add an explicit sort
+			 * if we couldn't make the path come out the way the Unique node
+			 * needs it.  If we do have to sort, sort by the more rigorous
+			 * of DISTINCT and ORDER BY, to avoid a second sort below.
+			 */
+			if (!pathkeys_contained_in(root->distinct_pathkeys,
+									   current_pathkeys))
+			{
+				if (list_length(root->distinct_pathkeys) >=
+					list_length(root->sort_pathkeys))
+					current_pathkeys = root->distinct_pathkeys;
+				else
+				{
+					current_pathkeys = root->sort_pathkeys;
+					/* Assert checks that parser didn't mess up... */
+					Assert(pathkeys_contained_in(root->distinct_pathkeys,
+												 current_pathkeys));
+				}
+
+				result_plan = (Plan *) make_sort_from_pathkeys(root,
+															   result_plan,
+															   current_pathkeys,
+															   -1.0);
+			}
+
+			result_plan = (Plan *) make_unique(result_plan,
+											   parse->distinctClause);
+			result_plan->plan_rows = dNumDistinctRows;
+			/* The Unique node won't change sort ordering */
 		}
 	}
 
 	/*
-	 * If there is a DISTINCT clause, add the UNIQUE node.
+	 * If ORDER BY was given and we were not able to make the plan come out in
+	 * the right order, add an explicit sort step.
 	 */
-	if (parse->distinctClause)
+	if (parse->sortClause)
 	{
-		result_plan = (Plan *) make_unique(result_plan, parse->distinctClause);
-
-		/*
-		 * If there was grouping or aggregation, leave plan_rows as-is (ie,
-		 * assume the result was already mostly unique).  If not, use the
-		 * number of distinct-groups calculated by query_planner.
-		 */
-		if (!parse->groupClause && !root->hasHavingQual && !parse->hasAggs)
-			result_plan->plan_rows = dNumGroups;
+		if (!pathkeys_contained_in(root->sort_pathkeys, current_pathkeys))
+		{
+			result_plan = (Plan *) make_sort_from_pathkeys(root,
+														   result_plan,
+														   root->sort_pathkeys,
+														   limit_tuples);
+			current_pathkeys = root->sort_pathkeys;
+		}
 	}
 
 	/*
@@ -1623,6 +1728,31 @@ extract_grouping_ops(List *groupClause)
 }
 
 /*
+ * extract_grouping_cols - make an array of the grouping column resnos
+ *		for a SortGroupClause list
+ */
+static AttrNumber *
+extract_grouping_cols(List *groupClause, List *tlist)
+{
+	AttrNumber *grpColIdx;
+	int			numCols = list_length(groupClause);
+	int			colno = 0;
+	ListCell   *glitem;
+
+	grpColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numCols);
+
+	foreach(glitem, groupClause)
+	{
+		SortGroupClause *groupcl = (SortGroupClause *) lfirst(glitem);
+		TargetEntry *tle = get_sortgroupclause_tle(groupcl, tlist);
+
+		grpColIdx[colno++] = tle->resno;
+	}
+
+	return grpColIdx;
+}
+
+/*
  * grouping_is_sortable - is it possible to implement grouping list by sorting?
  *
  * This is easy since the parser will have included a sortop if one exists.
@@ -1680,6 +1810,7 @@ choose_hashed_grouping(PlannerInfo *root,
 	double		cheapest_path_rows;
 	int			cheapest_path_width;
 	Size		hashentrysize;
+	List	   *target_pathkeys;
 	List	   *current_pathkeys;
 	Path		hashed_p;
 	Path		sorted_p;
@@ -1717,6 +1848,20 @@ choose_hashed_grouping(PlannerInfo *root,
 		return false;
 
 	/*
+	 * When we have both GROUP BY and DISTINCT, use the more-rigorous of
+	 * DISTINCT and ORDER BY as the assumed required output sort order.
+	 * This is an oversimplification because the DISTINCT might get
+	 * implemented via hashing, but it's not clear that the case is common
+	 * enough (or that our estimates are good enough) to justify trying to
+	 * solve it exactly.
+	 */
+	if (list_length(root->distinct_pathkeys) >
+		list_length(root->sort_pathkeys))
+		target_pathkeys = root->distinct_pathkeys;
+	else
+		target_pathkeys = root->sort_pathkeys;
+
+	/*
 	 * See if the estimated cost is no more than doing it the other way. While
 	 * avoiding the need for sorted input is usually a win, the fact that the
 	 * output won't be sorted may be a loss; so we need to do an actual cost
@@ -1737,8 +1882,8 @@ choose_hashed_grouping(PlannerInfo *root,
 			 cheapest_path->startup_cost, cheapest_path->total_cost,
 			 cheapest_path_rows);
 	/* Result of hashed agg is always unsorted */
-	if (root->sort_pathkeys)
-		cost_sort(&hashed_p, root, root->sort_pathkeys, hashed_p.total_cost,
+	if (target_pathkeys)
+		cost_sort(&hashed_p, root, target_pathkeys, hashed_p.total_cost,
 				  dNumGroups, cheapest_path_width, limit_tuples);
 
 	if (sorted_path)
@@ -1770,9 +1915,9 @@ choose_hashed_grouping(PlannerInfo *root,
 				   sorted_p.startup_cost, sorted_p.total_cost,
 				   cheapest_path_rows);
 	/* The Agg or Group node will preserve ordering */
-	if (root->sort_pathkeys &&
-		!pathkeys_contained_in(root->sort_pathkeys, current_pathkeys))
-		cost_sort(&sorted_p, root, root->sort_pathkeys, sorted_p.total_cost,
+	if (target_pathkeys &&
+		!pathkeys_contained_in(target_pathkeys, current_pathkeys))
+		cost_sort(&sorted_p, root, target_pathkeys, sorted_p.total_cost,
 				  dNumGroups, cheapest_path_width, limit_tuples);
 
 	/*
@@ -1791,6 +1936,111 @@ choose_hashed_grouping(PlannerInfo *root,
 	return false;
 }
 
+/*
+ * choose_hashed_distinct - should we use hashing for DISTINCT?
+ *
+ * This is fairly similar to choose_hashed_grouping, but there are enough
+ * differences that it doesn't seem worth trying to unify the two functions.
+ *
+ * But note that making the two choices independently is a bit bogus in
+ * itself.  If the two could be combined into a single choice operation
+ * it'd probably be better, but that seems far too unwieldy to be practical,
+ * especially considering that the combination of GROUP BY and DISTINCT
+ * isn't very common in real queries.  By separating them, we are giving
+ * extra preference to using a sorting implementation when a common sort key
+ * is available ... and that's not necessarily wrong anyway.
+ *
+ * Note: this is only applied when both alternatives are actually feasible.
+ */
+static bool
+choose_hashed_distinct(PlannerInfo *root,
+					   Plan *input_plan, List *input_pathkeys,
+					   double tuple_fraction, double limit_tuples,
+					   double dNumDistinctRows)
+{
+	int			numDistinctCols = list_length(root->parse->distinctClause);
+	Size		hashentrysize;
+	List	   *current_pathkeys;
+	Path		hashed_p;
+	Path		sorted_p;
+
+	/* Prefer sorting when enable_hashagg is off */
+	if (!enable_hashagg)
+		return false;
+
+	/*
+	 * Don't do it if it doesn't look like the hashtable will fit into
+	 * work_mem.
+	 */
+	hashentrysize = MAXALIGN(input_plan->plan_width) + MAXALIGN(sizeof(MinimalTupleData));
+
+	if (hashentrysize * dNumDistinctRows > work_mem * 1024L)
+		return false;
+
+	/*
+	 * See if the estimated cost is no more than doing it the other way. While
+	 * avoiding the need for sorted input is usually a win, the fact that the
+	 * output won't be sorted may be a loss; so we need to do an actual cost
+	 * comparison.
+	 *
+	 * We need to consider input_plan + hashagg [+ final sort] versus
+	 * input_plan [+ sort] + group [+ final sort] where brackets indicate
+	 * a step that may not be needed.
+	 *
+	 * These path variables are dummies that just hold cost fields; we don't
+	 * make actual Paths for these steps.
+	 */
+	cost_agg(&hashed_p, root, AGG_HASHED, 0,
+			 numDistinctCols, dNumDistinctRows,
+			 input_plan->startup_cost, input_plan->total_cost,
+			 input_plan->plan_rows);
+	/*
+	 * Result of hashed agg is always unsorted, so if ORDER BY is present
+	 * we need to charge for the final sort.
+	 */
+	if (root->parse->sortClause)
+		cost_sort(&hashed_p, root, root->sort_pathkeys, hashed_p.total_cost,
+				  dNumDistinctRows, input_plan->plan_width, limit_tuples);
+
+	/* Now for the GROUP case ... */
+	sorted_p.startup_cost = input_plan->startup_cost;
+	sorted_p.total_cost = input_plan->total_cost;
+	current_pathkeys = input_pathkeys;
+	if (!pathkeys_contained_in(root->distinct_pathkeys, current_pathkeys))
+	{
+		/* We don't want to sort twice */
+		if (list_length(root->distinct_pathkeys) >=
+			list_length(root->sort_pathkeys))
+			current_pathkeys = root->distinct_pathkeys;
+		else
+			current_pathkeys = root->sort_pathkeys;
+		cost_sort(&sorted_p, root, current_pathkeys, sorted_p.total_cost,
+				  input_plan->plan_rows, input_plan->plan_width, -1.0);
+	}
+	cost_group(&sorted_p, root, numDistinctCols, dNumDistinctRows,
+			   sorted_p.startup_cost, sorted_p.total_cost,
+			   input_plan->plan_rows);
+	if (root->parse->sortClause &&
+		!pathkeys_contained_in(root->sort_pathkeys, current_pathkeys))
+		cost_sort(&sorted_p, root, root->sort_pathkeys, sorted_p.total_cost,
+				  dNumDistinctRows, input_plan->plan_width, limit_tuples);
+
+	/*
+	 * Now make the decision using the top-level tuple fraction.  First we
+	 * have to convert an absolute count (LIMIT) into fractional form.
+	 */
+	if (tuple_fraction >= 1.0)
+		tuple_fraction /= dNumDistinctRows;
+
+	if (compare_fractional_path_costs(&hashed_p, &sorted_p,
+									  tuple_fraction) < 0)
+	{
+		/* Hashed is cheaper, so use it */
+		return true;
+	}
+	return false;
+}
+
 /*---------------
  * make_subplanTargetList
  *	  Generate appropriate target list when grouping is required.
@@ -1857,7 +2107,7 @@ make_subplanTargetList(PlannerInfo *root,
 
 	/*
 	 * Otherwise, start with a "flattened" tlist (having just the vars
-	 * mentioned in the targetlist and HAVING qual --- but not upper- level
+	 * mentioned in the targetlist and HAVING qual --- but not upper-level
 	 * Vars; they will be replaced by Params later on).
 	 */
 	sub_tlist = flatten_tlist(tlist);
@@ -1886,16 +2136,28 @@ make_subplanTargetList(PlannerInfo *root,
 			SortGroupClause *grpcl = (SortGroupClause *) lfirst(gl);
 			Node	   *groupexpr = get_sortgroupclause_expr(grpcl, tlist);
 			TargetEntry *te = NULL;
-			ListCell   *sl;
 
-			/* Find or make a matching sub_tlist entry */
-			foreach(sl, sub_tlist)
+			/*
+			 * Find or make a matching sub_tlist entry.  If the groupexpr
+			 * isn't a Var, no point in searching.  (Note that the parser
+			 * won't make multiple groupClause entries for the same TLE.)
+			 */
+			if (groupexpr && IsA(groupexpr, Var))
 			{
-				te = (TargetEntry *) lfirst(sl);
-				if (equal(groupexpr, te->expr))
-					break;
+				ListCell   *sl;
+
+				foreach(sl, sub_tlist)
+				{
+					TargetEntry *lte = (TargetEntry *) lfirst(sl);
+
+					if (equal(groupexpr, lte->expr))
+					{
+						te = lte;
+						break;
+					}
+				}
 			}
-			if (!sl)
+			if (!te)
 			{
 				te = makeTargetEntry((Expr *) groupexpr,
 									 list_length(sub_tlist) + 1,
diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c
index 76e59c82d6e..2b04ee5e337 100644
--- a/src/backend/parser/parse_clause.c
+++ b/src/backend/parser/parse_clause.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.173 2008/08/03 19:10:52 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.174 2008/08/05 02:43:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1447,9 +1447,6 @@ transformDistinctClause(ParseState *pstate,
 	/*
 	 * Now add any remaining non-resjunk tlist items, using default
 	 * sort/group semantics for their data types.
-	 *
-	 * XXX for now, the planner requires distinctClause to be sortable,
-	 * so we have to insist on that here.
 	 */
 	foreach(tlitem, *targetlist)
 	{
@@ -1459,8 +1456,7 @@ transformDistinctClause(ParseState *pstate,
 			continue;			/* ignore junk */
 		result = addTargetToGroupList(pstate, tle,
 									  result, *targetlist,
-									  true,	/* XXX for now */
-									  true);
+									  false, true);
 	}
 
 	return result;
@@ -1555,8 +1551,7 @@ transformDistinctOnClause(ParseState *pstate, List *distinctlist,
 					 errmsg("SELECT DISTINCT ON expressions must match initial ORDER BY expressions")));
 		result = addTargetToGroupList(pstate, tle,
 									  result, *targetlist,
-									  true,	/* someday allow hash-only? */
-									  true);
+									  false, true);
 	}
 
 	return result;
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
index 8476d7e85c1..f8c23071661 100644
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.156 2008/04/21 20:54:15 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.157 2008/08/05 02:43:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -162,8 +162,9 @@ typedef struct PlannerInfo
 	List	   *query_pathkeys; /* desired pathkeys for query_planner(), and
 								 * actual pathkeys afterwards */
 
-	List	   *group_pathkeys; /* groupClause pathkeys, if any */
-	List	   *sort_pathkeys;	/* sortClause pathkeys, if any */
+	List	   *group_pathkeys;		/* groupClause pathkeys, if any */
+	List	   *distinct_pathkeys;	/* distinctClause pathkeys, if any */
+	List	   *sort_pathkeys;		/* sortClause pathkeys, if any */
 
 	List	   *initial_rels;	/* RelOptInfos we are now trying to join */
 
diff --git a/src/test/regress/expected/numerology.out b/src/test/regress/expected/numerology.out
index c5ad36fdd32..d404d9db681 100644
--- a/src/test/regress/expected/numerology.out
+++ b/src/test/regress/expected/numerology.out
@@ -79,7 +79,7 @@ INSERT INTO TEMP_GROUP
 INSERT INTO TEMP_GROUP
   SELECT 2, i.f1, f.f1
   FROM INT4_TBL i, FLOAT8_TBL f;
-SELECT DISTINCT f1 AS two FROM TEMP_GROUP;
+SELECT DISTINCT f1 AS two FROM TEMP_GROUP ORDER BY 1;
  two 
 -----
    1
diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out
index 69f0efb8d42..533bac3ab67 100644
--- a/src/test/regress/expected/opr_sanity.out
+++ b/src/test/regress/expected/opr_sanity.out
@@ -129,7 +129,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.prorettype < p2.prorettype);
+    (p1.prorettype < p2.prorettype)
+ORDER BY 1, 2;
  prorettype | prorettype 
 ------------+------------
          25 |       1043
@@ -142,7 +143,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[0] < p2.proargtypes[0]);
+    (p1.proargtypes[0] < p2.proargtypes[0])
+ORDER BY 1, 2;
  proargtypes | proargtypes 
 -------------+-------------
           25 |        1042
@@ -158,7 +160,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[1] < p2.proargtypes[1]);
+    (p1.proargtypes[1] < p2.proargtypes[1])
+ORDER BY 1, 2;
  proargtypes | proargtypes 
 -------------+-------------
           23 |          28
@@ -173,7 +176,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[2] < p2.proargtypes[2]);
+    (p1.proargtypes[2] < p2.proargtypes[2])
+ORDER BY 1, 2;
  proargtypes | proargtypes 
 -------------+-------------
         1114 |        1184
@@ -185,7 +189,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[3] < p2.proargtypes[3]);
+    (p1.proargtypes[3] < p2.proargtypes[3])
+ORDER BY 1, 2;
  proargtypes | proargtypes 
 -------------+-------------
         1114 |        1184
@@ -197,7 +202,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[4] < p2.proargtypes[4]);
+    (p1.proargtypes[4] < p2.proargtypes[4])
+ORDER BY 1, 2;
  proargtypes | proargtypes 
 -------------+-------------
 (0 rows)
@@ -208,7 +214,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[5] < p2.proargtypes[5]);
+    (p1.proargtypes[5] < p2.proargtypes[5])
+ORDER BY 1, 2;
  proargtypes | proargtypes 
 -------------+-------------
 (0 rows)
@@ -219,7 +226,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[6] < p2.proargtypes[6]);
+    (p1.proargtypes[6] < p2.proargtypes[6])
+ORDER BY 1, 2;
  proargtypes | proargtypes 
 -------------+-------------
 (0 rows)
@@ -230,7 +238,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[7] < p2.proargtypes[7]);
+    (p1.proargtypes[7] < p2.proargtypes[7])
+ORDER BY 1, 2;
  proargtypes | proargtypes 
 -------------+-------------
 (0 rows)
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index abe34ae7ae5..fe64fe0d9c1 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -14,7 +14,7 @@ SELECT DISTINCT two FROM tmp;
 --
 -- awk '{print $5;}' onek.data | sort -n | uniq
 --
-SELECT DISTINCT ten FROM tmp;
+SELECT DISTINCT ten FROM tmp ORDER BY 1;
  ten 
 -----
    0
@@ -32,7 +32,7 @@ SELECT DISTINCT ten FROM tmp;
 --
 -- awk '{print $16;}' onek.data | sort -d | uniq
 --
-SELECT DISTINCT string4 FROM tmp;
+SELECT DISTINCT string4 FROM tmp ORDER BY 1;
  string4 
 ---------
  AAAAxx
diff --git a/src/test/regress/input/misc.source b/src/test/regress/input/misc.source
index c82b1405978..c5813630625 100644
--- a/src/test/regress/input/misc.source
+++ b/src/test/regress/input/misc.source
@@ -183,7 +183,8 @@ SELECT p.name, name(p.hobbies) FROM person* p;
 -- the next two queries demonstrate how functions generate bogus duplicates.
 -- this is a "feature" ..
 --
-SELECT DISTINCT hobbies_r.name, name(hobbies_r.equipment) FROM hobbies_r;
+SELECT DISTINCT hobbies_r.name, name(hobbies_r.equipment) FROM hobbies_r
+  ORDER BY 1,2;
 
 SELECT hobbies_r.name, (hobbies_r.equipment).name FROM hobbies_r;
 
diff --git a/src/test/regress/output/misc.source b/src/test/regress/output/misc.source
index e409c0a1001..91e0d2b04d7 100644
--- a/src/test/regress/output/misc.source
+++ b/src/test/regress/output/misc.source
@@ -469,7 +469,8 @@ SELECT p.name, name(p.hobbies) FROM person* p;
 -- the next two queries demonstrate how functions generate bogus duplicates.
 -- this is a "feature" ..
 --
-SELECT DISTINCT hobbies_r.name, name(hobbies_r.equipment) FROM hobbies_r;
+SELECT DISTINCT hobbies_r.name, name(hobbies_r.equipment) FROM hobbies_r
+  ORDER BY 1,2;
     name     |     name      
 -------------+---------------
  basketball  | hightops
diff --git a/src/test/regress/sql/numerology.sql b/src/test/regress/sql/numerology.sql
index 2220fdba385..6626cf20ebc 100644
--- a/src/test/regress/sql/numerology.sql
+++ b/src/test/regress/sql/numerology.sql
@@ -63,7 +63,7 @@ INSERT INTO TEMP_GROUP
   SELECT 2, i.f1, f.f1
   FROM INT4_TBL i, FLOAT8_TBL f;
 
-SELECT DISTINCT f1 AS two FROM TEMP_GROUP;
+SELECT DISTINCT f1 AS two FROM TEMP_GROUP ORDER BY 1;
 
 SELECT f1 AS two, max(f3) AS max_float, min(f3) as min_float
   FROM TEMP_GROUP
diff --git a/src/test/regress/sql/opr_sanity.sql b/src/test/regress/sql/opr_sanity.sql
index e2ab6e57d4a..5017849830a 100644
--- a/src/test/regress/sql/opr_sanity.sql
+++ b/src/test/regress/sql/opr_sanity.sql
@@ -121,7 +121,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.prorettype < p2.prorettype);
+    (p1.prorettype < p2.prorettype)
+ORDER BY 1, 2;
 
 SELECT DISTINCT p1.proargtypes[0], p2.proargtypes[0]
 FROM pg_proc AS p1, pg_proc AS p2
@@ -129,7 +130,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[0] < p2.proargtypes[0]);
+    (p1.proargtypes[0] < p2.proargtypes[0])
+ORDER BY 1, 2;
 
 SELECT DISTINCT p1.proargtypes[1], p2.proargtypes[1]
 FROM pg_proc AS p1, pg_proc AS p2
@@ -137,7 +139,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[1] < p2.proargtypes[1]);
+    (p1.proargtypes[1] < p2.proargtypes[1])
+ORDER BY 1, 2;
 
 SELECT DISTINCT p1.proargtypes[2], p2.proargtypes[2]
 FROM pg_proc AS p1, pg_proc AS p2
@@ -145,7 +148,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[2] < p2.proargtypes[2]);
+    (p1.proargtypes[2] < p2.proargtypes[2])
+ORDER BY 1, 2;
 
 SELECT DISTINCT p1.proargtypes[3], p2.proargtypes[3]
 FROM pg_proc AS p1, pg_proc AS p2
@@ -153,7 +157,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[3] < p2.proargtypes[3]);
+    (p1.proargtypes[3] < p2.proargtypes[3])
+ORDER BY 1, 2;
 
 SELECT DISTINCT p1.proargtypes[4], p2.proargtypes[4]
 FROM pg_proc AS p1, pg_proc AS p2
@@ -161,7 +166,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[4] < p2.proargtypes[4]);
+    (p1.proargtypes[4] < p2.proargtypes[4])
+ORDER BY 1, 2;
 
 SELECT DISTINCT p1.proargtypes[5], p2.proargtypes[5]
 FROM pg_proc AS p1, pg_proc AS p2
@@ -169,7 +175,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[5] < p2.proargtypes[5]);
+    (p1.proargtypes[5] < p2.proargtypes[5])
+ORDER BY 1, 2;
 
 SELECT DISTINCT p1.proargtypes[6], p2.proargtypes[6]
 FROM pg_proc AS p1, pg_proc AS p2
@@ -177,7 +184,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[6] < p2.proargtypes[6]);
+    (p1.proargtypes[6] < p2.proargtypes[6])
+ORDER BY 1, 2;
 
 SELECT DISTINCT p1.proargtypes[7], p2.proargtypes[7]
 FROM pg_proc AS p1, pg_proc AS p2
@@ -185,7 +193,8 @@ WHERE p1.oid != p2.oid AND
     p1.prosrc = p2.prosrc AND
     p1.prolang = 12 AND p2.prolang = 12 AND
     NOT p1.proisagg AND NOT p2.proisagg AND
-    (p1.proargtypes[7] < p2.proargtypes[7]);
+    (p1.proargtypes[7] < p2.proargtypes[7])
+ORDER BY 1, 2;
 
 -- Look for functions that return type "internal" and do not have any
 -- "internal" argument.  Such a function would be a security hole since
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index c4a63aaf16f..7416e0194e1 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -10,12 +10,12 @@ SELECT DISTINCT two FROM tmp;
 --
 -- awk '{print $5;}' onek.data | sort -n | uniq
 --
-SELECT DISTINCT ten FROM tmp;
+SELECT DISTINCT ten FROM tmp ORDER BY 1;
 
 --
 -- awk '{print $16;}' onek.data | sort -d | uniq
 --
-SELECT DISTINCT string4 FROM tmp;
+SELECT DISTINCT string4 FROM tmp ORDER BY 1;
 
 --
 -- awk '{print $3,$16,$5;}' onek.data | sort -d | uniq |