7 files changed, 173 insertions, 124 deletions
diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c
index 3776cf9b347..d25a5509b4c 100644
--- a/src/backend/optimizer/plan/planmain.c
+++ b/src/backend/optimizer/plan/planmain.c
@@ -14,7 +14,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.106 2008/01/11 04:02:18 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.107 2008/07/31 22:47:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -288,7 +288,7 @@ query_planner(PlannerInfo *root, List *tlist,
 		 * levels of sort --- and, therefore, certainly need to read all the
 		 * tuples --- unless ORDER BY is a subset of GROUP BY.
 		 */
-		if (parse->groupClause && parse->sortClause &&
+		if (root->group_pathkeys && root->sort_pathkeys &&
 			!pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys))
 			tuple_fraction = 0.0;
 	}
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 9248022d8b3..6ebbc5b7bc3 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.234 2008/07/10 02:14:03 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.235 2008/07/31 22:47:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -826,6 +826,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		/*
 		 * Calculate pathkeys that represent result ordering requirements
 		 */
+		Assert(parse->distinctClause == NIL);
 		sort_pathkeys = make_pathkeys_for_sortclauses(root,
 													  parse->sortClause,
 													  tlist,
@@ -864,17 +865,29 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		 * Calculate pathkeys that represent grouping/ordering requirements.
 		 * Stash them in PlannerInfo so that query_planner can canonicalize
 		 * them after EquivalenceClasses have been formed.
+		 *
+		 * Note: for the moment, DISTINCT is always implemented via sort/uniq,
+		 * and we set the sort_pathkeys to be the more rigorous of the
+		 * DISTINCT and ORDER BY requirements.  This should be changed
+		 * someday, but DISTINCT ON is a bit of a problem ...
 		 */
 		root->group_pathkeys =
 			make_pathkeys_for_sortclauses(root,
 										  parse->groupClause,
 										  tlist,
 										  false);
-		root->sort_pathkeys =
-			make_pathkeys_for_sortclauses(root,
-										  parse->sortClause,
-										  tlist,
-										  false);
+		if (list_length(parse->distinctClause) > list_length(parse->sortClause))
+			root->sort_pathkeys =
+				make_pathkeys_for_sortclauses(root,
+											  parse->distinctClause,
+											  tlist,
+											  false);
+		else
+			root->sort_pathkeys =
+				make_pathkeys_for_sortclauses(root,
+											  parse->sortClause,
+											  tlist,
+											  false);
 
 		/*
 		 * Will need actual number of aggregates for estimating costs.
@@ -903,9 +916,9 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		 * by ORDER BY --- but that might just leave us failing to exploit an
 		 * available sort order at all. Needs more thought...)
 		 */
-		if (parse->groupClause)
+		if (root->group_pathkeys)
 			root->query_pathkeys = root->group_pathkeys;
-		else if (parse->sortClause)
+		else if (root->sort_pathkeys)
 			root->query_pathkeys = root->sort_pathkeys;
 		else
 			root->query_pathkeys = NIL;
@@ -1172,7 +1185,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 	 * If we were not able to make the plan come out in the right order, add
 	 * an explicit sort step.
 	 */
-	if (parse->sortClause)
+	if (sort_pathkeys)
 	{
 		if (!pathkeys_contained_in(sort_pathkeys, current_pathkeys))
 		{
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index ed18cf5e3fe..b30cedee9fc 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -22,7 +22,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.147 2008/06/19 00:46:04 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.148 2008/07/31 22:47:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -69,6 +69,7 @@ static List *generate_setop_tlist(List *colTypes, int flag,
 static List *generate_append_tlist(List *colTypes, bool flag,
 					  List *input_plans,
 					  List *refnames_tlist);
+static List *generate_setop_sortlist(List *targetlist);
 static void expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte,
 						 Index rti);
 static void make_inh_translation_lists(Relation oldrelation,
@@ -319,7 +320,7 @@ generate_union_plan(SetOperationStmt *op, PlannerInfo *root,
 	{
 		List	   *sortList;
 
-		sortList = addAllTargetsToSortList(NULL, NIL, tlist, false);
+		sortList = generate_setop_sortlist(tlist);
 		if (sortList)
 		{
 			plan = (Plan *) make_sort_from_sortclauses(root, sortList, plan);
@@ -384,7 +385,7 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
 	 * Sort the child results, then add a SetOp plan node to generate the
 	 * correct output.
 	 */
-	sortList = addAllTargetsToSortList(NULL, NIL, tlist, false);
+	sortList = generate_setop_sortlist(tlist);
 
 	if (sortList == NIL)		/* nothing to sort on? */
 	{
@@ -675,6 +676,31 @@ generate_append_tlist(List *colTypes, bool flag,
 	return tlist;
 }
 
+/*
+ * generate_setop_sortlist
+ *		Build a SortClause list enumerating all the non-resjunk tlist entries,
+ *		using default ordering properties.
+ */
+static List *
+generate_setop_sortlist(List *targetlist)
+{
+	List	   *sortlist = NIL;
+	ListCell   *l;
+
+	foreach(l, targetlist)
+	{
+		TargetEntry *tle = (TargetEntry *) lfirst(l);
+
+		if (!tle->resjunk)
+			sortlist = addTargetToSortList(NULL, tle,
+										   sortlist, targetlist,
+										   SORTBY_DEFAULT,
+										   SORTBY_NULLS_DEFAULT,
+										   NIL, false);
+	}
+	return sortlist;
+}
+
 
 /*
  * find_all_inheritors -
diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c
index 520b5037255..329b1bc881a 100644
--- a/src/backend/parser/analyze.c
+++ b/src/backend/parser/analyze.c
@@ -17,7 +17,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- *	$PostgreSQL: pgsql/src/backend/parser/analyze.c,v 1.373 2008/07/18 20:26:06 tgl Exp $
+ *	$PostgreSQL: pgsql/src/backend/parser/analyze.c,v 1.374 2008/07/31 22:47:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -711,6 +711,8 @@ transformSelectStmt(ParseState *pstate, SelectStmt *stmt)
 	/*
 	 * Transform sorting/grouping stuff.  Do ORDER BY first because both
 	 * transformGroupClause and transformDistinctClause need the results.
+	 * Note that these functions can also change the targetList, so it's
+	 * passed to them by reference.
 	 */
 	qry->sortClause = transformSortClause(pstate,
 										  stmt->sortClause,
@@ -725,8 +727,9 @@ transformSelectStmt(ParseState *pstate, SelectStmt *stmt)
 	qry->distinctClause = transformDistinctClause(pstate,
 												  stmt->distinctClause,
 												  &qry->targetList,
-												  &qry->sortClause);
+												  qry->sortClause);
 
+	/* transform LIMIT */
 	qry->limitOffset = transformLimitClause(pstate, stmt->limitOffset,
 											"OFFSET");
 	qry->limitCount = transformLimitClause(pstate, stmt->limitCount,
diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c
index f532b26d374..d82573fd03d 100644
--- a/src/backend/parser/parse_clause.c
+++ b/src/backend/parser/parse_clause.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.170 2008/06/19 00:46:05 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.171 2008/07/31 22:47:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1454,16 +1454,23 @@ transformSortClause(ParseState *pstate,
  * transformDistinctClause -
  *	  transform a DISTINCT or DISTINCT ON clause
  *
- * Since we may need to add items to the query's sortClause list, that list
- * is passed by reference.	Likewise for the targetlist.
+ * Since we may need to add items to the query's targetlist, that list
+ * is passed by reference.
+ *
+ * As with GROUP BY, we absorb the sorting semantics of ORDER BY as much as
+ * possible into the distinctClause.  This avoids a possible need to re-sort,
+ * and allows the user to determine the equality semantics used by DISTINCT,
+ * should she be working with a datatype that has more than one btree equality
+ * operator.
  */
 List *
 transformDistinctClause(ParseState *pstate, List *distinctlist,
-						List **targetlist, List **sortClause)
+						List **targetlist, List *sortClause)
 {
 	List	   *result = NIL;
 	ListCell   *slitem;
 	ListCell   *dlitem;
+	ListCell   *tlitem;
 
 	/* No work if there was no DISTINCT clause */
 	if (distinctlist == NIL)
@@ -1474,25 +1481,21 @@ transformDistinctClause(ParseState *pstate, List *distinctlist,
 		/* We had SELECT DISTINCT */
 
 		/*
-		 * All non-resjunk elements from target list that are not already in
-		 * the sort list should be added to it.  (We don't really care what
-		 * order the DISTINCT fields are checked in, so we can leave the
-		 * user's ORDER BY spec alone, and just add additional sort keys to it
-		 * to ensure that all targetlist items get sorted.)
-		 */
-		*sortClause = addAllTargetsToSortList(pstate,
-											  *sortClause,
-											  *targetlist,
-											  true);
-
-		/*
-		 * Now, DISTINCT list consists of all non-resjunk sortlist items.
-		 * Actually, all the sortlist items had better be non-resjunk!
-		 * Otherwise, user wrote SELECT DISTINCT with an ORDER BY item that
-		 * does not appear anywhere in the SELECT targetlist, and we can't
-		 * implement that with only one sorting pass...
+		 * The distinctClause should consist of all ORDER BY items followed
+		 * by all other non-resjunk targetlist items.  There must not be any
+		 * resjunk ORDER BY items --- that would imply that we are sorting
+		 * by a value that isn't necessarily unique within a DISTINCT group,
+		 * so the results wouldn't be well-defined.  This construction
+		 * ensures we follow the rule that sortClause and distinctClause match;
+		 * in fact the sortClause will always be a prefix of distinctClause.
+		 *
+		 * Note a corner case: the same TLE could be in the ORDER BY list
+		 * multiple times with different sortops.  We have to include it in
+		 * the distinctClause the same way to preserve the prefix property.
+		 * The net effect will be that the TLE value will be made unique
+		 * according to both sortops.
 		 */
-		foreach(slitem, *sortClause)
+		foreach(slitem, sortClause)
 		{
 			SortClause *scl = (SortClause *) lfirst(slitem);
 			TargetEntry *tle = get_sortgroupclause_tle(scl, *targetlist);
@@ -1504,71 +1507,103 @@ transformDistinctClause(ParseState *pstate, List *distinctlist,
 			else
 				result = lappend(result, copyObject(scl));
 		}
+
+		/*
+		 * Now add any remaining non-resjunk tlist items, using default
+		 * sorting semantics for their data types.
+		 */
+		foreach(tlitem, *targetlist)
+		{
+			TargetEntry *tle = (TargetEntry *) lfirst(tlitem);
+
+			if (tle->resjunk)
+				continue;		/* ignore junk */
+			if (targetIsInSortList(tle, InvalidOid, result))
+				continue;		/* already in list (with some semantics) */
+			result = addTargetToSortList(pstate, tle,
+										 result, *targetlist,
+										 SORTBY_DEFAULT,
+										 SORTBY_NULLS_DEFAULT,
+										 NIL, true);
+		}
 	}
 	else
 	{
 		/* We had SELECT DISTINCT ON (expr, ...) */
+		Bitmapset  *refnos = NULL;
+		int			sortgroupref;
+		bool		skipped_sortitem;
 
 		/*
-		 * If the user writes both DISTINCT ON and ORDER BY, then the two
-		 * expression lists must match (until one or the other runs out).
-		 * Otherwise the ORDER BY requires a different sort order than the
-		 * DISTINCT does, and we can't implement that with only one sort pass
-		 * (and if we do two passes, the results will be rather
-		 * unpredictable). However, it's OK to have more DISTINCT ON
-		 * expressions than ORDER BY expressions; we can just add the extra
-		 * DISTINCT values to the sort list, much as we did above for ordinary
-		 * DISTINCT fields.
-		 *
-		 * Actually, it'd be OK for the common prefixes of the two lists to
-		 * match in any order, but implementing that check seems like more
-		 * trouble than it's worth.
+		 * Add all the DISTINCT ON expressions to the tlist (if not already
+		 * present, they are added as resjunk items).  Assign sortgroupref
+		 * numbers to them, and form a bitmapset of these numbers.  (A
+		 * bitmapset is convenient here because we don't care about order
+		 * and we can discard duplicates.)
 		 */
-		ListCell   *nextsortlist = list_head(*sortClause);
-
 		foreach(dlitem, distinctlist)
 		{
+			Node   *dexpr = (Node *) lfirst(dlitem);
 			TargetEntry *tle;
 
-			tle = findTargetlistEntry(pstate, lfirst(dlitem),
+			tle = findTargetlistEntry(pstate, dexpr,
 									  targetlist, DISTINCT_ON_CLAUSE);
+			sortgroupref = assignSortGroupRef(tle, *targetlist);
+			refnos = bms_add_member(refnos, sortgroupref);
+		}
 
-			if (nextsortlist != NULL)
-			{
-				SortClause *scl = (SortClause *) lfirst(nextsortlist);
+		/*
+		 * If the user writes both DISTINCT ON and ORDER BY, adopt the
+		 * sorting semantics from ORDER BY items that match DISTINCT ON
+		 * items, and also adopt their column sort order.  We insist that
+		 * the distinctClause and sortClause match, so throw error if we
+		 * find the need to add any more distinctClause items after we've
+		 * skipped an ORDER BY item that wasn't in DISTINCT ON.
+		 */
+		skipped_sortitem = false;
+		foreach(slitem, sortClause)
+		{
+			SortClause *scl = (SortClause *) lfirst(slitem);
 
-				if (tle->ressortgroupref != scl->tleSortGroupRef)
+			if (bms_is_member(scl->tleSortGroupRef, refnos))
+			{
+				if (skipped_sortitem)
 					ereport(ERROR,
 							(errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
 							 errmsg("SELECT DISTINCT ON expressions must match initial ORDER BY expressions")));
-				result = lappend(result, copyObject(scl));
-				nextsortlist = lnext(nextsortlist);
+				else
+					result = lappend(result, copyObject(scl));
 			}
 			else
 			{
-				*sortClause = addTargetToSortList(pstate, tle,
-												  *sortClause, *targetlist,
-												  SORTBY_DEFAULT,
-												  SORTBY_NULLS_DEFAULT,
-												  NIL, true);
+				skipped_sortitem = true;
+			}
+		}
 
-				/*
-				 * Probably, the tle should always have been added at the end
-				 * of the sort list ... but search to be safe.
-				 */
-				foreach(slitem, *sortClause)
-				{
-					SortClause *scl = (SortClause *) lfirst(slitem);
+		/*
+		 * Now add any remaining DISTINCT ON items, using default sorting
+		 * semantics for their data types.  (Note: this is pretty
+		 * questionable; if the ORDER BY list doesn't include all the DISTINCT
+		 * ON items and more besides, you certainly aren't using DISTINCT ON
+		 * in the intended way, and you probably aren't going to get
+		 * consistent results.  It might be better to throw an error or warning
+		 * here.  But historically we've allowed it, so keep doing so.)
+		 */
+		while ((sortgroupref = bms_first_member(refnos)) >= 0)
+		{
+			TargetEntry *tle = get_sortgroupref_tle(sortgroupref, *targetlist);
 
-					if (tle->ressortgroupref == scl->tleSortGroupRef)
-					{
-						result = lappend(result, copyObject(scl));
-						break;
-					}
-				}
-				if (slitem == NULL)		/* should not happen */
-					elog(ERROR, "failed to add DISTINCT ON clause to target list");
-			}
+			if (targetIsInSortList(tle, InvalidOid, result))
+				continue;		/* already in list (with some semantics) */
+			if (skipped_sortitem)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+						 errmsg("SELECT DISTINCT ON expressions must match initial ORDER BY expressions")));
+			result = addTargetToSortList(pstate, tle,
+										 result, *targetlist,
+										 SORTBY_DEFAULT,
+										 SORTBY_NULLS_DEFAULT,
+										 NIL, true);
 		}
 	}
 
@@ -1576,38 +1611,8 @@ transformDistinctClause(ParseState *pstate, List *distinctlist,
 }
 
 /*
- * addAllTargetsToSortList
- *		Make sure all non-resjunk targets in the targetlist are in the
- *		ORDER BY list, adding the not-yet-sorted ones to the end of the list.
- *		This is typically used to help implement SELECT DISTINCT.
- *
- * See addTargetToSortList for info about pstate and resolveUnknown inputs.
- *
- * Returns the updated ORDER BY list.
- */
-List *
-addAllTargetsToSortList(ParseState *pstate, List *sortlist,
-						List *targetlist, bool resolveUnknown)
-{
-	ListCell   *l;
-
-	foreach(l, targetlist)
-	{
-		TargetEntry *tle = (TargetEntry *) lfirst(l);
-
-		if (!tle->resjunk)
-			sortlist = addTargetToSortList(pstate, tle,
-										   sortlist, targetlist,
-										   SORTBY_DEFAULT,
-										   SORTBY_NULLS_DEFAULT,
-										   NIL, resolveUnknown);
-	}
-	return sortlist;
-}
-
-/*
  * addTargetToSortList
- *		If the given targetlist entry isn't already in the ORDER BY list,
+ *		If the given targetlist entry isn't already in the SortClause list,
  *		add it to the end of the list, using the given sort ordering info.
  *
  * If resolveUnknown is TRUE, convert TLEs of type UNKNOWN to TEXT.  If not,
@@ -1615,7 +1620,7 @@ addAllTargetsToSortList(ParseState *pstate, List *sortlist,
  * pstate should be provided if resolveUnknown is TRUE, but can be NULL
  * otherwise.
  *
- * Returns the updated ORDER BY list.
+ * Returns the updated SortClause list.
  */
 List *
 addTargetToSortList(ParseState *pstate, TargetEntry *tle,
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index e6de4b49c85..fa78b4b188b 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/nodes/parsenodes.h,v 1.368 2008/07/18 03:32:53 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/parsenodes.h,v 1.369 2008/07/31 22:47:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -613,14 +613,19 @@ typedef struct RangeTblEntry
  * tleSortGroupRef must match ressortgroupref of exactly one entry of the
  * associated targetlist; that is the expression to be sorted (or grouped) by.
  * sortop is the OID of the ordering operator (a "<" or ">" operator).
- * nulls_first does about what you'd expect.
+ * nulls_first means about what you'd expect.
  *
  * SortClauses are also used to identify targets that we will do a "Unique"
  * filter step on (for SELECT DISTINCT and SELECT DISTINCT ON).  The
- * distinctClause list is simply a copy of the relevant members of the
- * sortClause list.  Note that distinctClause can be a subset of sortClause,
- * but cannot have members not present in sortClause; and the members that
- * do appear must be in the same order as in sortClause.
+ * distinctClause list is a list of SortClauses for the expressions to be
+ * unique-ified.  (As per comment for GroupClause, this overspecifies the
+ * semantics.)  In SELECT DISTINCT, the distinctClause list is typically
+ * longer than the ORDER BY list, while in SELECT DISTINCT ON it's typically
+ * shorter.  The two lists must match up to the end of the shorter one ---
+ * the parser rearranges the distinctClause if necessary to make this true.
+ * (This restriction ensures that only one sort step is needed to both
+ * satisfy the ORDER BY and set up for the Unique step.  This is semantically
+ * necessary for DISTINCT ON, and offers no real drawback for DISTINCT.)
  */
 typedef struct SortClause
 {
diff --git a/src/include/parser/parse_clause.h b/src/include/parser/parse_clause.h
index f2a6bf01ba8..6fc357522e7 100644
--- a/src/include/parser/parse_clause.h
+++ b/src/include/parser/parse_clause.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/parser/parse_clause.h,v 1.49 2008/01/01 19:45:58 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/parser/parse_clause.h,v 1.50 2008/07/31 22:47:56 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -31,11 +31,8 @@ extern List *transformGroupClause(ParseState *pstate, List *grouplist,
 extern List *transformSortClause(ParseState *pstate, List *orderlist,
 					List **targetlist, bool resolveUnknown);
 extern List *transformDistinctClause(ParseState *pstate, List *distinctlist,
-						List **targetlist, List **sortClause);
+						List **targetlist, List *sortClause);
 
-extern List *addAllTargetsToSortList(ParseState *pstate,
-						List *sortlist, List *targetlist,
-						bool resolveUnknown);
 extern List *addTargetToSortList(ParseState *pstate, TargetEntry *tle,
 					List *sortlist, List *targetlist,
 					SortByDir sortby_dir, SortByNulls sortby_nulls,