3 files changed, 920 insertions, 28 deletions
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 679d8ed597e..ec3c23013a3 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -1756,6 +1756,322 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
 }
 
 /*
+ * is_fake_var
+ *		Workaround for generate_append_tlist() which generates fake Vars with
+ *		varno == 0, that will cause a fail of estimate_num_group() call
+ *
+ * XXX Ummm, why would estimate_num_group fail with this?
+ */
+static bool
+is_fake_var(Expr *expr)
+{
+	if (IsA(expr, RelabelType))
+		expr = (Expr *) ((RelabelType *) expr)->arg;
+
+	return (IsA(expr, Var) && ((Var *) expr)->varno == 0);
+}
+
+/*
+ * get_width_cost_multiplier
+ *		Returns relative complexity of comparing two values based on its width.
+ * The idea behind is that the comparison becomes more expensive the longer the
+ * value is. Return value is in cpu_operator_cost units.
+ */
+static double
+get_width_cost_multiplier(PlannerInfo *root, Expr *expr)
+{
+	double	width = -1.0; /* fake value */
+
+	if (IsA(expr, RelabelType))
+		expr = (Expr *) ((RelabelType *) expr)->arg;
+
+	/* Try to find actual stat in corresponding relation */
+	if (IsA(expr, Var))
+	{
+		Var		*var = (Var *) expr;
+
+		if (var->varno > 0 && var->varno < root->simple_rel_array_size)
+		{
+			RelOptInfo	*rel = root->simple_rel_array[var->varno];
+
+			if (rel != NULL &&
+				var->varattno >= rel->min_attr &&
+				var->varattno <= rel->max_attr)
+			{
+				int	ndx = var->varattno - rel->min_attr;
+
+				if (rel->attr_widths[ndx] > 0)
+					width = rel->attr_widths[ndx];
+			}
+		}
+	}
+
+	/* Didn't find any actual stats, try using type width instead. */
+	if (width < 0.0)
+	{
+		Node	*node = (Node*) expr;
+
+		width = get_typavgwidth(exprType(node), exprTypmod(node));
+	}
+
+	/*
+	 * Values are passed as Datum type, so comparisons can't be cheaper than
+	 * comparing a Datum value.
+	 *
+	 * FIXME I find this reasoning questionable. We may pass int2, and comparing
+	 * it is probably a bit cheaper than comparing a bigint.
+	 */
+	if (width <= sizeof(Datum))
+		return 1.0;
+
+	/*
+	 * We consider the cost of a comparison not to be directly proportional to
+	 * width of the argument, because widths of the arguments could be slightly
+	 * different (we only know the average width for the whole column). So we
+	 * use log16(width) as an estimate.
+	 */
+	return 1.0 + 0.125 * LOG2(width / sizeof(Datum));
+}
+
+/*
+ * compute_cpu_sort_cost
+ *		compute CPU cost of sort (i.e. in-memory)
+ *
+ * The main thing we need to calculate to estimate sort CPU costs is the number
+ * of calls to the comparator functions. The difficulty is that for multi-column
+ * sorts there may be different data types involved (for some of which the calls
+ * may be much more expensive). Furthermore, columns may have a very different
+ * number of distinct values - the higher the number, the fewer comparisons will
+ * be needed for the following columns.
+ *
+ * The algorithm is incremental - we add pathkeys one by one, and at each step we
+ * estimate the number of necessary comparisons (based on the number of distinct
+ * groups in the current pathkey prefix and the new pathkey), and the comparison
+ * costs (which is data type specific).
+ *
+ * Estimation of the number of comparisons is based on ideas from:
+ *
+ * "Quicksort Is Optimal", Robert Sedgewick, Jon Bentley, 2002
+ * [https://www.cs.princeton.edu/~rs/talks/QuicksortIsOptimal.pdf]
+ *
+ * In term of that paper, let N - number of tuples, Xi - number of identical
+ * tuples with value Ki, then the estimate of number of comparisons is:
+ *
+ *	log(N! / (X1! * X2! * ..))  ~  sum(Xi * log(N/Xi))
+ *
+ * We assume all Xi the same because now we don't have any estimation of
+ * group sizes, we have only know the estimate of number of groups (distinct
+ * values). In that case, formula becomes:
+ *
+ *	N * log(NumberOfGroups)
+ *
+ * For multi-column sorts we need to estimate the number of comparisons for
+ * each individual column - for example with columns (c1, c2, ..., ck) we
+ * can estimate that number of comparisons on ck is roughly
+ *
+ *	ncomparisons(c1, c2, ..., ck) / ncomparisons(c1, c2, ..., c(k-1))
+ *
+ * Let k be a column number, Gk - number of groups defined by k columns, and Fk
+ * the cost of the comparison is
+ *
+ *	N * sum( Fk * log(Gk) )
+ *
+ * Note: We also consider column width, not just the comparator cost.
+ *
+ * NOTE: some callers currently pass NIL for pathkeys because they
+ * can't conveniently supply the sort keys. In this case, it will fallback to
+ * simple comparison cost estimate.
+ */
+static Cost
+compute_cpu_sort_cost(PlannerInfo *root, List *pathkeys, int nPresortedKeys,
+					  Cost comparison_cost, double tuples, double output_tuples,
+					  bool heapSort)
+{
+	Cost		per_tuple_cost = 0.0;
+	ListCell	*lc;
+	List		*pathkeyExprs = NIL;
+	double		tuplesPerPrevGroup = tuples;
+	double		totalFuncCost = 1.0;
+	bool		has_fake_var = false;
+	int			i = 0;
+	Oid			prev_datatype = InvalidOid;
+	List		*cache_varinfos = NIL;
+
+	/* fallback if pathkeys is unknown */
+	if (list_length(pathkeys) == 0)
+	{
+		/*
+		 * If we'll use a bounded heap-sort keeping just K tuples in memory, for
+		 * a total number of tuple comparisons of N log2 K; but the constant
+		 * factor is a bit higher than for quicksort. Tweak it so that the cost
+		 * curve is continuous at the crossover point.
+		 */
+		output_tuples = (heapSort) ? 2.0 * output_tuples : tuples;
+		per_tuple_cost += 2.0 * cpu_operator_cost * LOG2(output_tuples);
+
+		/* add cost provided by caller */
+		per_tuple_cost += comparison_cost;
+
+		return per_tuple_cost * tuples;
+	}
+
+	/*
+	 * Computing total cost of sorting takes into account:
+	 * - per column comparison function cost
+	 * - we try to compute needed number of comparison per column
+	 */
+	foreach(lc, pathkeys)
+	{
+		PathKey			   *pathkey = (PathKey*) lfirst(lc);
+		EquivalenceMember  *em;
+		double				nGroups,
+							correctedNGroups;
+		Cost				funcCost = 1.0;
+
+		/*
+		 * We believe that equivalence members aren't very different, so, to
+		 * estimate cost we consider just the first member.
+		 */
+		em = (EquivalenceMember *) linitial(pathkey->pk_eclass->ec_members);
+
+		if (em->em_datatype != InvalidOid)
+		{
+			/* do not lookup funcCost if the data type is the same */
+			if (prev_datatype != em->em_datatype)
+			{
+				Oid			sortop;
+				QualCost	cost;
+
+				sortop = get_opfamily_member(pathkey->pk_opfamily,
+											 em->em_datatype, em->em_datatype,
+											 pathkey->pk_strategy);
+
+				cost.startup = 0;
+				cost.per_tuple = 0;
+				add_function_cost(root, get_opcode(sortop), NULL, &cost);
+
+				/*
+				 * add_function_cost returns the product of cpu_operator_cost
+				 * and procost, but we need just procost, co undo that.
+				 */
+				funcCost = cost.per_tuple / cpu_operator_cost;
+
+				prev_datatype = em->em_datatype;
+			}
+		}
+
+		/* factor in the width of the values in this column */
+		funcCost *= get_width_cost_multiplier(root, em->em_expr);
+
+		/* now we have per-key cost, so add to the running total */
+		totalFuncCost += funcCost;
+
+		/* remember if we have found a fake Var in pathkeys */
+		has_fake_var |= is_fake_var(em->em_expr);
+		pathkeyExprs = lappend(pathkeyExprs, em->em_expr);
+
+		/*
+		 * We need to calculate the number of comparisons for this column, which
+		 * requires knowing the group size. So we estimate the number of groups
+		 * by calling estimate_num_groups_incremental(), which estimates the
+		 * group size for "new" pathkeys.
+		 *
+		 * Note: estimate_num_groups_incremntal does not handle fake Vars, so use
+		 * a default estimate otherwise.
+		 */
+		if (!has_fake_var)
+			nGroups = estimate_num_groups_incremental(root, pathkeyExprs,
+													  tuplesPerPrevGroup, NULL, NULL,
+													  &cache_varinfos,
+													  list_length(pathkeyExprs) - 1);
+		else if (tuples > 4.0)
+			/*
+			 * Use geometric mean as estimation if there are no stats.
+			 *
+			 * We don't use DEFAULT_NUM_DISTINCT here, because that’s used for
+			 * a single column, but here we’re dealing with multiple columns.
+			 */
+			nGroups = ceil(2.0 + sqrt(tuples) * (i + 1) / list_length(pathkeys));
+		else
+			nGroups = tuples;
+
+		/*
+		 * Presorted keys are not considered in the cost above, but we still do
+		 * have to compare them in the qsort comparator. So make sure to factor
+		 * in the cost in that case.
+		 */
+		if (i >= nPresortedKeys)
+		{
+			if (heapSort)
+			{
+				/* have to keep at least one group, and a multiple of group size */
+				correctedNGroups = ceil(output_tuples / tuplesPerPrevGroup);
+			}
+			else
+				/* all groups in the input */
+				correctedNGroups = nGroups;
+
+			correctedNGroups = Max(1.0, ceil(correctedNGroups));
+
+			per_tuple_cost += totalFuncCost * LOG2(correctedNGroups);
+		}
+
+		i++;
+
+		/*
+		 * Uniform distributions with all groups being of the same size are the
+		 * best case, with nice smooth behavior. Real-world distributions tend
+		 * not to be uniform, though, and we don’t have any reliable easy-to-use
+		 * information. As a basic defense against skewed distributions, we use
+		 * a 1.5 factor to make the expected group a bit larger, but we need to
+		 * be careful not to make the group larger than in the preceding step.
+		 */
+		tuplesPerPrevGroup = Min(tuplesPerPrevGroup,
+								 ceil(1.5 * tuplesPerPrevGroup / nGroups));
+
+		/*
+		 * Once we get single-row group, it means tuples in the group are unique
+		 * and we can skip all remaining columns.
+		 */
+		if (tuplesPerPrevGroup <= 1.0)
+			break;
+	}
+
+	list_free(pathkeyExprs);
+
+	/* per_tuple_cost is in cpu_operator_cost units */
+	per_tuple_cost *= cpu_operator_cost;
+
+	/*
+	 * Accordingly to "Introduction to algorithms", Thomas H. Cormen, Charles E.
+	 * Leiserson, Ronald L. Rivest, ISBN 0-07-013143-0, quicksort estimation
+	 * formula has additional term proportional to number of tuples (See Chapter
+	 * 8.2 and Theorem 4.1). That affects  cases with a low number of tuples,
+	 * approximately less than 1e4. We could implement it as an additional
+	 * multiplier under the logarithm, but we use a bit more complex formula
+	 * which takes into account the number of unique tuples and it’s not clear
+	 * how to combine the multiplier with the number of groups. Estimate it as
+	 * 10 in cpu_operator_cost unit.
+	 */
+	per_tuple_cost += 10 * cpu_operator_cost;
+
+	per_tuple_cost += comparison_cost;
+
+	return tuples * per_tuple_cost;
+}
+
+/*
+ * simple wrapper just to estimate best sort path
+ */
+Cost
+cost_sort_estimate(PlannerInfo *root, List *pathkeys, int nPresortedKeys,
+				   double tuples)
+{
+	return compute_cpu_sort_cost(root, pathkeys, nPresortedKeys,
+								0, tuples, tuples, false);
+}
+
+/*
  * cost_tuplesort
  *	  Determines and returns the cost of sorting a relation using tuplesort,
  *    not including the cost of reading the input data.
@@ -1771,7 +2087,7 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
  * number of initial runs formed and M is the merge order used by tuplesort.c.
  * Since the average initial run should be about sort_mem, we have
  *		disk traffic = 2 * relsize * ceil(logM(p / sort_mem))
- *		cpu = comparison_cost * t * log2(t)
+ * 		and cpu cost (computed by compute_cpu_sort_cost()).
  *
  * If the sort is bounded (i.e., only the first k result tuples are needed)
  * and k tuples can fit into sort_mem, we use a heap method that keeps only
@@ -1790,9 +2106,11 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
  * 'comparison_cost' is the extra cost per comparison, if any
  * 'sort_mem' is the number of kilobytes of work memory allowed for the sort
  * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
+ * 'startup_cost' is expected to be 0 at input. If there is "input cost" it should
+ * be added by caller later
  */
 static void
-cost_tuplesort(Cost *startup_cost, Cost *run_cost,
+cost_tuplesort(PlannerInfo *root, List *pathkeys, Cost *startup_cost, Cost *run_cost,
 			   double tuples, int width,
 			   Cost comparison_cost, int sort_mem,
 			   double limit_tuples)
@@ -1809,9 +2127,6 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost,
 	if (tuples < 2.0)
 		tuples = 2.0;
 
-	/* Include the default cost-per-comparison */
-	comparison_cost += 2.0 * cpu_operator_cost;
-
 	/* Do we have a useful LIMIT? */
 	if (limit_tuples > 0 && limit_tuples < tuples)
 	{
@@ -1835,12 +2150,10 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost,
 		double		log_runs;
 		double		npageaccesses;
 
-		/*
-		 * CPU costs
-		 *
-		 * Assume about N log2 N comparisons
-		 */
-		*startup_cost = comparison_cost * tuples * LOG2(tuples);
+		/* CPU costs */
+		*startup_cost = compute_cpu_sort_cost(root, pathkeys, 0,
+											  comparison_cost, tuples,
+											  tuples, false);
 
 		/* Disk costs */
 
@@ -1856,18 +2169,17 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost,
 	}
 	else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
 	{
-		/*
-		 * We'll use a bounded heap-sort keeping just K tuples in memory, for
-		 * a total number of tuple comparisons of N log2 K; but the constant
-		 * factor is a bit higher than for quicksort.  Tweak it so that the
-		 * cost curve is continuous at the crossover point.
-		 */
-		*startup_cost = comparison_cost * tuples * LOG2(2.0 * output_tuples);
+		/* We'll use a bounded heap-sort keeping just K tuples in memory. */
+		*startup_cost = compute_cpu_sort_cost(root, pathkeys, 0,
+											  comparison_cost, tuples,
+											  output_tuples, true);
 	}
 	else
 	{
 		/* We'll use plain quicksort on all the input tuples */
-		*startup_cost = comparison_cost * tuples * LOG2(tuples);
+		*startup_cost = compute_cpu_sort_cost(root, pathkeys, 0,
+											  comparison_cost, tuples,
+											  tuples, false);
 	}
 
 	/*
@@ -1900,8 +2212,8 @@ cost_incremental_sort(Path *path,
 					  double input_tuples, int width, Cost comparison_cost, int sort_mem,
 					  double limit_tuples)
 {
-	Cost		startup_cost = 0,
-				run_cost = 0,
+	Cost		startup_cost,
+				run_cost,
 				input_run_cost = input_total_cost - input_startup_cost;
 	double		group_tuples,
 				input_groups;
@@ -1986,7 +2298,7 @@ cost_incremental_sort(Path *path,
 	 * pessimistic about incremental sort performance and increase its average
 	 * group size by half.
 	 */
-	cost_tuplesort(&group_startup_cost, &group_run_cost,
+	cost_tuplesort(root, pathkeys, &group_startup_cost, &group_run_cost,
 				   1.5 * group_tuples, width, comparison_cost, sort_mem,
 				   limit_tuples);
 
@@ -1994,7 +2306,7 @@ cost_incremental_sort(Path *path,
 	 * Startup cost of incremental sort is the startup cost of its first group
 	 * plus the cost of its input.
 	 */
-	startup_cost += group_startup_cost
+	startup_cost = group_startup_cost
 		+ input_startup_cost + group_input_run_cost;
 
 	/*
@@ -2003,7 +2315,7 @@ cost_incremental_sort(Path *path,
 	 * group, plus the total cost to process the remaining groups, plus the
 	 * remaining cost of input.
 	 */
-	run_cost += group_run_cost
+	run_cost = group_run_cost
 		+ (group_run_cost + group_startup_cost) * (input_groups - 1)
 		+ group_input_run_cost * (input_groups - 1);
 
@@ -2043,7 +2355,7 @@ cost_sort(Path *path, PlannerInfo *root,
 	Cost		startup_cost;
 	Cost		run_cost;
 
-	cost_tuplesort(&startup_cost, &run_cost,
+	cost_tuplesort(root, pathkeys, &startup_cost, &run_cost,
 				   tuples, width,
 				   comparison_cost, sort_mem,
 				   limit_tuples);
@@ -2141,7 +2453,7 @@ append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers)
  *	  Determines and returns the cost of an Append node.
  */
 void
-cost_append(AppendPath *apath)
+cost_append(AppendPath *apath, PlannerInfo *root)
 {
 	ListCell   *l;
 
@@ -2209,7 +2521,7 @@ cost_append(AppendPath *apath)
 					 * any child.
 					 */
 					cost_sort(&sort_path,
-							  NULL, /* doesn't currently need root */
+							  root,
 							  pathkeys,
 							  subpath->total_cost,
 							  subpath->rows,
diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c
index 8c6770de972..258302840f8 100644
--- a/src/backend/optimizer/path/equivclass.c
+++ b/src/backend/optimizer/path/equivclass.c
@@ -681,7 +681,18 @@ get_eclass_for_sort_expr(PlannerInfo *root,
 
 			if (opcintype == cur_em->em_datatype &&
 				equal(expr, cur_em->em_expr))
-				return cur_ec;	/* Match! */
+			{
+				/*
+				 * Match!
+				 *
+				 * Copy the sortref if it wasn't set yet. That may happen if the
+				 * ec was constructed from WHERE clause, i.e. it doesn't have a
+				 * target reference at all.
+				 */
+				if (cur_ec->ec_sortref == 0 && sortref > 0)
+					cur_ec->ec_sortref = sortref;
+				return cur_ec;
+			}
 		}
 	}
 
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index 86a35cdef17..75fe03fd04b 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -17,17 +17,22 @@
  */
 #include "postgres.h"
 
+#include "miscadmin.h"
 #include "access/stratnum.h"
 #include "catalog/pg_opfamily.h"
 #include "nodes/makefuncs.h"
 #include "nodes/nodeFuncs.h"
 #include "nodes/plannodes.h"
+#include "optimizer/cost.h"
 #include "optimizer/optimizer.h"
 #include "optimizer/pathnode.h"
 #include "optimizer/paths.h"
 #include "partitioning/partbounds.h"
 #include "utils/lsyscache.h"
+#include "utils/selfuncs.h"
 
+/* Consider reordering of GROUP BY keys? */
+bool enable_group_by_reordering = true;
 
 static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys);
 static bool matches_boolean_partition_clause(RestrictInfo *rinfo,
@@ -335,6 +340,517 @@ pathkeys_contained_in(List *keys1, List *keys2)
 }
 
 /*
+ * group_keys_reorder_by_pathkeys
+ *		Reorder GROUP BY keys to match pathkeys of input path.
+ *
+ * Function returns new lists (pathkeys and clauses), original GROUP BY lists
+ * stay untouched.
+ *
+ * Returns the number of GROUP BY keys with a matching pathkey.
+ */
+int
+group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys,
+							   List **group_clauses)
+{
+	List	   *new_group_pathkeys= NIL,
+			   *new_group_clauses = NIL;
+	ListCell   *lc;
+	int			n;
+
+	if (pathkeys == NIL || *group_pathkeys == NIL)
+		return 0;
+
+	/*
+	 * Walk the pathkeys (determining ordering of the input path) and see if
+	 * there's a matching GROUP BY key. If we find one, we append it to the
+	 * list, and do the same for the clauses.
+	 *
+	 * Once we find the first pathkey without a matching GROUP BY key, the rest
+	 * of the pathkeys are useless and can't be used to evaluate the grouping,
+	 * so we abort the loop and ignore the remaining pathkeys.
+	 *
+	 * XXX Pathkeys are built in a way to allow simply comparing pointers.
+	 */
+	foreach(lc, pathkeys)
+	{
+		PathKey			*pathkey = (PathKey *) lfirst(lc);
+		SortGroupClause	*sgc;
+
+		/* abort on first mismatch */
+		if (!list_member_ptr(*group_pathkeys, pathkey))
+			break;
+
+		new_group_pathkeys = lappend(new_group_pathkeys, pathkey);
+
+		sgc = get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref,
+									  *group_clauses);
+
+		new_group_clauses = lappend(new_group_clauses, sgc);
+	}
+
+	/* remember the number of pathkeys with a matching GROUP BY key */
+	n = list_length(new_group_pathkeys);
+
+	/* append the remaining group pathkeys (will be treated as not sorted) */
+	*group_pathkeys = list_concat_unique_ptr(new_group_pathkeys,
+											 *group_pathkeys);
+	*group_clauses = list_concat_unique_ptr(new_group_clauses,
+											*group_clauses);
+
+	return n;
+}
+
+/*
+ * Used to generate all permutations of a pathkey list.
+ */
+typedef struct PathkeyMutatorState {
+	List	   *elemsList;
+	ListCell  **elemCells;
+	void	  **elems;
+	int		   *positions;
+	int			 mutatorNColumns;
+	int			 count;
+} PathkeyMutatorState;
+
+
+/*
+ * PathkeyMutatorInit
+ *		Initialize state of the permutation generator.
+ *
+ * We want to generate permutations of elements in the "elems" list. We may want
+ * to skip some number of elements at the beginning (when treating as presorted)
+ * or at the end (we only permute a limited number of group keys).
+ *
+ * The list is decomposed into elements, and we also keep pointers to individual
+ * cells. This allows us to build the permuted list quickly and cheaply, without
+ * creating any copies.
+ */
+static void
+PathkeyMutatorInit(PathkeyMutatorState *state, List *elems, int start, int end)
+{
+	int i;
+	int			n = end - start;
+	ListCell	*lc;
+
+	memset(state, 0, sizeof(*state));
+
+	state->mutatorNColumns = n;
+
+	state->elemsList = list_copy(elems);
+
+	state->elems = palloc(sizeof(void*) * n);
+	state->elemCells = palloc(sizeof(ListCell*) * n);
+	state->positions = palloc(sizeof(int) * n);
+
+	i = 0;
+	for_each_cell(lc, state->elemsList, list_nth_cell(state->elemsList, start))
+	{
+		state->elemCells[i] = lc;
+		state->elems[i] = lfirst(lc);
+		state->positions[i] = i + 1;
+		i++;
+
+		if (i >= n)
+			break;
+	}
+}
+
+/* Swap two elements of an array. */
+static void
+PathkeyMutatorSwap(int *a, int i, int j)
+{
+  int s = a[i];
+
+  a[i] = a[j];
+  a[j] = s;
+}
+
+/*
+ * Generate the next permutation of elements.
+ */
+static bool
+PathkeyMutatorNextSet(int *a, int n)
+{
+	int j, k, l, r;
+
+	j = n - 2;
+
+	while (j >= 0 && a[j] >= a[j + 1])
+		j--;
+
+	if (j < 0)
+		return false;
+
+	k = n - 1;
+
+	while (k >= 0 && a[j] >= a[k])
+		k--;
+
+	PathkeyMutatorSwap(a, j, k);
+
+	l = j + 1;
+	r = n - 1;
+
+	while (l < r)
+		PathkeyMutatorSwap(a, l++, r--);
+
+	return true;
+}
+
+/*
+ * PathkeyMutatorNext
+ *		Generate the next permutation of list of elements.
+ *
+ * Returns the next permutation (as a list of elements) or NIL if there are no
+ * more permutations.
+ */
+static List *
+PathkeyMutatorNext(PathkeyMutatorState *state)
+{
+	int	i;
+
+	state->count++;
+
+	/* first permutation is original list */
+	if (state->count == 1)
+		return state->elemsList;
+
+	/* when there are no more permutations, return NIL */
+	if (!PathkeyMutatorNextSet(state->positions, state->mutatorNColumns))
+	{
+		pfree(state->elems);
+		pfree(state->elemCells);
+		pfree(state->positions);
+
+		list_free(state->elemsList);
+
+		return NIL;
+	}
+
+	/* update the list cells to point to the right elements */
+	for(i = 0; i < state->mutatorNColumns; i++)
+		lfirst(state->elemCells[i]) =
+			(void *) state->elems[ state->positions[i] - 1 ];
+
+	return state->elemsList;
+}
+
+/*
+ * Cost of comparing pathkeys.
+ */
+typedef struct PathkeySortCost
+{
+	Cost		cost;
+	PathKey	   *pathkey;
+} PathkeySortCost;
+
+static int
+pathkey_sort_cost_comparator(const void *_a, const void *_b)
+{
+	const PathkeySortCost *a = (PathkeySortCost *) _a;
+	const PathkeySortCost *b = (PathkeySortCost *) _b;
+
+	if (a->cost < b->cost)
+		return -1;
+	else if (a->cost == b->cost)
+		return 0;
+	return 1;
+}
+
+/*
+ * get_cheapest_group_keys_order
+ *		Reorders the group pathkeys/clauses to minimize the comparison cost.
+ *
+ * Given a list of pathkeys, we try to reorder them in a way that minimizes
+ * the CPU cost of sorting. This depends mainly on the cost of comparator
+ * function (the pathkeys may use different data types) and the number of
+ * distinct values in each column (which affects the number of comparator
+ * calls for the following pathkeys).
+ *
+ * In case the input is partially sorted, only the remaining pathkeys are
+ * considered.
+ *
+ * Returns true if the keys/clauses have been reordered (or might have been),
+ * and a new list is returned through an argument. The list is a new copy
+ * and may be freed using list_free.
+ *
+ * Returns false if no reordering was possible.
+ */
+static bool
+get_cheapest_group_keys_order(PlannerInfo *root, double nrows,
+							  List **group_pathkeys, List **group_clauses,
+							  int n_preordered)
+{
+	List		   *new_group_pathkeys = NIL,
+				   *new_group_clauses = NIL,
+				   *var_group_pathkeys;
+
+	ListCell	   *cell;
+	PathkeyMutatorState	mstate;
+	double			cheapest_sort_cost = -1.0;
+
+	int nFreeKeys;
+	int nToPermute;
+
+	/* If there are less than 2 unsorted pathkeys, we're done. */
+	if (list_length(*group_pathkeys) - n_preordered < 2)
+		return false;
+
+	/*
+	 * We could exhaustively cost all possible orderings of the pathkeys, but for
+	 * a large number of pathkeys it might be prohibitively expensive. So we try
+	 * to apply simple cheap heuristics first - we sort the pathkeys by sort cost
+	 * (as if the pathkey was sorted independently) and then check only the four
+	 * cheapest pathkeys. The remaining pathkeys are kept ordered by cost.
+	 *
+	 * XXX This is a very simple heuristics, but likely to work fine for most
+	 * cases (because the number of GROUP BY clauses tends to be lower than 4).
+	 * But it ignores how the number of distinct values in each pathkey affects
+	 * the following steps. It might be better to use "more expensive" pathkey
+	 * first if it has many distinct values, because it then limits the number
+	 * of comparisons for the remaining pathkeys. But evaluating that is likely
+	 * quite the expensive.
+	 */
+	nFreeKeys = list_length(*group_pathkeys) - n_preordered;
+	nToPermute = 4;
+	if (nFreeKeys > nToPermute)
+	{
+		int i;
+		PathkeySortCost *costs = palloc(sizeof(PathkeySortCost) * nFreeKeys);
+
+		/* skip the pre-ordered pathkeys */
+		cell = list_nth_cell(*group_pathkeys, n_preordered);
+
+		/* estimate cost for sorting individual pathkeys */
+		for (i = 0; cell != NULL; i++, (cell = lnext(*group_pathkeys, cell)))
+		{
+			List *to_cost = list_make1(lfirst(cell));
+
+			Assert(i < nFreeKeys);
+
+			costs[i].pathkey = lfirst(cell);
+			costs[i].cost = cost_sort_estimate(root, to_cost, 0, nrows);
+
+			pfree(to_cost);
+		}
+
+		/* sort the pathkeys by sort cost in ascending order */
+		qsort(costs, nFreeKeys, sizeof(*costs), pathkey_sort_cost_comparator);
+
+		/*
+		 * Rebuild the list of pathkeys - first the preordered ones, then the
+		 * rest ordered by cost.
+		 */
+		new_group_pathkeys = list_truncate(list_copy(*group_pathkeys), n_preordered);
+
+		for (i = 0; i < nFreeKeys; i++)
+			new_group_pathkeys = lappend(new_group_pathkeys, costs[i].pathkey);
+
+		pfree(costs);
+	}
+	else
+	{
+		/* Copy the list, so that we can free the new list by list_free. */
+		new_group_pathkeys = list_copy(*group_pathkeys);
+		nToPermute = nFreeKeys;
+	}
+
+	Assert(list_length(new_group_pathkeys) == list_length(*group_pathkeys));
+
+	/*
+	 * Generate pathkey lists with permutations of the first nToPermute pathkeys.
+	 *
+	 * XXX We simply calculate sort cost for each individual pathkey list, but
+	 * there's room for two dynamic programming optimizations here. Firstly, we
+	 * may pass the current "best" cost to cost_sort_estimate so that it can
+	 * "abort" if the estimated pathkeys list exceeds it. Secondly, it could pass
+	 * the return information about the position when it exceeded the cost, and
+	 * we could skip all permutations with the same prefix.
+	 *
+	 * Imagine we've already found ordering with cost C1, and we're evaluating
+	 * another ordering - cost_sort_estimate() calculates cost by adding the
+	 * pathkeys one by one (more or less), and the cost only grows. If at any
+	 * point it exceeds C1, it can't possibly be "better" so we can discard it.
+	 * But we also know that we can discard all ordering with the same prefix,
+	 * because if we're estimating (a,b,c,d) and we exceed C1 at (a,b) then the
+	 * same thing will happen for any ordering with this prefix.
+	 */
+	PathkeyMutatorInit(&mstate, new_group_pathkeys, n_preordered, n_preordered + nToPermute);
+
+	while((var_group_pathkeys = PathkeyMutatorNext(&mstate)) != NIL)
+	{
+		Cost	cost;
+
+		cost = cost_sort_estimate(root, var_group_pathkeys, n_preordered, nrows);
+
+		if (cost < cheapest_sort_cost || cheapest_sort_cost < 0)
+		{
+			list_free(new_group_pathkeys);
+			new_group_pathkeys = list_copy(var_group_pathkeys);
+			cheapest_sort_cost = cost;
+		}
+	}
+
+	/* Reorder the group clauses according to the reordered pathkeys. */
+	foreach(cell, new_group_pathkeys)
+	{
+		PathKey			*pathkey = (PathKey *) lfirst(cell);
+
+		new_group_clauses = lappend(new_group_clauses,
+						get_sortgroupref_clause(pathkey->pk_eclass->ec_sortref,
+												*group_clauses));
+	}
+
+	/* Just append the rest GROUP BY clauses */
+	new_group_clauses = list_concat_unique_ptr(new_group_clauses,
+											   *group_clauses);
+
+	*group_pathkeys = new_group_pathkeys;
+	*group_clauses = new_group_clauses;
+
+	return true;
+}
+
+/*
+ * get_useful_group_keys_orderings
+ *		Determine which orderings of GROUP BY keys are potentially interesting.
+ *
+ * Returns list of PathKeyInfo items, each representing an interesting ordering
+ * of GROUP BY keys. Each item stores pathkeys and clauses in matching order.
+ *
+ * The function considers (and keeps) multiple group by orderings:
+ *
+ * - the original ordering, as specified by the GROUP BY clause
+ *
+ * - GROUP BY keys reordered to minimize the sort cost
+ *
+ * - GROUP BY keys reordered to match path ordering (as much as possible), with
+ *   the tail reordered to minimize the sort cost
+ *
+ * - GROUP BY keys to match target ORDER BY clause (as much as possible), with
+ *   the tail reordered to minimize the sort cost
+ *
+ * There are other potentially interesting orderings (e.g. it might be best to
+ * match the first ORDER BY key, order the remaining keys differently and then
+ * rely on the incremental sort to fix this), but we ignore those for now. To
+ * make this work we'd have to pretty much generate all possible permutations.
+ */
+List *
+get_useful_group_keys_orderings(PlannerInfo *root, double nrows,
+								List *path_pathkeys,
+								List *group_pathkeys, List *group_clauses)
+{
+	Query	   *parse = root->parse;
+	List	   *infos = NIL;
+	PathKeyInfo *info;
+	int			n_preordered = 0;
+
+	List *pathkeys = group_pathkeys;
+	List *clauses = group_clauses;
+
+	/* always return at least the original pathkeys/clauses */
+	info = makeNode(PathKeyInfo);
+	info->pathkeys = pathkeys;
+	info->clauses = clauses;
+
+	infos = lappend(infos, info);
+
+	/*
+	 * Should we try generating alternative orderings of the group keys? If not,
+	 * we produce only the order specified in the query, i.e. the optimization
+	 * is effectively disabled.
+	 */
+	if (!enable_group_by_reordering)
+		return infos;
+
+	/* for grouping sets we can't do any reordering */
+	if (parse->groupingSets)
+		return infos;
+
+	/*
+	 * Try reordering pathkeys to minimize the sort cost, ignoring both the
+	 * target ordering (ORDER BY) and ordering of the input path.
+	 */
+	if (get_cheapest_group_keys_order(root, nrows, &pathkeys, &clauses,
+									  n_preordered))
+	{
+		info = makeNode(PathKeyInfo);
+		info->pathkeys = pathkeys;
+		info->clauses = clauses;
+
+		infos = lappend(infos, info);
+	}
+
+	/*
+	 * If the path is sorted in some way, try reordering the group keys to match
+	 * as much of the ordering as possible - we get this sort for free (mostly).
+	 *
+	 * We must not do this when there are no grouping sets, because those use
+	 * more complex logic to decide the ordering.
+	 *
+	 * XXX Isn't this somewhat redundant with presorted_keys? Actually, it's
+	 * more a complement, because it allows benefiting from incremental sort
+	 * as much as possible.
+	 *
+	 * XXX This does nothing if (n_preordered == 0). We shouldn't create the
+	 * info in this case.
+	 */
+	if (path_pathkeys)
+	{
+		n_preordered = group_keys_reorder_by_pathkeys(path_pathkeys,
+													  &pathkeys,
+													  &clauses);
+
+		/* reorder the tail to minimize sort cost */
+		get_cheapest_group_keys_order(root, nrows, &pathkeys, &clauses,
+									  n_preordered);
+
+		/*
+		 * reorder the tail to minimize sort cost
+		 *
+		 * XXX Ignore the return value - there may be nothing to reorder, in
+		 * which case get_cheapest_group_keys_order returns false. But we
+		 * still want to keep the keys reordered to path_pathkeys.
+		 */
+		info = makeNode(PathKeyInfo);
+		info->pathkeys = pathkeys;
+		info->clauses = clauses;
+
+		infos = lappend(infos, info);
+	}
+
+	/*
+	 * Try reordering pathkeys to minimize the sort cost (this time consider
+	 * the ORDER BY clause, but only if set debug_group_by_match_order_by).
+	 */
+	if (root->sort_pathkeys)
+	{
+		n_preordered = group_keys_reorder_by_pathkeys(root->sort_pathkeys,
+													  &pathkeys,
+													  &clauses);
+
+		/*
+		 * reorder the tail to minimize sort cost
+		 *
+		 * XXX Ignore the return value - there may be nothing to reorder, in
+		 * which case get_cheapest_group_keys_order returns false. But we
+		 * still want to keep the keys reordered to sort_pathkeys.
+		 */
+		get_cheapest_group_keys_order(root, nrows, &pathkeys, &clauses,
+									  n_preordered);
+
+		/* keep the group keys reordered to match ordering of input path */
+		info = makeNode(PathKeyInfo);
+		info->pathkeys = pathkeys;
+		info->clauses = clauses;
+
+		infos = lappend(infos, info);
+	}
+
+	return infos;
+}
+
+/*
  * pathkeys_count_contained_in
  *    Same as pathkeys_contained_in, but also sets length of longest
  *    common prefix of keys1 and keys2.
@@ -1863,6 +2379,54 @@ pathkeys_useful_for_ordering(PlannerInfo *root, List *pathkeys)
 }
 
 /*
+ * pathkeys_useful_for_grouping
+ *		Count the number of pathkeys that are useful for grouping (instead of
+ *		explicit sort)
+ *
+ * Group pathkeys could be reordered to benefit from the odering. The ordering
+ * may not be "complete" and may require incremental sort, but that's fine. So
+ * we simply count prefix pathkeys with a matching group key, and stop once we
+ * find the first pathkey without a match.
+ *
+ * So e.g. with pathkeys (a,b,c) and group keys (a,b,e) this determines (a,b)
+ * pathkeys are useful for grouping, and we might do incremental sort to get
+ * path ordered by (a,b,e).
+ *
+ * This logic is necessary to retain paths with ordeding not matching grouping
+ * keys directly, without the reordering.
+ *
+ * Returns the length of pathkey prefix with matching group keys.
+ */
+static int
+pathkeys_useful_for_grouping(PlannerInfo *root, List *pathkeys)
+{
+	ListCell *key;
+	int		  n = 0;
+
+	/* no special ordering requested for grouping */
+	if (root->group_pathkeys == NIL)
+		return 0;
+
+	/* unordered path */
+	if (pathkeys == NIL)
+		return 0;
+
+	/* walk the pathkeys and search for matching group key */
+	foreach(key, pathkeys)
+	{
+		PathKey	*pathkey = (PathKey *) lfirst(key);
+
+		/* no matching group key, we're done */
+		if (!list_member_ptr(root->group_pathkeys, pathkey))
+			break;
+
+		n++;
+	}
+
+	return n;
+}
+
+/*
  * truncate_useless_pathkeys
  *		Shorten the given pathkey list to just the useful pathkeys.
  */
@@ -1878,6 +2442,9 @@ truncate_useless_pathkeys(PlannerInfo *root,
 	nuseful2 = pathkeys_useful_for_ordering(root, pathkeys);
 	if (nuseful2 > nuseful)
 		nuseful = nuseful2;
+	nuseful2 = pathkeys_useful_for_grouping(root, pathkeys);
+	if (nuseful2 > nuseful)
+		nuseful = nuseful2;
 
 	/*
 	 * Note: not safe to modify input list destructively, but we can avoid
@@ -1911,6 +2478,8 @@ has_useful_pathkeys(PlannerInfo *root, RelOptInfo *rel)
 {
 	if (rel->joininfo != NIL || rel->has_eclass_joins)
 		return true;			/* might be able to use pathkeys for merging */
+	if (root->group_pathkeys != NIL)
+		return true;			/* might be able to use pathkeys for grouping */
 	if (root->query_pathkeys != NIL)
 		return true;			/* might be able to use them for ordering */
 	return false;				/* definitely useless */