5 files changed, 200 insertions, 22 deletions
diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c
index c1c4f9f8b9d..5f3908be519 100644
--- a/src/backend/optimizer/plan/initsplan.c
+++ b/src/backend/optimizer/plan/initsplan.c
@@ -400,17 +400,13 @@ add_vars_to_attr_needed(PlannerInfo *root, List *vars,
  *
  * Since some other DBMSes do not allow references to ungrouped columns, it's
  * not unusual to find all columns listed in GROUP BY even though listing the
- * primary-key columns would be sufficient.  Deleting such excess columns
- * avoids redundant sorting work, so it's worth doing.
+ * primary-key columns, or columns of a unique constraint would be sufficient.
+ * Deleting such excess columns avoids redundant sorting or hashing work, so
+ * it's worth doing.
  *
  * Relcache invalidations will ensure that cached plans become invalidated
- * when the underlying index of the pkey constraint is dropped.
- *
- * Currently, we only make use of pkey constraints for this, however, we may
- * wish to take this further in the future and also use unique constraints
- * which have NOT NULL columns.  In that case, plan invalidation will still
- * work since relations will receive a relcache invalidation when a NOT NULL
- * constraint is dropped.
+ * when the underlying supporting indexes are dropped or if a column's NOT
+ * NULL attribute is removed.
  */
 void
 remove_useless_groupby_columns(PlannerInfo *root)
@@ -418,6 +414,7 @@ remove_useless_groupby_columns(PlannerInfo *root)
 	Query	   *parse = root->parse;
 	Bitmapset **groupbyattnos;
 	Bitmapset **surplusvars;
+	bool		tryremove = false;
 	ListCell   *lc;
 	int			relid;
 
@@ -457,11 +454,25 @@ remove_useless_groupby_columns(PlannerInfo *root)
 		/* OK, remember we have this Var */
 		relid = var->varno;
 		Assert(relid <= list_length(parse->rtable));
+
+		/*
+		 * If this isn't the first column for this relation then we now have
+		 * multiple columns.  That means there might be some that can be
+		 * removed.
+		 */
+		tryremove |= !bms_is_empty(groupbyattnos[relid]);
 		groupbyattnos[relid] = bms_add_member(groupbyattnos[relid],
 											  var->varattno - FirstLowInvalidHeapAttributeNumber);
 	}
 
 	/*
+	 * No Vars or didn't find multiple Vars for any relation in the GROUP BY?
+	 * If so, nothing can be removed, so don't waste more effort trying.
+	 */
+	if (!tryremove)
+		return;
+
+	/*
 	 * Consider each relation and see if it is possible to remove some of its
 	 * Vars from GROUP BY.  For simplicity and speed, we do the actual removal
 	 * in a separate pass.  Here, we just fill surplusvars[k] with a bitmapset
@@ -472,9 +483,10 @@ remove_useless_groupby_columns(PlannerInfo *root)
 	foreach(lc, parse->rtable)
 	{
 		RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc);
+		RelOptInfo *rel;
 		Bitmapset  *relattnos;
-		Bitmapset  *pkattnos;
-		Oid			constraintOid;
+		Bitmapset  *best_keycolumns = NULL;
+		int32		best_nkeycolumns = PG_INT32_MAX;
 
 		relid++;
 
@@ -495,19 +507,83 @@ remove_useless_groupby_columns(PlannerInfo *root)
 		if (bms_membership(relattnos) != BMS_MULTIPLE)
 			continue;
 
-		/*
-		 * Can't remove any columns for this rel if there is no suitable
-		 * (i.e., nondeferrable) primary key constraint.
-		 */
-		pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid);
-		if (pkattnos == NULL)
-			continue;
+		rel = root->simple_rel_array[relid];
 
 		/*
-		 * If the primary key is a proper subset of relattnos then we have
-		 * some items in the GROUP BY that can be removed.
+		 * Now check each index for this relation to see if there are any with
+		 * columns which are a proper subset of the grouping columns for this
+		 * relation.
 		 */
-		if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1)
+		foreach_node(IndexOptInfo, index, rel->indexlist)
+		{
+			Bitmapset  *ind_attnos;
+			bool		nulls_check_ok;
+
+			/*
+			 * Skip any non-unique and deferrable indexes.  Predicate indexes
+			 * have not been checked yet, so we must skip those too as the
+			 * predOK check that's done later might fail.
+			 */
+			if (!index->unique || !index->immediate || index->indpred != NIL)
+				continue;
+
+			/* For simplicity, we currently don't support expression indexes */
+			if (index->indexprs != NIL)
+				continue;
+
+			ind_attnos = NULL;
+			nulls_check_ok = true;
+			for (int i = 0; i < index->nkeycolumns; i++)
+			{
+				/*
+				 * We must insist that the index columns are all defined NOT
+				 * NULL otherwise duplicate NULLs could exist.  However, we
+				 * can relax this check when the index is defined with NULLS
+				 * NOT DISTINCT as there can only be 1 NULL row, therefore
+				 * functional dependency on the unique columns is maintained,
+				 * despite the NULL.
+				 */
+				if (!index->nullsnotdistinct &&
+					!bms_is_member(index->indexkeys[i],
+								   rel->notnullattnums))
+				{
+					nulls_check_ok = false;
+					break;
+				}
+
+				ind_attnos =
+					bms_add_member(ind_attnos,
+								   index->indexkeys[i] -
+								   FirstLowInvalidHeapAttributeNumber);
+			}
+
+			if (!nulls_check_ok)
+				continue;
+
+			/*
+			 * Skip any indexes where the indexed columns aren't a proper
+			 * subset of the GROUP BY.
+			 */
+			if (bms_subset_compare(ind_attnos, relattnos) != BMS_SUBSET1)
+				continue;
+
+			/*
+			 * Record the attribute numbers from the index with the fewest
+			 * columns.  This allows the largest number of columns to be
+			 * removed from the GROUP BY clause.  In the future, we may wish
+			 * to consider using the narrowest set of columns and looking at
+			 * pg_statistic.stawidth as it might be better to use an index
+			 * with, say two INT4s, rather than, say, one long varlena column.
+			 */
+			if (index->nkeycolumns < best_nkeycolumns)
+			{
+				best_keycolumns = ind_attnos;
+				best_nkeycolumns = index->nkeycolumns;
+			}
+		}
+
+		/* Did we find a suitable index? */
+		if (!bms_is_empty(best_keycolumns))
 		{
 			/*
 			 * To easily remember whether we've found anything to do, we don't
@@ -518,7 +594,7 @@ remove_useless_groupby_columns(PlannerInfo *root)
 													 (list_length(parse->rtable) + 1));
 
 			/* Remember the attnos of the removable columns */
-			surplusvars[relid] = bms_difference(relattnos, pkattnos);
+			surplusvars[relid] = bms_difference(relattnos, best_keycolumns);
 		}
 	}
 
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 37b0ca2e439..153390f2dc9 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -457,6 +457,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			info->indrestrictinfo = NIL;	/* set later, in indxpath.c */
 			info->predOK = false;	/* set later, in indxpath.c */
 			info->unique = index->indisunique;
+			info->nullsnotdistinct = index->indnullsnotdistinct;
 			info->immediate = index->indimmediate;
 			info->hypothetical = false;
 
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index add0f9e45fc..0759e00e96d 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1187,6 +1187,8 @@ struct IndexOptInfo
 	bool		predOK;
 	/* true if a unique index */
 	bool		unique;
+	/* true if the index was defined with NULLS NOT DISTINCT */
+	bool		nullsnotdistinct;
 	/* is uniqueness enforced immediately? */
 	bool		immediate;
 	/* true if index doesn't really exist */
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 1e682565d13..f2fb66388cc 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -1448,6 +1448,73 @@ explain (costs off) select * from p_t1 group by a,b,c,d;
          ->  Seq Scan on p_t1_2
 (5 rows)
 
+create unique index t3_c_uidx on t3(c);
+-- Ensure we don't remove any columns from the GROUP BY for a unique
+-- index on a NULLable column.
+explain (costs off) select b,c from t3 group by b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: b, c
+   ->  Seq Scan on t3
+(3 rows)
+
+-- Make the column NOT NULL and ensure we remove the redundant column
+alter table t3 alter column c set not null;
+explain (costs off) select b,c from t3 group by b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: c
+   ->  Seq Scan on t3
+(3 rows)
+
+-- When there are multiple supporting unique indexes and the GROUP BY contains
+-- columns to cover all of those, ensure we pick the index with the least
+-- number of columns so that we can remove more columns from the GROUP BY.
+explain (costs off) select a,b,c from t3 group by a,b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: c
+   ->  Seq Scan on t3
+(3 rows)
+
+-- As above but try ordering the columns differently to ensure we get the
+-- same result.
+explain (costs off) select a,b,c from t3 group by c,a,b;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: c
+   ->  Seq Scan on t3
+(3 rows)
+
+-- Ensure we don't use a partial index as proof of functional dependency
+drop index t3_c_uidx;
+create index t3_c_uidx on t3 (c) where c > 0;
+explain (costs off) select b,c from t3 group by b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: b, c
+   ->  Seq Scan on t3
+(3 rows)
+
+-- A unique index defined as NULLS NOT DISTINCT does not need a supporting NOT
+-- NULL constraint on the indexed columns.  Ensure the redundant columns are
+-- removed from the GROUP BY for such a table.
+drop index t3_c_uidx;
+alter table t3 alter column c drop not null;
+create unique index t3_c_uidx on t3(c) nulls not distinct;
+explain (costs off) select b,c from t3 group by b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: c
+   ->  Seq Scan on t3
+(3 rows)
+
 drop table t1 cascade;
 NOTICE:  drop cascades to table t1c
 drop table t2;
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 4885daffe63..77168bcc744 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -507,6 +507,38 @@ create temp table p_t1_2 partition of p_t1 for values in(2);
 -- Ensure we can remove non-PK columns for partitioned tables.
 explain (costs off) select * from p_t1 group by a,b,c,d;
 
+create unique index t3_c_uidx on t3(c);
+
+-- Ensure we don't remove any columns from the GROUP BY for a unique
+-- index on a NULLable column.
+explain (costs off) select b,c from t3 group by b,c;
+
+-- Make the column NOT NULL and ensure we remove the redundant column
+alter table t3 alter column c set not null;
+explain (costs off) select b,c from t3 group by b,c;
+
+-- When there are multiple supporting unique indexes and the GROUP BY contains
+-- columns to cover all of those, ensure we pick the index with the least
+-- number of columns so that we can remove more columns from the GROUP BY.
+explain (costs off) select a,b,c from t3 group by a,b,c;
+
+-- As above but try ordering the columns differently to ensure we get the
+-- same result.
+explain (costs off) select a,b,c from t3 group by c,a,b;
+
+-- Ensure we don't use a partial index as proof of functional dependency
+drop index t3_c_uidx;
+create index t3_c_uidx on t3 (c) where c > 0;
+explain (costs off) select b,c from t3 group by b,c;
+
+-- A unique index defined as NULLS NOT DISTINCT does not need a supporting NOT
+-- NULL constraint on the indexed columns.  Ensure the redundant columns are
+-- removed from the GROUP BY for such a table.
+drop index t3_c_uidx;
+alter table t3 alter column c drop not null;
+create unique index t3_c_uidx on t3(c) nulls not distinct;
+explain (costs off) select b,c from t3 group by b,c;
+
 drop table t1 cascade;
 drop table t2;
 drop table t3;