diff options
-rw-r--r-- | src/backend/optimizer/plan/initsplan.c | 120 | ||||
-rw-r--r-- | src/backend/optimizer/util/plancat.c | 1 | ||||
-rw-r--r-- | src/include/nodes/pathnodes.h | 2 | ||||
-rw-r--r-- | src/test/regress/expected/aggregates.out | 67 | ||||
-rw-r--r-- | src/test/regress/sql/aggregates.sql | 32 |
5 files changed, 200 insertions, 22 deletions
diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index c1c4f9f8b9d..5f3908be519 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -400,17 +400,13 @@ add_vars_to_attr_needed(PlannerInfo *root, List *vars, * * Since some other DBMSes do not allow references to ungrouped columns, it's * not unusual to find all columns listed in GROUP BY even though listing the - * primary-key columns would be sufficient. Deleting such excess columns - * avoids redundant sorting work, so it's worth doing. + * primary-key columns, or columns of a unique constraint would be sufficient. + * Deleting such excess columns avoids redundant sorting or hashing work, so + * it's worth doing. * * Relcache invalidations will ensure that cached plans become invalidated - * when the underlying index of the pkey constraint is dropped. - * - * Currently, we only make use of pkey constraints for this, however, we may - * wish to take this further in the future and also use unique constraints - * which have NOT NULL columns. In that case, plan invalidation will still - * work since relations will receive a relcache invalidation when a NOT NULL - * constraint is dropped. + * when the underlying supporting indexes are dropped or if a column's NOT + * NULL attribute is removed. */ void remove_useless_groupby_columns(PlannerInfo *root) @@ -418,6 +414,7 @@ remove_useless_groupby_columns(PlannerInfo *root) Query *parse = root->parse; Bitmapset **groupbyattnos; Bitmapset **surplusvars; + bool tryremove = false; ListCell *lc; int relid; @@ -457,11 +454,25 @@ remove_useless_groupby_columns(PlannerInfo *root) /* OK, remember we have this Var */ relid = var->varno; Assert(relid <= list_length(parse->rtable)); + + /* + * If this isn't the first column for this relation then we now have + * multiple columns. That means there might be some that can be + * removed. + */ + tryremove |= !bms_is_empty(groupbyattnos[relid]); groupbyattnos[relid] = bms_add_member(groupbyattnos[relid], var->varattno - FirstLowInvalidHeapAttributeNumber); } /* + * No Vars or didn't find multiple Vars for any relation in the GROUP BY? + * If so, nothing can be removed, so don't waste more effort trying. + */ + if (!tryremove) + return; + + /* * Consider each relation and see if it is possible to remove some of its * Vars from GROUP BY. For simplicity and speed, we do the actual removal * in a separate pass. Here, we just fill surplusvars[k] with a bitmapset @@ -472,9 +483,10 @@ remove_useless_groupby_columns(PlannerInfo *root) foreach(lc, parse->rtable) { RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc); + RelOptInfo *rel; Bitmapset *relattnos; - Bitmapset *pkattnos; - Oid constraintOid; + Bitmapset *best_keycolumns = NULL; + int32 best_nkeycolumns = PG_INT32_MAX; relid++; @@ -495,19 +507,83 @@ remove_useless_groupby_columns(PlannerInfo *root) if (bms_membership(relattnos) != BMS_MULTIPLE) continue; - /* - * Can't remove any columns for this rel if there is no suitable - * (i.e., nondeferrable) primary key constraint. - */ - pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid); - if (pkattnos == NULL) - continue; + rel = root->simple_rel_array[relid]; /* - * If the primary key is a proper subset of relattnos then we have - * some items in the GROUP BY that can be removed. + * Now check each index for this relation to see if there are any with + * columns which are a proper subset of the grouping columns for this + * relation. */ - if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1) + foreach_node(IndexOptInfo, index, rel->indexlist) + { + Bitmapset *ind_attnos; + bool nulls_check_ok; + + /* + * Skip any non-unique and deferrable indexes. Predicate indexes + * have not been checked yet, so we must skip those too as the + * predOK check that's done later might fail. + */ + if (!index->unique || !index->immediate || index->indpred != NIL) + continue; + + /* For simplicity, we currently don't support expression indexes */ + if (index->indexprs != NIL) + continue; + + ind_attnos = NULL; + nulls_check_ok = true; + for (int i = 0; i < index->nkeycolumns; i++) + { + /* + * We must insist that the index columns are all defined NOT + * NULL otherwise duplicate NULLs could exist. However, we + * can relax this check when the index is defined with NULLS + * NOT DISTINCT as there can only be 1 NULL row, therefore + * functional dependency on the unique columns is maintained, + * despite the NULL. + */ + if (!index->nullsnotdistinct && + !bms_is_member(index->indexkeys[i], + rel->notnullattnums)) + { + nulls_check_ok = false; + break; + } + + ind_attnos = + bms_add_member(ind_attnos, + index->indexkeys[i] - + FirstLowInvalidHeapAttributeNumber); + } + + if (!nulls_check_ok) + continue; + + /* + * Skip any indexes where the indexed columns aren't a proper + * subset of the GROUP BY. + */ + if (bms_subset_compare(ind_attnos, relattnos) != BMS_SUBSET1) + continue; + + /* + * Record the attribute numbers from the index with the fewest + * columns. This allows the largest number of columns to be + * removed from the GROUP BY clause. In the future, we may wish + * to consider using the narrowest set of columns and looking at + * pg_statistic.stawidth as it might be better to use an index + * with, say two INT4s, rather than, say, one long varlena column. + */ + if (index->nkeycolumns < best_nkeycolumns) + { + best_keycolumns = ind_attnos; + best_nkeycolumns = index->nkeycolumns; + } + } + + /* Did we find a suitable index? */ + if (!bms_is_empty(best_keycolumns)) { /* * To easily remember whether we've found anything to do, we don't @@ -518,7 +594,7 @@ remove_useless_groupby_columns(PlannerInfo *root) (list_length(parse->rtable) + 1)); /* Remember the attnos of the removable columns */ - surplusvars[relid] = bms_difference(relattnos, pkattnos); + surplusvars[relid] = bms_difference(relattnos, best_keycolumns); } } diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 37b0ca2e439..153390f2dc9 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -457,6 +457,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->indrestrictinfo = NIL; /* set later, in indxpath.c */ info->predOK = false; /* set later, in indxpath.c */ info->unique = index->indisunique; + info->nullsnotdistinct = index->indnullsnotdistinct; info->immediate = index->indimmediate; info->hypothetical = false; diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index add0f9e45fc..0759e00e96d 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -1187,6 +1187,8 @@ struct IndexOptInfo bool predOK; /* true if a unique index */ bool unique; + /* true if the index was defined with NULLS NOT DISTINCT */ + bool nullsnotdistinct; /* is uniqueness enforced immediately? */ bool immediate; /* true if index doesn't really exist */ diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 1e682565d13..f2fb66388cc 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -1448,6 +1448,73 @@ explain (costs off) select * from p_t1 group by a,b,c,d; -> Seq Scan on p_t1_2 (5 rows) +create unique index t3_c_uidx on t3(c); +-- Ensure we don't remove any columns from the GROUP BY for a unique +-- index on a NULLable column. +explain (costs off) select b,c from t3 group by b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: b, c + -> Seq Scan on t3 +(3 rows) + +-- Make the column NOT NULL and ensure we remove the redundant column +alter table t3 alter column c set not null; +explain (costs off) select b,c from t3 group by b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: c + -> Seq Scan on t3 +(3 rows) + +-- When there are multiple supporting unique indexes and the GROUP BY contains +-- columns to cover all of those, ensure we pick the index with the least +-- number of columns so that we can remove more columns from the GROUP BY. +explain (costs off) select a,b,c from t3 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: c + -> Seq Scan on t3 +(3 rows) + +-- As above but try ordering the columns differently to ensure we get the +-- same result. +explain (costs off) select a,b,c from t3 group by c,a,b; + QUERY PLAN +---------------------- + HashAggregate + Group Key: c + -> Seq Scan on t3 +(3 rows) + +-- Ensure we don't use a partial index as proof of functional dependency +drop index t3_c_uidx; +create index t3_c_uidx on t3 (c) where c > 0; +explain (costs off) select b,c from t3 group by b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: b, c + -> Seq Scan on t3 +(3 rows) + +-- A unique index defined as NULLS NOT DISTINCT does not need a supporting NOT +-- NULL constraint on the indexed columns. Ensure the redundant columns are +-- removed from the GROUP BY for such a table. +drop index t3_c_uidx; +alter table t3 alter column c drop not null; +create unique index t3_c_uidx on t3(c) nulls not distinct; +explain (costs off) select b,c from t3 group by b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: c + -> Seq Scan on t3 +(3 rows) + drop table t1 cascade; NOTICE: drop cascades to table t1c drop table t2; diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 4885daffe63..77168bcc744 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -507,6 +507,38 @@ create temp table p_t1_2 partition of p_t1 for values in(2); -- Ensure we can remove non-PK columns for partitioned tables. explain (costs off) select * from p_t1 group by a,b,c,d; +create unique index t3_c_uidx on t3(c); + +-- Ensure we don't remove any columns from the GROUP BY for a unique +-- index on a NULLable column. +explain (costs off) select b,c from t3 group by b,c; + +-- Make the column NOT NULL and ensure we remove the redundant column +alter table t3 alter column c set not null; +explain (costs off) select b,c from t3 group by b,c; + +-- When there are multiple supporting unique indexes and the GROUP BY contains +-- columns to cover all of those, ensure we pick the index with the least +-- number of columns so that we can remove more columns from the GROUP BY. +explain (costs off) select a,b,c from t3 group by a,b,c; + +-- As above but try ordering the columns differently to ensure we get the +-- same result. +explain (costs off) select a,b,c from t3 group by c,a,b; + +-- Ensure we don't use a partial index as proof of functional dependency +drop index t3_c_uidx; +create index t3_c_uidx on t3 (c) where c > 0; +explain (costs off) select b,c from t3 group by b,c; + +-- A unique index defined as NULLS NOT DISTINCT does not need a supporting NOT +-- NULL constraint on the indexed columns. Ensure the redundant columns are +-- removed from the GROUP BY for such a table. +drop index t3_c_uidx; +alter table t3 alter column c drop not null; +create unique index t3_c_uidx on t3(c) nulls not distinct; +explain (costs off) select b,c from t3 group by b,c; + drop table t1 cascade; drop table t2; drop table t3; |