aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorDavid Rowley <drowley@postgresql.org>2023-01-09 17:15:08 +1300
committerDavid Rowley <drowley@postgresql.org>2023-01-09 17:15:08 +1300
commit3c569049b7b502bb4952483d19ce622ff0af5fd6 (patch)
tree7ce433043ae025e93ed62e5a763d1ed01942f0c4 /src
parent216a784829c2c5f03ab0c43e009126cbb819e9b2 (diff)
downloadpostgresql-3c569049b7b502bb4952483d19ce622ff0af5fd6.tar.gz
postgresql-3c569049b7b502bb4952483d19ce622ff0af5fd6.zip
Allow left join removals and unique joins on partitioned tables
This allows left join removals and unique joins to work with partitioned tables. The planner just lacked sufficient proofs that a given join would not cause any row duplication. Unique indexes currently serve as that proof, so have get_relation_info() populate the indexlist for partitioned tables too. Author: Arne Roland Reviewed-by: Alvaro Herrera, Zhihong Yu, Amit Langote, David Rowley Discussion: https://postgr.es/m/c3b2408b7a39433b8230bbcd02e9f302@index.de
Diffstat (limited to 'src')
-rw-r--r--src/backend/optimizer/util/plancat.c264
-rw-r--r--src/backend/utils/adt/selfuncs.c4
-rw-r--r--src/include/nodes/pathnodes.h10
-rw-r--r--src/test/regress/expected/join.out10
-rw-r--r--src/test/regress/expected/partition_join.out4
-rw-r--r--src/test/regress/sql/join.sql7
-rw-r--r--src/test/regress/sql/partition_join.sql4
7 files changed, 179 insertions, 124 deletions
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 9f158f2421b..d58c4a10782 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -109,7 +109,9 @@ static void set_baserel_partition_constraint(Relation relation,
* If inhparent is true, all we need to do is set up the attr arrays:
* the RelOptInfo actually represents the appendrel formed by an inheritance
* tree, and so the parent rel's physical size and index information isn't
- * important for it.
+ * important for it, however, for partitioned tables, we do populate the
+ * indexlist as the planner uses unique indexes as unique proofs for certain
+ * optimizations.
*/
void
get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
@@ -175,10 +177,14 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
/*
* Make list of indexes. Ignore indexes on system catalogs if told to.
- * Don't bother with indexes for an inheritance parent, either.
+ * Don't bother with indexes from traditional inheritance parents. For
+ * partitioned tables, we need a list of at least unique indexes as these
+ * serve as unique proofs for certain planner optimizations. However,
+ * let's not discriminate here and just record all partitioned indexes
+ * whether they're unique indexes or not.
*/
- if (inhparent ||
- (IgnoreSystemIndexes && IsSystemRelation(relation)))
+ if ((inhparent && relation->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+ || (IgnoreSystemIndexes && IsSystemRelation(relation)))
hasindex = false;
else
hasindex = relation->rd_rel->relhasindex;
@@ -232,16 +238,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
}
/*
- * Ignore partitioned indexes, since they are not usable for
- * queries.
- */
- if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
- {
- index_close(indexRelation, NoLock);
- continue;
- }
-
- /*
* If the index is valid, but cannot yet be used, ignore it; but
* mark the plan we are generating as transient. See
* src/backend/access/heap/README.HOT for discussion.
@@ -285,105 +281,129 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
info->relam = indexRelation->rd_rel->relam;
- /* We copy just the fields we need, not all of rd_indam */
- amroutine = indexRelation->rd_indam;
- info->amcanorderbyop = amroutine->amcanorderbyop;
- info->amoptionalkey = amroutine->amoptionalkey;
- info->amsearcharray = amroutine->amsearcharray;
- info->amsearchnulls = amroutine->amsearchnulls;
- info->amcanparallel = amroutine->amcanparallel;
- info->amhasgettuple = (amroutine->amgettuple != NULL);
- info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
- relation->rd_tableam->scan_bitmap_next_block != NULL;
- info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
- amroutine->amrestrpos != NULL);
- info->amcostestimate = amroutine->amcostestimate;
- Assert(info->amcostestimate != NULL);
-
- /* Fetch index opclass options */
- info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
-
/*
- * Fetch the ordering information for the index, if any.
+ * We don't have an AM for partitioned indexes, so we'll just
+ * NULLify the AM related fields for those.
*/
- if (info->relam == BTREE_AM_OID)
+ if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
{
+ /* We copy just the fields we need, not all of rd_indam */
+ amroutine = indexRelation->rd_indam;
+ info->amcanorderbyop = amroutine->amcanorderbyop;
+ info->amoptionalkey = amroutine->amoptionalkey;
+ info->amsearcharray = amroutine->amsearcharray;
+ info->amsearchnulls = amroutine->amsearchnulls;
+ info->amcanparallel = amroutine->amcanparallel;
+ info->amhasgettuple = (amroutine->amgettuple != NULL);
+ info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
+ relation->rd_tableam->scan_bitmap_next_block != NULL;
+ info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
+ amroutine->amrestrpos != NULL);
+ info->amcostestimate = amroutine->amcostestimate;
+ Assert(info->amcostestimate != NULL);
+
+ /* Fetch index opclass options */
+ info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
+
/*
- * If it's a btree index, we can use its opfamily OIDs
- * directly as the sort ordering opfamily OIDs.
+ * Fetch the ordering information for the index, if any.
*/
- Assert(amroutine->amcanorder);
-
- info->sortopfamily = info->opfamily;
- info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
- info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
-
- for (i = 0; i < nkeycolumns; i++)
+ if (info->relam == BTREE_AM_OID)
{
- int16 opt = indexRelation->rd_indoption[i];
+ /*
+ * If it's a btree index, we can use its opfamily OIDs
+ * directly as the sort ordering opfamily OIDs.
+ */
+ Assert(amroutine->amcanorder);
- info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
- info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
- }
- }
- else if (amroutine->amcanorder)
- {
- /*
- * Otherwise, identify the corresponding btree opfamilies by
- * trying to map this index's "<" operators into btree. Since
- * "<" uniquely defines the behavior of a sort order, this is
- * a sufficient test.
- *
- * XXX This method is rather slow and also requires the
- * undesirable assumption that the other index AM numbers its
- * strategies the same as btree. It'd be better to have a way
- * to explicitly declare the corresponding btree opfamily for
- * each opfamily of the other index type. But given the lack
- * of current or foreseeable amcanorder index types, it's not
- * worth expending more effort on now.
- */
- info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns);
- info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
- info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
+ info->sortopfamily = info->opfamily;
+ info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
+ info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
- for (i = 0; i < nkeycolumns; i++)
- {
- int16 opt = indexRelation->rd_indoption[i];
- Oid ltopr;
- Oid btopfamily;
- Oid btopcintype;
- int16 btstrategy;
-
- info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
- info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
-
- ltopr = get_opfamily_member(info->opfamily[i],
- info->opcintype[i],
- info->opcintype[i],
- BTLessStrategyNumber);
- if (OidIsValid(ltopr) &&
- get_ordering_op_properties(ltopr,
- &btopfamily,
- &btopcintype,
- &btstrategy) &&
- btopcintype == info->opcintype[i] &&
- btstrategy == BTLessStrategyNumber)
+ for (i = 0; i < nkeycolumns; i++)
{
- /* Successful mapping */
- info->sortopfamily[i] = btopfamily;
+ int16 opt = indexRelation->rd_indoption[i];
+
+ info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
+ info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
}
- else
+ }
+ else if (amroutine->amcanorder)
+ {
+ /*
+ * Otherwise, identify the corresponding btree opfamilies
+ * by trying to map this index's "<" operators into btree.
+ * Since "<" uniquely defines the behavior of a sort
+ * order, this is a sufficient test.
+ *
+ * XXX This method is rather slow and also requires the
+ * undesirable assumption that the other index AM numbers
+ * its strategies the same as btree. It'd be better to
+ * have a way to explicitly declare the corresponding
+ * btree opfamily for each opfamily of the other index
+ * type. But given the lack of current or foreseeable
+ * amcanorder index types, it's not worth expending more
+ * effort on now.
+ */
+ info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns);
+ info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
+ info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
+
+ for (i = 0; i < nkeycolumns; i++)
{
- /* Fail ... quietly treat index as unordered */
- info->sortopfamily = NULL;
- info->reverse_sort = NULL;
- info->nulls_first = NULL;
- break;
+ int16 opt = indexRelation->rd_indoption[i];
+ Oid ltopr;
+ Oid btopfamily;
+ Oid btopcintype;
+ int16 btstrategy;
+
+ info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
+ info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
+
+ ltopr = get_opfamily_member(info->opfamily[i],
+ info->opcintype[i],
+ info->opcintype[i],
+ BTLessStrategyNumber);
+ if (OidIsValid(ltopr) &&
+ get_ordering_op_properties(ltopr,
+ &btopfamily,
+ &btopcintype,
+ &btstrategy) &&
+ btopcintype == info->opcintype[i] &&
+ btstrategy == BTLessStrategyNumber)
+ {
+ /* Successful mapping */
+ info->sortopfamily[i] = btopfamily;
+ }
+ else
+ {
+ /* Fail ... quietly treat index as unordered */
+ info->sortopfamily = NULL;
+ info->reverse_sort = NULL;
+ info->nulls_first = NULL;
+ break;
+ }
}
}
+ else
+ {
+ info->sortopfamily = NULL;
+ info->reverse_sort = NULL;
+ info->nulls_first = NULL;
+ }
}
else
{
+ info->amcanorderbyop = false;
+ info->amoptionalkey = false;
+ info->amsearcharray = false;
+ info->amsearchnulls = false;
+ info->amcanparallel = false;
+ info->amhasgettuple = false;
+ info->amhasgetbitmap = false;
+ info->amcanmarkpos = false;
+ info->amcostestimate = NULL;
+
info->sortopfamily = NULL;
info->reverse_sort = NULL;
info->nulls_first = NULL;
@@ -416,31 +436,45 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
* the number-of-tuples estimate to equal the parent table; if it
* is partial then we have to use the same methods as we would for
* a table, except we can be sure that the index is not larger
- * than the table.
+ * than the table. We must ignore partitioned indexes here as as
+ * there are not physical indexes.
*/
- if (info->indpred == NIL)
+ if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
{
- info->pages = RelationGetNumberOfBlocks(indexRelation);
- info->tuples = rel->tuples;
- }
- else
- {
- double allvisfrac; /* dummy */
-
- estimate_rel_size(indexRelation, NULL,
- &info->pages, &info->tuples, &allvisfrac);
- if (info->tuples > rel->tuples)
+ if (info->indpred == NIL)
+ {
+ info->pages = RelationGetNumberOfBlocks(indexRelation);
info->tuples = rel->tuples;
- }
+ }
+ else
+ {
+ double allvisfrac; /* dummy */
- if (info->relam == BTREE_AM_OID)
- {
- /* For btrees, get tree height while we have the index open */
- info->tree_height = _bt_getrootheight(indexRelation);
+ estimate_rel_size(indexRelation, NULL,
+ &info->pages, &info->tuples, &allvisfrac);
+ if (info->tuples > rel->tuples)
+ info->tuples = rel->tuples;
+ }
+
+ if (info->relam == BTREE_AM_OID)
+ {
+ /*
+ * For btrees, get tree height while we have the index
+ * open
+ */
+ info->tree_height = _bt_getrootheight(indexRelation);
+ }
+ else
+ {
+ /* For other index types, just set it to "unknown" for now */
+ info->tree_height = -1;
+ }
}
else
{
- /* For other index types, just set it to "unknown" for now */
+ /* Zero these out for partitioned indexes */
+ info->pages = 0;
+ info->tuples = 0.0;
info->tree_height = -1;
}
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index f50e58adbd6..57de51f0db2 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -5994,6 +5994,10 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
rte = root->simple_rte_array[rel->relid];
Assert(rte->rtekind == RTE_RELATION);
+ /* ignore partitioned tables. Any indexes here are not real indexes */
+ if (rte->relkind == RELKIND_PARTITIONED_TABLE)
+ return false;
+
/* Search through the indexes to see if any match our problem */
foreach(lc, rel->indexlist)
{
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 1827e506479..c20b7298a3d 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -653,7 +653,7 @@ typedef struct PartitionSchemeData *PartitionScheme;
* lateral_referencers - relids of rels that reference this one laterally
* (includes both direct and indirect lateral references)
* indexlist - list of IndexOptInfo nodes for relation's indexes
- * (always NIL if it's not a table)
+ * (always NIL if it's not a table or partitioned table)
* pages - number of disk pages in relation (zero if not a table)
* tuples - number of tuples in relation (not considering restrictions)
* allvisfrac - fraction of disk pages that are marked all-visible
@@ -1097,11 +1097,11 @@ struct IndexOptInfo
Oid *opfamily pg_node_attr(array_size(nkeycolumns));
/* OIDs of opclass declared input data types */
Oid *opcintype pg_node_attr(array_size(nkeycolumns));
- /* OIDs of btree opfamilies, if orderable */
+ /* OIDs of btree opfamilies, if orderable. NULL if partitioned index */
Oid *sortopfamily pg_node_attr(array_size(nkeycolumns));
- /* is sort order descending? */
+ /* is sort order descending? or NULL if partitioned index */
bool *reverse_sort pg_node_attr(array_size(nkeycolumns));
- /* do NULLs come first in the sort order? */
+ /* do NULLs come first in the sort order? or NULL if partitioned index */
bool *nulls_first pg_node_attr(array_size(nkeycolumns));
/* opclass-specific options for columns */
bytea **opclassoptions pg_node_attr(read_write_ignore);
@@ -1139,7 +1139,7 @@ struct IndexOptInfo
/*
* Remaining fields are copied from the index AM's API struct
- * (IndexAmRoutine).
+ * (IndexAmRoutine). These fields are not set for partitioned indexes.
*/
bool amcanorderbyop;
bool amoptionalkey;
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 3ddea3b6837..c2b85d27950 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -4860,6 +4860,16 @@ select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
Filter: (a.id = i)
(4 rows)
+CREATE TEMP TABLE parted_b (id int PRIMARY KEY) partition by range(id);
+CREATE TEMP TABLE parted_b1 partition of parted_b for values from (0) to (10);
+-- test join removals on a partitioned table
+explain (costs off)
+select a.* from a left join parted_b pb on a.b_id = pb.id;
+ QUERY PLAN
+---------------
+ Seq Scan on a
+(1 row)
+
rollback;
create temp table parent (k int primary key, pd int);
create temp table child (k int unique, cd int);
diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out
index c59caf1cb3d..c649c4aeaae 100644
--- a/src/test/regress/expected/partition_join.out
+++ b/src/test/regress/expected/partition_join.out
@@ -4874,7 +4874,7 @@ ANALYZE fract_t;
SET max_parallel_workers_per_gather = 0;
SET enable_partitionwise_join = on;
EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
QUERY PLAN
-----------------------------------------------------------------------
Limit
@@ -4891,7 +4891,7 @@ SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
(11 rows)
EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
QUERY PLAN
--------------------------------------------------------------------------------
Limit
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index 9fc6ef43768..027927354c0 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -1709,6 +1709,13 @@ explain (costs off)
select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
lateral generate_series(1, q.id) gs(i) where q.id = gs.i;
+CREATE TEMP TABLE parted_b (id int PRIMARY KEY) partition by range(id);
+CREATE TEMP TABLE parted_b1 partition of parted_b for values from (0) to (10);
+
+-- test join removals on a partitioned table
+explain (costs off)
+select a.* from a left join parted_b pb on a.b_id = pb.id;
+
rollback;
create temp table parent (k int primary key, pd int);
diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql
index 67f506361f8..9e16f1ca550 100644
--- a/src/test/regress/sql/partition_join.sql
+++ b/src/test/regress/sql/partition_join.sql
@@ -1157,10 +1157,10 @@ SET max_parallel_workers_per_gather = 0;
SET enable_partitionwise_join = on;
EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
-- cleanup
DROP TABLE fract_t;