aboutsummaryrefslogtreecommitdiff
path: root/src/backend/optimizer
diff options
context:
space:
mode:
authorEtsuro Fujita <efujita@postgresql.org>2020-04-08 10:25:00 +0900
committerEtsuro Fujita <efujita@postgresql.org>2020-04-08 10:25:00 +0900
commitc8434d64ce03c32e0029417a82ae937f2055268f (patch)
tree7ea57c9e8292b9ad8a61b99603299f66ae4951b6 /src/backend/optimizer
parent41a194f49177daf9348bfde2c42e85b806dcee31 (diff)
downloadpostgresql-c8434d64ce03c32e0029417a82ae937f2055268f.tar.gz
postgresql-c8434d64ce03c32e0029417a82ae937f2055268f.zip
Allow partitionwise joins in more cases.
Previously, the partitionwise join technique only allowed partitionwise join when input partitioned tables had exactly the same partition bounds. This commit extends the technique to some cases when the tables have different partition bounds, by using an advanced partition-matching algorithm introduced by this commit. For both the input partitioned tables, the algorithm checks whether every partition of one input partitioned table only matches one partition of the other input partitioned table at most, and vice versa. In such a case the join between the tables can be broken down into joins between the matching partitions, so the algorithm produces the pairs of the matching partitions, plus the partition bounds for the join relation, to allow partitionwise join for computing the join. Currently, the algorithm works for list-partitioned and range-partitioned tables, but not hash-partitioned tables. See comments in partition_bounds_merge(). Ashutosh Bapat and Etsuro Fujita, most of regression tests by Rajkumar Raghuwanshi, some of the tests by Mark Dilger and Amul Sul, reviewed by Dmitry Dolgov and Amul Sul, with additional review at various points by Ashutosh Bapat, Mark Dilger, Robert Haas, Antonin Houska, Amit Langote, Justin Pryzby, and Tomas Vondra Discussion: https://postgr.es/m/CAFjFpRdjQvaUEV5DJX3TW6pU5eq54NCkadtxHX2JiJG_GvbrCA@mail.gmail.com
Diffstat (limited to 'src/backend/optimizer')
-rw-r--r--src/backend/optimizer/README27
-rw-r--r--src/backend/optimizer/path/joinrels.c263
-rw-r--r--src/backend/optimizer/util/inherit.c2
-rw-r--r--src/backend/optimizer/util/relnode.c45
4 files changed, 280 insertions, 57 deletions
diff --git a/src/backend/optimizer/README b/src/backend/optimizer/README
index 89ce373d5e6..40558cdf6b7 100644
--- a/src/backend/optimizer/README
+++ b/src/backend/optimizer/README
@@ -1106,6 +1106,33 @@ into joins between their partitions is called partitionwise join. We will use
term "partitioned relation" for either a partitioned table or a join between
compatibly partitioned tables.
+Even if the joining relations don't have exactly the same partition bounds,
+partitionwise join can still be applied by using an advanced
+partition-matching algorithm. For both the joining relations, the algorithm
+checks wether every partition of one joining relation only matches one
+partition of the other joining relation at most. In such a case the join
+between the joining relations can be broken down into joins between the
+matching partitions. The join relation can then be considerd partitioned.
+The algorithm produces the pairs of the matching partitions, plus the
+partition bounds for the join relation, to allow partitionwise join for
+computing the join. The algorithm is implemented in partition_bounds_merge().
+For an N-way join relation considered partitioned this way, not every pair of
+joining relations can use partitionwise join. For example:
+
+ (A leftjoin B on (Pab)) innerjoin C on (Pac)
+
+where A, B, and C are partitioned tables, and A has an extra partition
+compared to B and C. When considering partitionwise join for the join {A B},
+the extra partition of A doesn't have a matching partition on the nullable
+side, which is the case that the current implementation of partitionwise join
+can't handle. So {A B} is not considered partitioned, and the pair of {A B}
+and C considered for the 3-way join can't use partitionwise join. On the
+other hand, the pair of {A C} and B can use partitionwise join because {A C}
+is considered partitioned by eliminating the extra partition (see identity 1
+on outer join reordering). Whether an N-way join can use partitionwise join
+is determined based on the first pair of joining relations that are both
+partitioned and can use partitionwise join.
+
The partitioning properties of a partitioned relation are stored in its
RelOptInfo. The information about data types of partition keys are stored in
PartitionSchemeData structure. The planner maintains a list of canonical
diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c
index 5b8d71dc267..4e1650994d5 100644
--- a/src/backend/optimizer/path/joinrels.c
+++ b/src/backend/optimizer/path/joinrels.c
@@ -45,6 +45,13 @@ static void try_partitionwise_join(PlannerInfo *root, RelOptInfo *rel1,
static SpecialJoinInfo *build_child_join_sjinfo(PlannerInfo *root,
SpecialJoinInfo *parent_sjinfo,
Relids left_relids, Relids right_relids);
+static void compute_partition_bounds(PlannerInfo *root, RelOptInfo *rel1,
+ RelOptInfo *rel2, RelOptInfo *joinrel,
+ SpecialJoinInfo *parent_sjinfo,
+ List **parts1, List **parts2);
+static void get_matching_part_pairs(PlannerInfo *root, RelOptInfo *joinrel,
+ RelOptInfo *rel1, RelOptInfo *rel2,
+ List **parts1, List **parts2);
/*
@@ -1354,25 +1361,29 @@ try_partitionwise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2,
{
bool rel1_is_simple = IS_SIMPLE_REL(rel1);
bool rel2_is_simple = IS_SIMPLE_REL(rel2);
- int nparts;
+ List *parts1 = NIL;
+ List *parts2 = NIL;
+ ListCell *lcr1 = NULL;
+ ListCell *lcr2 = NULL;
int cnt_parts;
/* Guard against stack overflow due to overly deep partition hierarchy. */
check_stack_depth();
/* Nothing to do, if the join relation is not partitioned. */
- if (!IS_PARTITIONED_REL(joinrel))
+ if (joinrel->part_scheme == NULL || joinrel->nparts == 0)
return;
/* The join relation should have consider_partitionwise_join set. */
Assert(joinrel->consider_partitionwise_join);
/*
- * Since this join relation is partitioned, all the base relations
- * participating in this join must be partitioned and so are all the
- * intermediate join relations.
+ * We can not perform partitionwise join if either of the joining relations
+ * is not partitioned.
*/
- Assert(IS_PARTITIONED_REL(rel1) && IS_PARTITIONED_REL(rel2));
+ if (!IS_PARTITIONED_REL(rel1) || !IS_PARTITIONED_REL(rel2))
+ return;
+
Assert(REL_HAS_ALL_PART_PROPS(rel1) && REL_HAS_ALL_PART_PROPS(rel2));
/* The joining relations should have consider_partitionwise_join set. */
@@ -1386,35 +1397,28 @@ try_partitionwise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2,
Assert(joinrel->part_scheme == rel1->part_scheme &&
joinrel->part_scheme == rel2->part_scheme);
- /*
- * Since we allow partitionwise join only when the partition bounds of the
- * joining relations exactly match, the partition bounds of the join
- * should match those of the joining relations.
- */
- Assert(partition_bounds_equal(joinrel->part_scheme->partnatts,
- joinrel->part_scheme->parttyplen,
- joinrel->part_scheme->parttypbyval,
- joinrel->boundinfo, rel1->boundinfo));
- Assert(partition_bounds_equal(joinrel->part_scheme->partnatts,
- joinrel->part_scheme->parttyplen,
- joinrel->part_scheme->parttypbyval,
- joinrel->boundinfo, rel2->boundinfo));
+ Assert(!(joinrel->partbounds_merged && (joinrel->nparts <= 0)));
- nparts = joinrel->nparts;
+ compute_partition_bounds(root, rel1, rel2, joinrel, parent_sjinfo,
+ &parts1, &parts2);
+
+ if (joinrel->partbounds_merged)
+ {
+ lcr1 = list_head(parts1);
+ lcr2 = list_head(parts2);
+ }
/*
* Create child-join relations for this partitioned join, if those don't
* exist. Add paths to child-joins for a pair of child relations
* corresponding to the given pair of parent relations.
*/
- for (cnt_parts = 0; cnt_parts < nparts; cnt_parts++)
+ for (cnt_parts = 0; cnt_parts < joinrel->nparts; cnt_parts++)
{
- RelOptInfo *child_rel1 = rel1->part_rels[cnt_parts];
- RelOptInfo *child_rel2 = rel2->part_rels[cnt_parts];
- bool rel1_empty = (child_rel1 == NULL ||
- IS_DUMMY_REL(child_rel1));
- bool rel2_empty = (child_rel2 == NULL ||
- IS_DUMMY_REL(child_rel2));
+ RelOptInfo *child_rel1;
+ RelOptInfo *child_rel2;
+ bool rel1_empty;
+ bool rel2_empty;
SpecialJoinInfo *child_sjinfo;
List *child_restrictlist;
RelOptInfo *child_joinrel;
@@ -1422,6 +1426,22 @@ try_partitionwise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2,
AppendRelInfo **appinfos;
int nappinfos;
+ if (joinrel->partbounds_merged)
+ {
+ child_rel1 = lfirst_node(RelOptInfo, lcr1);
+ child_rel2 = lfirst_node(RelOptInfo, lcr2);
+ lcr1 = lnext(parts1, lcr1);
+ lcr2 = lnext(parts2, lcr2);
+ }
+ else
+ {
+ child_rel1 = rel1->part_rels[cnt_parts];
+ child_rel2 = rel2->part_rels[cnt_parts];
+ }
+
+ rel1_empty = (child_rel1 == NULL || IS_DUMMY_REL(child_rel1));
+ rel2_empty = (child_rel2 == NULL || IS_DUMMY_REL(child_rel2));
+
/*
* Check for cases where we can prove that this segment of the join
* returns no rows, due to one or both inputs being empty (including
@@ -1519,6 +1539,8 @@ try_partitionwise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2,
child_sjinfo,
child_sjinfo->jointype);
joinrel->part_rels[cnt_parts] = child_joinrel;
+ joinrel->all_partrels = bms_add_members(joinrel->all_partrels,
+ child_joinrel->relids);
}
Assert(bms_equal(child_joinrel->relids, child_joinrelids));
@@ -1570,3 +1592,190 @@ build_child_join_sjinfo(PlannerInfo *root, SpecialJoinInfo *parent_sjinfo,
return sjinfo;
}
+
+/*
+ * compute_partition_bounds
+ * Compute the partition bounds for a join rel from those for inputs
+ */
+static void
+compute_partition_bounds(PlannerInfo *root, RelOptInfo *rel1,
+ RelOptInfo *rel2, RelOptInfo *joinrel,
+ SpecialJoinInfo *parent_sjinfo,
+ List **parts1, List **parts2)
+{
+ /*
+ * If we don't have the partition bounds for the join rel yet, try to
+ * compute those along with pairs of partitions to be joined.
+ */
+ if (joinrel->nparts == -1)
+ {
+ PartitionScheme part_scheme = joinrel->part_scheme;
+ PartitionBoundInfo boundinfo = NULL;
+ int nparts = 0;
+
+ Assert(joinrel->boundinfo == NULL);
+ Assert(joinrel->part_rels == NULL);
+
+ /*
+ * See if the partition bounds for inputs are exactly the same, in
+ * which case we don't need to work hard: the join rel have the same
+ * partition bounds as inputs, and the partitions with the same
+ * cardinal positions form the pairs.
+ *
+ * Note: even in cases where one or both inputs have merged bounds,
+ * it would be possible for both the bounds to be exactly the same, but
+ * it seems unlikely to be worth the cycles to check.
+ */
+ if (!rel1->partbounds_merged &&
+ !rel2->partbounds_merged &&
+ rel1->nparts == rel2->nparts &&
+ partition_bounds_equal(part_scheme->partnatts,
+ part_scheme->parttyplen,
+ part_scheme->parttypbyval,
+ rel1->boundinfo, rel2->boundinfo))
+ {
+ boundinfo = rel1->boundinfo;
+ nparts = rel1->nparts;
+ }
+ else
+ {
+ /* Try merging the partition bounds for inputs. */
+ boundinfo = partition_bounds_merge(part_scheme->partnatts,
+ part_scheme->partsupfunc,
+ part_scheme->partcollation,
+ rel1, rel2,
+ parent_sjinfo->jointype,
+ parts1, parts2);
+ if (boundinfo == NULL)
+ {
+ joinrel->nparts = 0;
+ return;
+ }
+ nparts = list_length(*parts1);
+ joinrel->partbounds_merged = true;
+ }
+
+ Assert(nparts > 0);
+ joinrel->boundinfo = boundinfo;
+ joinrel->nparts = nparts;
+ joinrel->part_rels =
+ (RelOptInfo **) palloc0(sizeof(RelOptInfo *) * nparts);
+ }
+ else
+ {
+ Assert(joinrel->nparts > 0);
+ Assert(joinrel->boundinfo);
+ Assert(joinrel->part_rels);
+
+ /*
+ * If the join rel's partbounds_merged flag is true, it means inputs
+ * are not guaranteed to have the same partition bounds, therefore we
+ * can't assume that the partitions at the same cardinal positions form
+ * the pairs; let get_matching_part_pairs() generate the pairs.
+ * Otherwise, nothing to do since we can assume that.
+ */
+ if (joinrel->partbounds_merged)
+ {
+ get_matching_part_pairs(root, joinrel, rel1, rel2,
+ parts1, parts2);
+ Assert(list_length(*parts1) == joinrel->nparts);
+ Assert(list_length(*parts2) == joinrel->nparts);
+ }
+ }
+}
+
+/*
+ * get_matching_part_pairs
+ * Generate pairs of partitions to be joined from inputs
+ */
+static void
+get_matching_part_pairs(PlannerInfo *root, RelOptInfo *joinrel,
+ RelOptInfo *rel1, RelOptInfo *rel2,
+ List **parts1, List **parts2)
+{
+ bool rel1_is_simple = IS_SIMPLE_REL(rel1);
+ bool rel2_is_simple = IS_SIMPLE_REL(rel2);
+ int cnt_parts;
+
+ *parts1 = NIL;
+ *parts2 = NIL;
+
+ for (cnt_parts = 0; cnt_parts < joinrel->nparts; cnt_parts++)
+ {
+ RelOptInfo *child_joinrel = joinrel->part_rels[cnt_parts];
+ RelOptInfo *child_rel1;
+ RelOptInfo *child_rel2;
+ Relids child_relids1;
+ Relids child_relids2;
+
+ /*
+ * If this segment of the join is empty, it means that this segment
+ * was ignored when previously creating child-join paths for it in
+ * try_partitionwise_join() as it would not contribute to the join
+ * result, due to one or both inputs being empty; add NULL to each of
+ * the given lists so that this segment will be ignored again in that
+ * function.
+ */
+ if (!child_joinrel)
+ {
+ *parts1 = lappend(*parts1, NULL);
+ *parts2 = lappend(*parts2, NULL);
+ continue;
+ }
+
+ /*
+ * Get a relids set of partition(s) involved in this join segment that
+ * are from the rel1 side.
+ */
+ child_relids1 = bms_intersect(child_joinrel->relids,
+ rel1->all_partrels);
+ Assert(bms_num_members(child_relids1) == bms_num_members(rel1->relids));
+
+ /*
+ * Get a child rel for rel1 with the relids. Note that we should have
+ * the child rel even if rel1 is a join rel, because in that case the
+ * partitions specified in the relids would have matching/overlapping
+ * boundaries, so the specified partitions should be considered as ones
+ * to be joined when planning partitionwise joins of rel1, meaning that
+ * the child rel would have been built by the time we get here.
+ */
+ if (rel1_is_simple)
+ {
+ int varno = bms_singleton_member(child_relids1);
+
+ child_rel1 = find_base_rel(root, varno);
+ }
+ else
+ child_rel1 = find_join_rel(root, child_relids1);
+ Assert(child_rel1);
+
+ /*
+ * Get a relids set of partition(s) involved in this join segment that
+ * are from the rel2 side.
+ */
+ child_relids2 = bms_intersect(child_joinrel->relids,
+ rel2->all_partrels);
+ Assert(bms_num_members(child_relids2) == bms_num_members(rel2->relids));
+
+ /*
+ * Get a child rel for rel2 with the relids. See above comments.
+ */
+ if (rel2_is_simple)
+ {
+ int varno = bms_singleton_member(child_relids2);
+
+ child_rel2 = find_base_rel(root, varno);
+ }
+ else
+ child_rel2 = find_join_rel(root, child_relids2);
+ Assert(child_rel2);
+
+ /*
+ * The join of rel1 and rel2 is legal, so is the join of the child
+ * rels obtained above; add them to the given lists as a join pair
+ * producing this join segment.
+ */
+ *parts1 = lappend(*parts1, child_rel1);
+ *parts2 = lappend(*parts2, child_rel2);
+ }
+}
diff --git a/src/backend/optimizer/util/inherit.c b/src/backend/optimizer/util/inherit.c
index 7db67fdf344..3132fd35a51 100644
--- a/src/backend/optimizer/util/inherit.c
+++ b/src/backend/optimizer/util/inherit.c
@@ -376,6 +376,8 @@ expand_partitioned_rtentry(PlannerInfo *root, RelOptInfo *relinfo,
/* Create the otherrel RelOptInfo too. */
childrelinfo = build_simple_rel(root, childRTindex, relinfo);
relinfo->part_rels[i] = childrelinfo;
+ relinfo->all_partrels = bms_add_members(relinfo->all_partrels,
+ childrelinfo->relids);
/* If this child is itself partitioned, recurse */
if (childrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c
index af1fb486488..cbb1ee3b337 100644
--- a/src/backend/optimizer/util/relnode.c
+++ b/src/backend/optimizer/util/relnode.c
@@ -249,10 +249,12 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent)
rel->has_eclass_joins = false;
rel->consider_partitionwise_join = false; /* might get changed later */
rel->part_scheme = NULL;
- rel->nparts = 0;
+ rel->nparts = -1;
rel->boundinfo = NULL;
+ rel->partbounds_merged = false;
rel->partition_qual = NIL;
rel->part_rels = NULL;
+ rel->all_partrels = NULL;
rel->partexprs = NULL;
rel->nullable_partexprs = NULL;
rel->partitioned_child_rels = NIL;
@@ -662,10 +664,12 @@ build_join_rel(PlannerInfo *root,
joinrel->consider_partitionwise_join = false; /* might get changed later */
joinrel->top_parent_relids = NULL;
joinrel->part_scheme = NULL;
- joinrel->nparts = 0;
+ joinrel->nparts = -1;
joinrel->boundinfo = NULL;
+ joinrel->partbounds_merged = false;
joinrel->partition_qual = NIL;
joinrel->part_rels = NULL;
+ joinrel->all_partrels = NULL;
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
joinrel->partitioned_child_rels = NIL;
@@ -838,10 +842,12 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel,
joinrel->consider_partitionwise_join = false; /* might get changed later */
joinrel->top_parent_relids = NULL;
joinrel->part_scheme = NULL;
- joinrel->nparts = 0;
+ joinrel->nparts = -1;
joinrel->boundinfo = NULL;
+ joinrel->partbounds_merged = false;
joinrel->partition_qual = NIL;
joinrel->part_rels = NULL;
+ joinrel->all_partrels = NULL;
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
joinrel->partitioned_child_rels = NIL;
@@ -1645,7 +1651,7 @@ build_joinrel_partition_info(RelOptInfo *joinrel, RelOptInfo *outer_rel,
* of the way the query planner deduces implied equalities and reorders
* the joins. Please see optimizer/README for details.
*/
- if (!IS_PARTITIONED_REL(outer_rel) || !IS_PARTITIONED_REL(inner_rel) ||
+ if (outer_rel->part_scheme == NULL || inner_rel->part_scheme == NULL ||
!outer_rel->consider_partitionwise_join ||
!inner_rel->consider_partitionwise_join ||
outer_rel->part_scheme != inner_rel->part_scheme ||
@@ -1658,24 +1664,6 @@ build_joinrel_partition_info(RelOptInfo *joinrel, RelOptInfo *outer_rel,
part_scheme = outer_rel->part_scheme;
- Assert(REL_HAS_ALL_PART_PROPS(outer_rel) &&
- REL_HAS_ALL_PART_PROPS(inner_rel));
-
- /*
- * For now, our partition matching algorithm can match partitions only
- * when the partition bounds of the joining relations are exactly same.
- * So, bail out otherwise.
- */
- if (outer_rel->nparts != inner_rel->nparts ||
- !partition_bounds_equal(part_scheme->partnatts,
- part_scheme->parttyplen,
- part_scheme->parttypbyval,
- outer_rel->boundinfo, inner_rel->boundinfo))
- {
- Assert(!IS_PARTITIONED_REL(joinrel));
- return;
- }
-
/*
* This function will be called only once for each joinrel, hence it
* should not have partitioning fields filled yet.
@@ -1685,18 +1673,15 @@ build_joinrel_partition_info(RelOptInfo *joinrel, RelOptInfo *outer_rel,
!joinrel->boundinfo);
/*
- * Join relation is partitioned using the same partitioning scheme as the
- * joining relations and has same bounds.
+ * If the join relation is partitioned, it uses the same partitioning
+ * scheme as the joining relations.
+ *
+ * Note: we calculate the partition bounds, number of partitions, and
+ * child-join relations of the join relation in try_partitionwise_join().
*/
joinrel->part_scheme = part_scheme;
- joinrel->boundinfo = outer_rel->boundinfo;
- joinrel->nparts = outer_rel->nparts;
set_joinrel_partition_key_exprs(joinrel, outer_rel, inner_rel, jointype);
- /* part_rels[] will be filled later, but allocate it now */
- joinrel->part_rels =
- (RelOptInfo **) palloc0(sizeof(RelOptInfo *) * joinrel->nparts);
-
/*
* Set the consider_partitionwise_join flag.
*/