-rw-r--r--  src/backend/optimizer/README                  |   1
-rw-r--r--  src/backend/optimizer/plan/planner.c          | 219
-rw-r--r--  src/include/nodes/pathnodes.h                 |   1
-rw-r--r--  src/test/regress/expected/select_distinct.out |  67
-rw-r--r--  src/test/regress/sql/select_distinct.sql      |  37
5 files changed, 292 insertions, 33 deletions
diff --git a/src/backend/optimizer/README b/src/backend/optimizer/README
index 2339347c241..41c120e0cdf 100644
--- a/src/backend/optimizer/README
+++ b/src/backend/optimizer/README
@@ -1015,6 +1015,7 @@ UPPERREL_SETOP		result of UNION/INTERSECT/EXCEPT, if any
 UPPERREL_PARTIAL_GROUP_AGG	result of partial grouping/aggregation, if any
 UPPERREL_GROUP_AGG		result of grouping/aggregation, if any
 UPPERREL_WINDOW		result of window functions, if any
+UPPERREL_PARTIAL_DISTINCT	result of partial "SELECT DISTINCT", if any
 UPPERREL_DISTINCT		result of "SELECT DISTINCT", if any
 UPPERREL_ORDERED		result of ORDER BY, if any
 UPPERREL_FINAL		result of any remaining top-level actions
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 2cd691191c9..1e42d75465e 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -189,6 +189,12 @@ static void create_one_window_path(PlannerInfo *root,
                                    List *activeWindows);
 static RelOptInfo *create_distinct_paths(PlannerInfo *root,
                                          RelOptInfo *input_rel);
+static void create_partial_distinct_paths(PlannerInfo *root,
+                                          RelOptInfo *input_rel,
+                                          RelOptInfo *final_distinct_rel);
+static RelOptInfo *create_final_distinct_paths(PlannerInfo *root,
+                                               RelOptInfo *input_rel,
+                                               RelOptInfo *distinct_rel);
 static RelOptInfo *create_ordered_paths(PlannerInfo *root,
                                         RelOptInfo *input_rel,
                                         PathTarget *target,
@@ -1570,6 +1576,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
      */
     root->upper_targets[UPPERREL_FINAL] = final_target;
     root->upper_targets[UPPERREL_ORDERED] = final_target;
+    root->upper_targets[UPPERREL_PARTIAL_DISTINCT] = sort_input_target;
     root->upper_targets[UPPERREL_DISTINCT] = sort_input_target;
     root->upper_targets[UPPERREL_WINDOW] = sort_input_target;
     root->upper_targets[UPPERREL_GROUP_AGG] = grouping_target;
@@ -4227,16 +4234,9 @@ create_one_window_path(PlannerInfo *root,
  * Sort/Unique won't project anything.
  */
 static RelOptInfo *
-create_distinct_paths(PlannerInfo *root,
-                      RelOptInfo *input_rel)
+create_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel)
 {
-    Query      *parse = root->parse;
-    Path       *cheapest_input_path = input_rel->cheapest_total_path;
     RelOptInfo *distinct_rel;
-    double      numDistinctRows;
-    bool        allow_hash;
-    Path       *path;
-    ListCell   *lc;
 
     /* For now, do all work in the (DISTINCT, NULL) upperrel */
     distinct_rel = fetch_upper_rel(root, UPPERREL_DISTINCT, NULL);
@@ -4258,6 +4258,184 @@ create_distinct_paths(PlannerInfo *root,
     distinct_rel->useridiscurrent = input_rel->useridiscurrent;
     distinct_rel->fdwroutine = input_rel->fdwroutine;
 
+    /* build distinct paths based on input_rel's pathlist */
+    create_final_distinct_paths(root, input_rel, distinct_rel);
+
+    /* now build distinct paths based on input_rel's partial_pathlist */
+    create_partial_distinct_paths(root, input_rel, distinct_rel);
+
+    /* Give a helpful error if we failed to create any paths */
+    if (distinct_rel->pathlist == NIL)
+        ereport(ERROR,
+                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                 errmsg("could not implement DISTINCT"),
+                 errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
+
+    /*
+     * If there is an FDW that's responsible for all baserels of the query,
+     * let it consider adding ForeignPaths.
+     */
+    if (distinct_rel->fdwroutine &&
+        distinct_rel->fdwroutine->GetForeignUpperPaths)
+        distinct_rel->fdwroutine->GetForeignUpperPaths(root,
+                                                       UPPERREL_DISTINCT,
+                                                       input_rel,
+                                                       distinct_rel,
+                                                       NULL);
+
+    /* Let extensions possibly add some more paths */
+    if (create_upper_paths_hook)
+        (*create_upper_paths_hook) (root, UPPERREL_DISTINCT, input_rel,
+                                    distinct_rel, NULL);
+
+    /* Now choose the best path(s) */
+    set_cheapest(distinct_rel);
+
+    return distinct_rel;
+}
+
+/*
+ * create_partial_distinct_paths
+ *
+ * Process 'input_rel' partial paths and add unique/aggregate paths to the
+ * UPPERREL_PARTIAL_DISTINCT rel.  For paths created, add Gather/GatherMerge
+ * paths on top and add a final unique/aggregate path to remove any duplicate
+ * produced from combining rows from parallel workers.
+ */
+static void
+create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
+                              RelOptInfo *final_distinct_rel)
+{
+    RelOptInfo *partial_distinct_rel;
+    Query      *parse;
+    List       *distinctExprs;
+    double      numDistinctRows;
+    Path       *cheapest_partial_path;
+    ListCell   *lc;
+
+    /* nothing to do when there are no partial paths in the input rel */
+    if (!input_rel->consider_parallel || input_rel->partial_pathlist == NIL)
+        return;
+
+    parse = root->parse;
+
+    /* can't do parallel DISTINCT ON */
+    if (parse->hasDistinctOn)
+        return;
+
+    partial_distinct_rel = fetch_upper_rel(root, UPPERREL_PARTIAL_DISTINCT,
+                                           NULL);
+    partial_distinct_rel->reltarget = root->upper_targets[UPPERREL_PARTIAL_DISTINCT];
+    partial_distinct_rel->consider_parallel = input_rel->consider_parallel;
+
+    /*
+     * If input_rel belongs to a single FDW, so does the partial_distinct_rel.
+     */
+    partial_distinct_rel->serverid = input_rel->serverid;
+    partial_distinct_rel->userid = input_rel->userid;
+    partial_distinct_rel->useridiscurrent = input_rel->useridiscurrent;
+    partial_distinct_rel->fdwroutine = input_rel->fdwroutine;
+
+    cheapest_partial_path = linitial(input_rel->partial_pathlist);
+
+    distinctExprs = get_sortgrouplist_exprs(parse->distinctClause,
+                                            parse->targetList);
+
+    /* estimate how many distinct rows we'll get from each worker */
+    numDistinctRows = estimate_num_groups(root, distinctExprs,
+                                          cheapest_partial_path->rows,
+                                          NULL, NULL);
+
+    /* first try adding unique paths atop of sorted paths */
+    if (grouping_is_sortable(parse->distinctClause))
+    {
+        foreach(lc, input_rel->partial_pathlist)
+        {
+            Path       *path = (Path *) lfirst(lc);
+
+            if (pathkeys_contained_in(root->distinct_pathkeys, path->pathkeys))
+            {
+                add_partial_path(partial_distinct_rel, (Path *)
+                                 create_upper_unique_path(root,
+                                                          partial_distinct_rel,
+                                                          path,
+                                                          list_length(root->distinct_pathkeys),
+                                                          numDistinctRows));
+            }
+        }
+    }
+
+    /*
+     * Now try hash aggregate paths, if enabled and hashing is possible.  Since
+     * we're not on the hook to ensure we do our best to create at least one
+     * path here, we treat enable_hashagg as a hard off-switch rather than the
+     * slightly softer variant in create_final_distinct_paths.
+     */
+    if (enable_hashagg && grouping_is_hashable(parse->distinctClause))
+    {
+        add_partial_path(partial_distinct_rel, (Path *)
+                         create_agg_path(root,
+                                         partial_distinct_rel,
+                                         cheapest_partial_path,
+                                         cheapest_partial_path->pathtarget,
+                                         AGG_HASHED,
+                                         AGGSPLIT_SIMPLE,
+                                         parse->distinctClause,
+                                         NIL,
+                                         NULL,
+                                         numDistinctRows));
+    }
+
+    /*
+     * If there is an FDW that's responsible for all baserels of the query,
+     * let it consider adding ForeignPaths.
+     */
+    if (partial_distinct_rel->fdwroutine &&
+        partial_distinct_rel->fdwroutine->GetForeignUpperPaths)
+        partial_distinct_rel->fdwroutine->GetForeignUpperPaths(root,
+                                                               UPPERREL_PARTIAL_DISTINCT,
+                                                               input_rel,
+                                                               partial_distinct_rel,
+                                                               NULL);
+
+    /* Let extensions possibly add some more partial paths */
+    if (create_upper_paths_hook)
+        (*create_upper_paths_hook) (root, UPPERREL_PARTIAL_DISTINCT,
+                                    input_rel, partial_distinct_rel, NULL);
+
+    if (partial_distinct_rel->partial_pathlist != NIL)
+    {
+        generate_gather_paths(root, partial_distinct_rel, true);
+        set_cheapest(partial_distinct_rel);
+
+        /*
+         * Finally, create paths to distinctify the final result.  This step
+         * is needed to remove any duplicates due to combining rows from
+         * parallel workers.
+         */
+        create_final_distinct_paths(root, partial_distinct_rel,
+                                    final_distinct_rel);
+    }
+}
+
+/*
+ * create_final_distinct_paths
+ *      Create distinct paths in 'distinct_rel' based on 'input_rel' pathlist
+ *
+ * input_rel: contains the source-data paths
+ * distinct_rel: destination relation for storing created paths
+ */
+static RelOptInfo *
+create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
+                            RelOptInfo *distinct_rel)
+{
+    Query      *parse = root->parse;
+    Path       *cheapest_input_path = input_rel->cheapest_total_path;
+    double      numDistinctRows;
+    bool        allow_hash;
+    Path       *path;
+    ListCell   *lc;
+
     /* Estimate number of distinct rows there will be */
     if (parse->groupClause || parse->groupingSets || parse->hasAggs ||
         root->hasHavingQual)
@@ -4384,31 +4562,6 @@ create_distinct_paths(PlannerInfo *root,
                                             numDistinctRows));
     }
 
-    /* Give a helpful error if we failed to find any implementation */
-    if (distinct_rel->pathlist == NIL)
-        ereport(ERROR,
-                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                 errmsg("could not implement DISTINCT"),
-                 errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
-
-    /*
-     * If there is an FDW that's responsible for all baserels of the query,
-     * let it consider adding ForeignPaths.
-     */
-    if (distinct_rel->fdwroutine &&
-        distinct_rel->fdwroutine->GetForeignUpperPaths)
-        distinct_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_DISTINCT,
-                                                       input_rel, distinct_rel,
-                                                       NULL);
-
-    /* Let extensions possibly add some more paths */
-    if (create_upper_paths_hook)
-        (*create_upper_paths_hook) (root, UPPERREL_DISTINCT,
-                                    input_rel, distinct_rel, NULL);
-
-    /* Now choose the best path(s) */
-    set_cheapest(distinct_rel);
-
     return distinct_rel;
 }
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 6e068f2c8b3..1abe233db2b 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -71,6 +71,7 @@ typedef enum UpperRelationKind
                                  * any */
     UPPERREL_GROUP_AGG,         /* result of grouping/aggregation, if any */
     UPPERREL_WINDOW,            /* result of window functions, if any */
+    UPPERREL_PARTIAL_DISTINCT,  /* result of partial "SELECT DISTINCT", if any */
     UPPERREL_DISTINCT,          /* result of "SELECT DISTINCT", if any */
     UPPERREL_ORDERED,           /* result of ORDER BY, if any */
     UPPERREL_FINAL              /* result of any remaining top-level actions */
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index 11c6f50fbfa..0c8e10f88a5 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -210,6 +210,73 @@ DROP TABLE distinct_hash_1;
 DROP TABLE distinct_hash_2;
 DROP TABLE distinct_group_1;
 DROP TABLE distinct_group_2;
+-- Test parallel DISTINCT
+SET parallel_tuple_cost=0;
+SET parallel_setup_cost=0;
+SET min_parallel_table_scan_size=0;
+-- Ensure we get a parallel plan
+EXPLAIN (costs off)
+SELECT DISTINCT four FROM tenk1;
+                     QUERY PLAN
+----------------------------------------------------
+ Unique
+   ->  Sort
+         Sort Key: four
+         ->  Gather
+               Workers Planned: 2
+               ->  HashAggregate
+                     Group Key: four
+                     ->  Parallel Seq Scan on tenk1
+(8 rows)
+
+-- Ensure the parallel plan produces the correct results
+SELECT DISTINCT four FROM tenk1;
+ four
+------
+    0
+    1
+    2
+    3
+(4 rows)
+
+CREATE OR REPLACE FUNCTION distinct_func(a INT) RETURNS INT AS $$
+  BEGIN
+    RETURN a;
+  END;
+$$ LANGUAGE plpgsql PARALLEL UNSAFE;
+-- Ensure we don't do parallel distinct with a parallel unsafe function
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT distinct_func(1) FROM tenk1;
+                        QUERY PLAN
+----------------------------------------------------------
+ Unique
+   ->  Sort
+         Sort Key: (distinct_func(1))
+         ->  Index Only Scan using tenk1_hundred on tenk1
+(4 rows)
+
+-- make the function parallel safe
+CREATE OR REPLACE FUNCTION distinct_func(a INT) RETURNS INT AS $$
+  BEGIN
+    RETURN a;
+  END;
+$$ LANGUAGE plpgsql PARALLEL SAFE;
+-- Ensure we do parallel distinct now that the function is parallel safe
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT distinct_func(1) FROM tenk1;
+                  QUERY PLAN
+----------------------------------------------
+ Unique
+   ->  Sort
+         Sort Key: (distinct_func(1))
+         ->  Gather
+               Workers Planned: 2
+               ->  Parallel Seq Scan on tenk1
+(6 rows)
+
+RESET min_parallel_table_scan_size;
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
 --
 -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
 -- very own regression file.
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index 33102744ebf..e00582e46c1 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -107,6 +107,43 @@ DROP TABLE distinct_hash_2;
 DROP TABLE distinct_group_1;
 DROP TABLE distinct_group_2;
 
+-- Test parallel DISTINCT
+SET parallel_tuple_cost=0;
+SET parallel_setup_cost=0;
+SET min_parallel_table_scan_size=0;
+
+-- Ensure we get a parallel plan
+EXPLAIN (costs off)
+SELECT DISTINCT four FROM tenk1;
+
+-- Ensure the parallel plan produces the correct results
+SELECT DISTINCT four FROM tenk1;
+
+CREATE OR REPLACE FUNCTION distinct_func(a INT) RETURNS INT AS $$
+  BEGIN
+    RETURN a;
+  END;
+$$ LANGUAGE plpgsql PARALLEL UNSAFE;
+
+-- Ensure we don't do parallel distinct with a parallel unsafe function
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT distinct_func(1) FROM tenk1;
+
+-- make the function parallel safe
+CREATE OR REPLACE FUNCTION distinct_func(a INT) RETURNS INT AS $$
+  BEGIN
+    RETURN a;
+  END;
+$$ LANGUAGE plpgsql PARALLEL SAFE;
+
+-- Ensure we do parallel distinct now that the function is parallel safe
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT distinct_func(1) FROM tenk1;
+
+RESET min_parallel_table_scan_size;
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+
 --
 -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
 -- very own regression file.
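
As the planner.c hunks above show, the new UPPERREL_PARTIAL_DISTINCT stage is also passed to create_upper_paths_hook, so extensions can inspect or contribute partial paths at that point. Below is a minimal, hypothetical extension sketch (not part of this patch) illustrating how a hook could observe that stage: create_upper_paths_hook and its signature are existing planner APIs, while the module, function, and variable names here are invented for illustration.

/*
 * Hypothetical example (not part of this commit): observe the new
 * UPPERREL_PARTIAL_DISTINCT stage from create_upper_paths_hook.
 */
#include "postgres.h"

#include "fmgr.h"
#include "nodes/pathnodes.h"
#include "optimizer/planner.h"

PG_MODULE_MAGIC;

static create_upper_paths_hook_type prev_create_upper_paths_hook = NULL;

static void
observe_upper_paths(PlannerInfo *root, UpperRelationKind stage,
                    RelOptInfo *input_rel, RelOptInfo *output_rel,
                    void *extra)
{
    /* chain to any previously installed hook first */
    if (prev_create_upper_paths_hook)
        prev_create_upper_paths_hook(root, stage, input_rel, output_rel, extra);

    /*
     * output_rel is the UPPERREL_PARTIAL_DISTINCT rel while the planner is
     * building per-worker distinct paths; its partial_pathlist holds the
     * paths that will later sit below Gather/Gather Merge.
     */
    if (stage == UPPERREL_PARTIAL_DISTINCT)
        elog(DEBUG1, "partial distinct stage: %d partial path(s) so far",
             list_length(output_rel->partial_pathlist));
}

void
_PG_init(void)
{
    prev_create_upper_paths_hook = create_upper_paths_hook;
    create_upper_paths_hook = observe_upper_paths;
}

Such a module would be loaded via shared_preload_libraries (or LOAD) so that _PG_init() installs the hook before planning runs.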