diff options
author | Alexander Korotkov <akorotkov@postgresql.org> | 2025-04-23 20:13:51 +0300 |
---|---|---|
committer | Alexander Korotkov <akorotkov@postgresql.org> | 2025-04-23 20:25:21 +0300 |
commit | 9f404d7922e8831dc49bfa225530ba5309900e4e (patch) | |
tree | 72950ab02339a280611e95da553099dd9993c767 /src/backend/utils/adt/selfuncs.c | |
parent | 3db61db48ef5b8898f7e85f98548fdec79d76524 (diff) | |
download | postgresql-9f404d7922e8831dc49bfa225530ba5309900e4e.tar.gz postgresql-9f404d7922e8831dc49bfa225530ba5309900e4e.zip |
Properly prepare varinfos in estimate_multivariate_bucketsize()
To estimate with extended statistics, we need to clear the varnullingrels
field in the expression, and duplicates are not allowed in the GroupVarInfo
list. We might re-use add_unique_group_var(), but we don't do so for two
reasons.
1) We must keep the origin_rinfos list ordered exactly the same way as
varinfos.
2) add_unique_group_var() is designed for estimate_num_groups(), where a
larger number of groups is worse. While estimating the number of hash
buckets, we have the opposite: a lesser number of groups is worse.
Therefore, we don't have to remove "known equal" vars: the removed var
may valuably contribute to the multivariate statistics to grow the number
of groups.
This commit adds custom code to estimate_multivariate_bucketsize() to
initialize varinfos properly.
Reported-by: Robins Tharakan <tharakan@gmail.com>
Discussion: https://postgr.es/m/18885-da51324078588253%40postgresql.org
Author: Andrei Lepikhov <lepihov@gmail.com>
Reviewed-by: David Rowley <dgrowleyml@gmail.com>
Reviewed-by: Tomas Vondra <tomas@vondra.me>
Reviewed-by: Alexander Korotkov <aekorotkov@gmail.com>
Diffstat (limited to 'src/backend/utils/adt/selfuncs.c')
-rw-r--r-- | src/backend/utils/adt/selfuncs.c | 60 |
1 files changed, 57 insertions, 3 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 987f2154459..a96b1b9c0bc 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -3850,6 +3850,8 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, if (bms_get_singleton_member(relids, &relid) && root->simple_rel_array[relid]->statlist != NIL) { + bool is_duplicate = false; + /* * This inner-side expression references only one relation. * Extended statistics on this clause can exist. @@ -3880,11 +3882,61 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, */ continue; - varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo)); + /* + * We're going to add the new clause to the varinfos list. We + * might re-use add_unique_group_var(), but we don't do so for + * two reasons. + * + * 1) We must keep the origin_rinfos list ordered exactly the + * same way as varinfos. + * + * 2) add_unique_group_var() is designed for + * estimate_num_groups(), where a larger number of groups is + * worse. While estimating the number of hash buckets, we + * have the opposite: a lesser number of groups is worse. + * Therefore, we don't have to remove "known equal" vars: the + * removed var may valuably contribute to the multivariate + * statistics to grow the number of groups. + */ + + /* + * Clear nullingrels to correctly match hash keys. See + * add_unique_group_var()'s comment for details. + */ + expr = remove_nulling_relids(expr, root->outer_join_rels, NULL); + + /* + * Detect and exclude exact duplicates from the list of hash + * keys (like add_unique_group_var does). + */ + foreach(lc1, varinfos) + { + varinfo = (GroupVarInfo *) lfirst(lc1); + + if (!equal(expr, varinfo->var)) + continue; + + is_duplicate = true; + break; + } + + if (is_duplicate) + { + /* + * Skip exact duplicates. Adding them to the otherclauses + * list also doesn't make sense. + */ + continue; + } + + /* + * Initialize GroupVarInfo. We only use it to call + * estimate_multivariate_ndistinct(), which doesn't care about + * ndistinct and isdefault fields. Thus, skip these fields. + */ + varinfo = (GroupVarInfo *) palloc0(sizeof(GroupVarInfo)); varinfo->var = expr; varinfo->rel = root->simple_rel_array[relid]; - varinfo->ndistinct = 0.0; - varinfo->isdefault = false; varinfos = lappend(varinfos, varinfo); /* @@ -3894,8 +3946,10 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, origin_rinfos = lappend(origin_rinfos, rinfo); } else + { /* This clause can't be estimated with extended statistics */ otherclauses = lappend(otherclauses, rinfo); + } clauses = foreach_delete_current(clauses, lc); } |