diff options
Diffstat (limited to 'src/backend/utils/adt/selfuncs.c')
-rw-r--r-- | src/backend/utils/adt/selfuncs.c | 254 |
1 files changed, 251 insertions, 3 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 936b9ad99c0..23e012c64e9 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -15,7 +15,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.120 2002/11/08 20:23:57 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.121 2002/11/19 23:21:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -85,7 +85,10 @@ #include "optimizer/cost.h" #include "optimizer/pathnode.h" #include "optimizer/plancat.h" +#include "optimizer/planmain.h" #include "optimizer/prep.h" +#include "optimizer/tlist.h" +#include "optimizer/var.h" #include "parser/parse_func.h" #include "parser/parse_oper.h" #include "parser/parsetree.h" @@ -1810,6 +1813,251 @@ mergejoinscansel(Query *root, Node *clause, } /* + * estimate_num_groups - Estimate number of groups in a grouped query + * + * Given a query having a GROUP BY clause, estimate how many groups there + * will be --- ie, the number of distinct combinations of the GROUP BY + * expressions. + * + * This routine is also used to estimate the number of rows emitted by + * a DISTINCT filtering step; that is an isomorphic problem. (Note: + * actually, we only use it for DISTINCT when there's no grouping or + * aggregation ahead of the DISTINCT.) + * + * Inputs: + * root - the query + * groupClauses - list of GroupClauses (or SortClauses for the DISTINCT + * case, but those are equivalent structs) + * input_rows - number of rows estimated to arrive at the group/unique + * filter step + * + * Given the lack of any cross-correlation statistics in the system, it's + * impossible to do anything really trustworthy with GROUP BY conditions + * involving multiple Vars. We should however avoid assuming the worst + * case (all possible cross-product terms actually appear as groups) since + * very often the grouped-by Vars are highly correlated. Our current approach + * is as follows: + * 1. Reduce the given expressions to a list of unique Vars used. For + * example, GROUP BY a, a + b is treated the same as GROUP BY a, b. + * It is clearly correct not to count the same Var more than once. + * It is also reasonable to treat f(x) the same as x: f() cannot + * increase the number of distinct values (unless it is volatile, + * which we consider unlikely for grouping), but it probably won't + * reduce the number of distinct values much either. + * 2. If the list contains Vars of different relations that are known equal + * due to equijoin clauses, then drop all but one of the Vars from each + * known-equal set, keeping the one with smallest estimated # of values + * (since the extra values of the others can't appear in joined rows). + * Note the reason we only consider Vars of different relations is that + * if we considered ones of the same rel, we'd be double-counting the + * restriction selectivity of the equality in the next step. + * 3. For Vars within a single source rel, we multiply together the numbers + * of values, clamp to the number of rows in the rel, and then multiply + * by the selectivity of the restriction clauses for that rel. The + * initial product is probably too high (it's the worst case) but since + * we can clamp to the rel's rows it won't be hugely bad. Multiplying + * by the restriction selectivity is effectively assuming that the + * restriction clauses are independent of the grouping, which is a crummy + * assumption, but it's hard to do better. + * 4. If there are Vars from multiple rels, we repeat step 3 for each such + * rel, and multiply the results together. + * Note that rels not containing grouped Vars are ignored completely, as are + * join clauses other than the equijoin clauses used in step 2. Such rels + * cannot increase the number of groups, and we assume such clauses do not + * reduce the number either (somewhat bogus, but we don't have the info to + * do better). + */ +double +estimate_num_groups(Query *root, List *groupClauses, double input_rows) +{ + List *allvars = NIL; + List *varinfos = NIL; + double numdistinct; + List *l; + typedef struct { /* varinfos is a List of these */ + Var *var; + double ndistinct; + } MyVarInfo; + + /* We should not be called unless query has GROUP BY (or DISTINCT) */ + Assert(groupClauses != NIL); + + /* Step 1: get the unique Vars used */ + foreach(l, groupClauses) + { + GroupClause *grpcl = (GroupClause *) lfirst(l); + Node *groupexpr = get_sortgroupclause_expr(grpcl, + root->targetList); + List *varshere; + + varshere = pull_var_clause(groupexpr, false); + /* + * Replace any JOIN alias Vars with the underlying Vars. (This + * is not really right for FULL JOIN ...) + */ + if (root->hasJoinRTEs) + { + varshere = (List *) flatten_join_alias_vars((Node *) varshere, + root->rtable, + true); + varshere = pull_var_clause((Node *) varshere, false); + } + /* + * If we find any variable-free GROUP BY item, then either it is + * a constant (and we can ignore it) or it contains a volatile + * function; in the latter case we punt and assume that each input + * row will yield a distinct group. + */ + if (varshere == NIL) + { + if (contain_volatile_functions(groupexpr)) + return input_rows; + continue; + } + allvars = nconc(allvars, varshere); + } + + /* If now no Vars, we must have an all-constant GROUP BY list. */ + if (allvars == NIL) + return 1.0; + + /* Use set_union() to discard duplicates */ + allvars = set_union(NIL, allvars); + + /* + * Step 2: acquire statistical estimate of number of distinct values + * of each Var (total in its table, without regard for filtering). + * Also, detect known-equal Vars and discard the ones we don't want. + */ + foreach(l, allvars) + { + Var *var = (Var *) lfirst(l); + Oid relid = getrelid(var->varno, root->rtable); + HeapTuple statsTuple = NULL; + Form_pg_statistic stats = NULL; + double ndistinct; + bool keep = true; + List *l2; + + if (OidIsValid(relid)) + { + statsTuple = SearchSysCache(STATRELATT, + ObjectIdGetDatum(relid), + Int16GetDatum(var->varattno), + 0, 0); + if (HeapTupleIsValid(statsTuple)) + stats = (Form_pg_statistic) GETSTRUCT(statsTuple); + } + ndistinct = get_att_numdistinct(root, var, stats); + if (HeapTupleIsValid(statsTuple)) + ReleaseSysCache(statsTuple); + + foreach(l2, varinfos) + { + MyVarInfo *varinfo = (MyVarInfo *) lfirst(l2); + + if (var->varno != varinfo->var->varno && + vars_known_equal(root, var, varinfo->var)) + { + /* Found a match */ + if (varinfo->ndistinct <= ndistinct) + { + /* Keep older item, forget new one */ + keep = false; + break; + } + else + { + /* + * Delete the older item. We assume lremove() will not + * break the lnext link of the item... + */ + varinfos = lremove(varinfo, varinfos); + } + } + } + + if (keep) + { + MyVarInfo *varinfo = (MyVarInfo *) palloc(sizeof(MyVarInfo)); + + varinfo->var = var; + varinfo->ndistinct = ndistinct; + varinfos = lcons(varinfo, varinfos); + } + } + + /* + * Steps 3/4: group Vars by relation and estimate total numdistinct. + * + * For each iteration of the outer loop, we process the frontmost + * Var in varinfos, plus all other Vars in the same relation. We + * remove these Vars from the newvarinfos list for the next iteration. + * This is the easiest way to group Vars of same rel together. + */ + Assert(varinfos != NIL); + numdistinct = 1.0; + + do + { + MyVarInfo *varinfo1 = (MyVarInfo *) lfirst(varinfos); + RelOptInfo *rel = find_base_rel(root, varinfo1->var->varno); + double reldistinct = varinfo1->ndistinct; + List *newvarinfos = NIL; + + /* + * Get the largest numdistinct estimate of the Vars for this rel. + * Also, construct new varinfos list of remaining Vars. + */ + foreach(l, lnext(varinfos)) + { + MyVarInfo *varinfo2 = (MyVarInfo *) lfirst(l); + + if (varinfo2->var->varno == varinfo1->var->varno) + { + reldistinct *= varinfo2->ndistinct; + } + else + { + /* not time to process varinfo2 yet */ + newvarinfos = lcons(varinfo2, newvarinfos); + } + } + + /* + * Clamp to size of rel, multiply by restriction selectivity. + */ + Assert(rel->reloptkind == RELOPT_BASEREL); + if (reldistinct > rel->tuples) + reldistinct = rel->tuples; + reldistinct *= rel->rows / rel->tuples; + + /* + * Update estimate of total distinct groups. + */ + numdistinct *= reldistinct; + + varinfos = newvarinfos; + } while (varinfos != NIL); + + /* Guard against out-of-range answers */ + if (numdistinct > input_rows) + numdistinct = input_rows; + if (numdistinct < 1.0) + numdistinct = 1.0; + + return numdistinct; +} + + +/*------------------------------------------------------------------------- + * + * Support routines + * + *------------------------------------------------------------------------- + */ + +/* * get_var_maximum * Estimate the maximum value of the specified variable. * If successful, store value in *max and return TRUE. @@ -3271,7 +3519,7 @@ pattern_selectivity(Const *patt, Pattern_Type ptype) /* - * We want test whether the database's LC_COLLATE setting is safe for + * We want to test whether the database's LC_COLLATE setting is safe for * LIKE/regexp index optimization. * * The key requirement here is that given a prefix string, say "foo", @@ -3284,7 +3532,7 @@ pattern_selectivity(Const *patt, Pattern_Type ptype) * * (In theory, locales other than C may be LIKE-safe so this function * could be different from lc_collate_is_c(), but in a different - * theory, non-C locales are completely unpredicable so it's unlikely + * theory, non-C locales are completely unpredictable so it's unlikely * to happen.) * * Be sure to maintain the correspondence with the code in initdb. |