Diffstat (limited to 'src/backend/optimizer/path/costsize.c')
-rw-r--r--   src/backend/optimizer/path/costsize.c | 180
1 file changed, 151 insertions, 29 deletions
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index b4379e4b39b..65c211deaee 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -42,7 +42,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *    $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.74 2001/05/20 20:28:18 tgl Exp $
+ *    $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.75 2001/06/05 05:26:04 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -83,7 +83,9 @@
 bool        enable_mergejoin = true;
 bool        enable_hashjoin = true;
 
+static Selectivity estimate_hash_bucketsize(Query *root, Var *var);
 static bool cost_qual_eval_walker(Node *node, Cost *total);
+static Selectivity approx_selectivity(Query *root, List *quals);
 static void set_rel_width(Query *root, RelOptInfo *rel);
 static double relation_byte_size(double tuples, int width);
 static double page_size(double tuples, int width);
@@ -99,7 +101,8 @@ static double page_size(double tuples, int width);
  * parameters, even though much of it could be extracted from the Path.
  */
 void
-cost_seqscan(Path *path, RelOptInfo *baserel)
+cost_seqscan(Path *path, Query *root,
+             RelOptInfo *baserel)
 {
     Cost        startup_cost = 0;
     Cost        run_cost = 0;
@@ -356,10 +359,11 @@ cost_index(Path *path, Query *root,
 
 /*
  * cost_tidscan
- *    Determines and returns the cost of scanning a relation using tid-s.
+ *    Determines and returns the cost of scanning a relation using TIDs.
  */
 void
-cost_tidscan(Path *path, RelOptInfo *baserel, List *tideval)
+cost_tidscan(Path *path, Query *root,
+             RelOptInfo *baserel, List *tideval)
 {
     Cost        startup_cost = 0;
     Cost        run_cost = 0;
@@ -417,7 +421,8 @@ cost_tidscan(Path *path, RelOptInfo *baserel, List *tideval)
  * but if it ever does, it should react gracefully to lack of key data.
  */
 void
-cost_sort(Path *path, List *pathkeys, double tuples, int width)
+cost_sort(Path *path, Query *root,
+          List *pathkeys, double tuples, int width)
 {
     Cost        startup_cost = 0;
     Cost        run_cost = 0;
@@ -479,7 +484,7 @@ cost_sort(Path *path, List *pathkeys, double tuples, int width)
  * 'restrictlist' are the RestrictInfo nodes to be applied at the join
  */
 void
-cost_nestloop(Path *path,
+cost_nestloop(Path *path, Query *root,
               Path *outer_path,
               Path *inner_path,
               List *restrictlist)
@@ -510,7 +515,8 @@ cost_nestloop(Path *path,
     run_cost += outer_path->parent->rows *
         (inner_path->total_cost - inner_path->startup_cost);
     if (outer_path->parent->rows > 1)
-        run_cost += (outer_path->parent->rows - 1) * inner_path->startup_cost;
+        run_cost += (outer_path->parent->rows - 1) *
+            inner_path->startup_cost * 0.5;
 
     /*
      * Number of tuples processed (not number emitted!).  If inner path is
@@ -540,15 +546,18 @@ cost_nestloop(Path *path,
  * 'outer_path' is the path for the outer relation
  * 'inner_path' is the path for the inner relation
  * 'restrictlist' are the RestrictInfo nodes to be applied at the join
+ * 'mergeclauses' are the RestrictInfo nodes to use as merge clauses
+ *        (this should be a subset of the restrictlist)
  * 'outersortkeys' and 'innersortkeys' are lists of the keys to be used
  *        to sort the outer and inner relations, or NIL if no explicit
  *        sort is needed because the source path is already ordered
  */
 void
-cost_mergejoin(Path *path,
+cost_mergejoin(Path *path, Query *root,
                Path *outer_path,
                Path *inner_path,
                List *restrictlist,
+               List *mergeclauses,
                List *outersortkeys,
                List *innersortkeys)
 {
@@ -573,6 +582,7 @@ cost_mergejoin(Path *path,
     {
         startup_cost += outer_path->total_cost;
         cost_sort(&sort_path,
+                  root,
                   outersortkeys,
                   outer_path->parent->rows,
                   outer_path->parent->width);
@@ -589,6 +599,7 @@ cost_mergejoin(Path *path,
     {
         startup_cost += inner_path->total_cost;
         cost_sort(&sort_path,
+                  root,
                   innersortkeys,
                   inner_path->parent->rows,
                   inner_path->parent->width);
@@ -602,12 +613,24 @@ cost_mergejoin(Path *path,
     }
 
     /*
-     * Estimate the number of tuples to be processed in the mergejoin
-     * itself as one per tuple in the two source relations.  This could be
-     * a drastic underestimate if there are many equal-keyed tuples in
-     * either relation, but we have no good way of estimating that...
+     * The number of tuple comparisons needed depends drastically on the
+     * number of equal keys in the two source relations, which we have no
+     * good way of estimating.  Somewhat arbitrarily, we charge one
+     * tuple comparison (one cpu_operator_cost) for each tuple in the
+     * two source relations.  This is probably a lower bound.
      */
-    ntuples = outer_path->parent->rows + inner_path->parent->rows;
+    run_cost += cpu_operator_cost *
+        (outer_path->parent->rows + inner_path->parent->rows);
+
+    /*
+     * For each tuple that gets through the mergejoin proper, we charge
+     * cpu_tuple_cost plus the cost of evaluating additional restriction
+     * clauses that are to be applied at the join.  It's OK to use an
+     * approximate selectivity here, since in most cases this is a minor
+     * component of the cost.
+     */
+    ntuples = approx_selectivity(root, mergeclauses) *
+        outer_path->parent->rows * inner_path->parent->rows;
 
     /* CPU costs */
     cpu_per_tuple = cpu_tuple_cost + cost_qual_eval(restrictlist);
@@ -625,15 +648,15 @@ cost_mergejoin(Path *path,
  * 'outer_path' is the path for the outer relation
  * 'inner_path' is the path for the inner relation
  * 'restrictlist' are the RestrictInfo nodes to be applied at the join
- * 'innerbucketsize' is an estimate of the bucketsize statistic
- *        for the inner hash key.
+ * 'hashclauses' is a list of the hash join clause (always a 1-element list)
+ *        (this should be a subset of the restrictlist)
  */
 void
-cost_hashjoin(Path *path,
+cost_hashjoin(Path *path, Query *root,
               Path *outer_path,
               Path *inner_path,
               List *restrictlist,
-              Selectivity innerbucketsize)
+              List *hashclauses)
 {
     Cost        startup_cost = 0;
     Cost        run_cost = 0;
@@ -644,6 +667,10 @@ cost_hashjoin(Path *path,
     double      innerbytes = relation_byte_size(inner_path->parent->rows,
                                                 inner_path->parent->width);
     long        hashtablebytes = SortMem * 1024L;
+    RestrictInfo *restrictinfo;
+    Var        *left,
+               *right;
+    Selectivity innerbucketsize;
 
     if (!enable_hashjoin)
         startup_cost += disable_cost;
@@ -658,6 +685,46 @@ cost_hashjoin(Path *path,
     run_cost += cpu_operator_cost * outer_path->parent->rows;
 
     /*
+     * Determine bucketsize fraction for inner relation.  First we have
+     * to figure out which side of the hashjoin clause is the inner side.
+     */
+    Assert(length(hashclauses) == 1);
+    Assert(IsA(lfirst(hashclauses), RestrictInfo));
+    restrictinfo = (RestrictInfo *) lfirst(hashclauses);
+    /* these must be OK, since check_hashjoinable accepted the clause */
+    left = get_leftop(restrictinfo->clause);
+    right = get_rightop(restrictinfo->clause);
+
+    /*
+     * Since we tend to visit the same clauses over and over when
+     * planning a large query, we cache the bucketsize estimate in
+     * the RestrictInfo node to avoid repeated lookups of statistics.
+     */
+    if (intMember(right->varno, inner_path->parent->relids))
+    {
+        /* righthand side is inner */
+        innerbucketsize = restrictinfo->right_bucketsize;
+        if (innerbucketsize < 0)
+        {
+            /* not cached yet */
+            innerbucketsize = estimate_hash_bucketsize(root, right);
+            restrictinfo->right_bucketsize = innerbucketsize;
+        }
+    }
+    else
+    {
+        Assert(intMember(left->varno, inner_path->parent->relids));
+        /* lefthand side is inner */
+        innerbucketsize = restrictinfo->left_bucketsize;
+        if (innerbucketsize < 0)
+        {
+            /* not cached yet */
+            innerbucketsize = estimate_hash_bucketsize(root, left);
+            restrictinfo->left_bucketsize = innerbucketsize;
+        }
+    }
+
+    /*
      * The number of tuple comparisons needed is the number of outer
      * tuples times the typical number of tuples in a hash bucket,
      * which is the inner relation size times its bucketsize fraction.
@@ -667,14 +734,14 @@ cost_hashjoin(Path *path,
         ceil(inner_path->parent->rows * innerbucketsize);
 
     /*
-     * Estimate the number of tuples that get through the hashing filter
-     * as one per tuple in the two source relations.  This could be a
-     * drastic underestimate if there are many equal-keyed tuples in
-     * either relation, but we have no simple way of estimating that;
-     * and since this is only a second-order parameter, it's probably
-     * not worth expending a lot of effort on the estimate.
+     * For each tuple that gets through the hashjoin proper, we charge
+     * cpu_tuple_cost plus the cost of evaluating additional restriction
+     * clauses that are to be applied at the join.  It's OK to use an
+     * approximate selectivity here, since in most cases this is a minor
+     * component of the cost.
      */
-    ntuples = outer_path->parent->rows + inner_path->parent->rows;
+    ntuples = approx_selectivity(root, hashclauses) *
+        outer_path->parent->rows * inner_path->parent->rows;
 
     /* CPU costs */
     cpu_per_tuple = cpu_tuple_cost + cost_qual_eval(restrictlist);
@@ -718,10 +785,6 @@ cost_hashjoin(Path *path,
  * divided by total tuples in relation) if the specified Var is used
  * as a hash key.
  *
- * This statistic is used by cost_hashjoin.  We split out the calculation
- * because it's useful to cache the result for re-use across multiple path
- * cost calculations.
- *
  * XXX This is really pretty bogus since we're effectively assuming that the
  * distribution of hash keys will be the same after applying restriction
  * clauses as it was in the underlying relation.  However, we are not nearly
@@ -747,7 +810,7 @@ cost_hashjoin(Path *path,
  * which is what we want.  We do not want to hash unless we know that the
  * inner rel is well-dispersed (or the alternatives seem much worse).
  */
-Selectivity
+static Selectivity
 estimate_hash_bucketsize(Query *root, Var *var)
 {
     Oid         relid;
@@ -1001,6 +1064,65 @@ cost_qual_eval_walker(Node *node, Cost *total)
 
 
 /*
+ * approx_selectivity
+ *        Quick-and-dirty estimation of clause selectivities.
+ *        The input can be either an implicitly-ANDed list of boolean
+ *        expressions, or a list of RestrictInfo nodes (typically the latter).
+ *
+ * The "quick" part comes from caching the selectivity estimates so we can
+ * avoid recomputing them later.  (Since the same clauses are typically
+ * examined over and over in different possible join trees, this makes a
+ * big difference.)
+ *
+ * The "dirty" part comes from the fact that the selectivities of multiple
+ * clauses are estimated independently and multiplied together.  Currently,
+ * clauselist_selectivity can seldom do any better than that anyhow, but
+ * someday it might be smarter.
+ *
+ * Since we are only using the results to estimate how many potential
+ * output tuples are generated and passed through qpqual checking, it
+ * seems OK to live with the approximation.
+ */
+static Selectivity
+approx_selectivity(Query *root, List *quals)
+{
+    Selectivity total = 1.0;
+    List       *l;
+
+    foreach(l, quals)
+    {
+        Node       *qual = (Node *) lfirst(l);
+        Selectivity selec;
+
+        /*
+         * RestrictInfo nodes contain a this_selec field reserved for this
+         * routine's use, so that it's not necessary to evaluate the qual
+         * clause's selectivity more than once.  If the clause's selectivity
+         * hasn't been computed yet, the field will contain -1.
+         */
+        if (qual && IsA(qual, RestrictInfo))
+        {
+            RestrictInfo *restrictinfo = (RestrictInfo *) qual;
+
+            if (restrictinfo->this_selec < 0)
+                restrictinfo->this_selec =
+                    clause_selectivity(root,
+                                       (Node *) restrictinfo->clause,
+                                       0);
+            selec = restrictinfo->this_selec;
+        }
+        else
+        {
+            /* If it's a bare expression, must always do it the hard way */
+            selec = clause_selectivity(root, qual, 0);
+        }
+        total *= selec;
+    }
+    return total;
+}
+
+
+/*
  * set_baserel_size_estimates
  *        Set the size estimates for the given base relation.
  *
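The cost_nestloop hunk above changes how inner-path restarts are charged: the first scan of the inner relation pays its full startup cost, while each of the remaining (outer rows - 1) rescans is charged only half of it. The following standalone sketch is not part of the patch; the function and variable names are illustrative stand-ins for the planner's Path fields, showing only that piece of the arithmetic.

#include <stdio.h>

typedef double Cost;

/* Inner-path portion of the nestloop run cost, per the patched formula. */
static Cost
nestloop_inner_run_cost(double outer_rows, Cost inner_startup, Cost inner_total)
{
    Cost    run_cost = 0;

    /* every outer row pays the inner path's per-scan run cost */
    run_cost += outer_rows * (inner_total - inner_startup);

    /* rescans after the first are charged half the inner startup cost */
    if (outer_rows > 1)
        run_cost += (outer_rows - 1) * inner_startup * 0.5;

    return run_cost;
}

int
main(void)
{
    /* e.g. 1000 outer rows against an inner path costing 5.0 startup, 25.0 total */
    printf("%g\n", nestloop_inner_run_cost(1000.0, 5.0, 25.0));     /* 22497.5 */
    return 0;
}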
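Both join costers now split their CPU charge into two parts: the comparison work of the merge/probe phase itself, and a per-emitted-tuple charge sized by the approximate selectivity of the join clauses times the product of the input row counts. Below is a rough standalone sketch of the mergejoin case, with made-up constants standing in for the planner's cpu_operator_cost and cpu_tuple_cost parameters; it is an illustration, not the planner's own code.

#include <stdio.h>

typedef double Cost;
typedef double Selectivity;

/* Illustrative values; the real planner reads these from its cost settings. */
static const Cost cpu_operator_cost = 0.0025;
static const Cost cpu_tuple_cost = 0.01;

/* CPU portion of the mergejoin run cost, following the patched scheme. */
static Cost
mergejoin_cpu_cost(double outer_rows, double inner_rows,
                   Selectivity mergeclause_selec, Cost qual_cost_per_tuple)
{
    Cost    run_cost = 0;
    double  ntuples;

    /* one comparison charged per tuple in each input (admittedly a lower bound) */
    run_cost += cpu_operator_cost * (outer_rows + inner_rows);

    /* tuples surviving the merge clauses pay per-tuple CPU plus the extra quals */
    ntuples = mergeclause_selec * outer_rows * inner_rows;
    run_cost += (cpu_tuple_cost + qual_cost_per_tuple) * ntuples;

    return run_cost;
}

int
main(void)
{
    /* 10000 x 10000 join with 0.0001-selective merge clauses and no extra quals */
    printf("%g\n", mergejoin_cpu_cost(10000.0, 10000.0, 0.0001, 0.0));  /* 150 */
    return 0;
}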
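cost_hashjoin now looks up the inner key's bucketsize statistic itself and memoizes the result in the RestrictInfo node (left_bucketsize/right_bucketsize), with a negative value meaning "not computed yet"; approx_selectivity applies the same trick to this_selec. Reduced to a standalone sketch with hypothetical names (CachedClause is not a real planner type), the pattern looks like this:

typedef double Selectivity;

/* Hypothetical stand-in for the cached fields added to RestrictInfo. */
typedef struct CachedClause
{
    Selectivity left_bucketsize;    /* -1 means "not computed yet" */
    Selectivity right_bucketsize;   /* -1 means "not computed yet" */
} CachedClause;

/* Pretend this is the expensive statistics lookup. */
static Selectivity
expensive_bucketsize_lookup(int inner_is_right)
{
    return inner_is_right ? 0.10 : 0.05;
}

/* Compute the estimate once, then serve later calls from the cache. */
static Selectivity
cached_bucketsize(CachedClause *clause, int inner_is_right)
{
    Selectivity *slot = inner_is_right ? &clause->right_bucketsize
                                       : &clause->left_bucketsize;

    if (*slot < 0)
        *slot = expensive_bucketsize_lookup(inner_is_right);
    return *slot;
}

For the caching to work, the fields have to start out negative; presumably the planner initializes them to -1 wherever the RestrictInfo node is built, which is outside the hunks shown here.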
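As its comment says, approx_selectivity treats the clauses as independent and simply multiplies their individual selectivities, so clauses estimated at 0.10 and 0.25 combine to 0.025. The combination step, as a standalone sketch (plain doubles stand in for the per-clause estimates that the real code obtains from clause_selectivity):

typedef double Selectivity;

/* Multiply independently estimated clause selectivities, as the new
 * approx_selectivity does once each per-clause estimate is in hand. */
static Selectivity
combine_selectivities(const Selectivity *selecs, int nclauses)
{
    Selectivity total = 1.0;
    int         i;

    for (i = 0; i < nclauses; i++)
        total *= selecs[i];
    return total;
}

The "quick" half of the function's comment is the per-clause caching shown earlier; the "dirty" half is exactly this independence assumption.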