diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2017-09-13 11:12:39 -0400 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2017-09-13 11:12:39 -0400 |
commit | 7d08ce286cd5854d58152e428c28636a616bdc42 (patch) | |
tree | 2e4f6f2ce25df95b86a1becf7a09935334ce5d90 /src/backend/utils | |
parent | 089880ba9af5f95e1a3b050874a90dbe5c33fd61 (diff) | |
download | postgresql-7d08ce286cd5854d58152e428c28636a616bdc42.tar.gz postgresql-7d08ce286cd5854d58152e428c28636a616bdc42.zip |
Distinguish selectivity of < from <= and > from >=.
Historically, the selectivity functions have simply not distinguished
< from <=, or > from >=, arguing that the fraction of the population that
satisfies the "=" aspect can be considered to be vanishingly small, if the
comparison value isn't any of the most-common-values for the variable.
(If it is, the code path that executes the operator against each MCV will
take care of things properly.) But that isn't really true unless we're
dealing with a continuum of variable values, and in practice we seldom are.
If "x = const" would estimate a nonzero number of rows for a given const
value, then it follows that we ought to estimate different numbers of rows
for "x < const" and "x <= const", even if the const is not one of the MCVs.
Handling this more honestly makes a significant difference in edge cases,
such as the estimate for a tight range (x BETWEEN y AND z where y and z
are close together).
Hence, split scalarltsel into scalarltsel/scalarlesel, and similarly
split scalargtsel into scalargtsel/scalargesel. Adjust <= and >=
operator definitions to reference the new selectivity functions.
Improve the core ineq_histogram_selectivity() function to make a
correction for equality. (Along the way, I learned quite a bit about
exactly why that function gives good answers, which I tried to memorialize
in improved comments.)
The corresponding join selectivity functions were, and remain, just stubs.
But I chose to split them similarly, to avoid confusion and to prevent the
need for doing this exercise again if someone ever makes them less stubby.
In passing, change ineq_histogram_selectivity's clamp for extreme
probability estimates so that it varies depending on the histogram
size, instead of being hardwired at 0.0001. With the default histogram
size of 100 entries, you still get the old clamp value, but bigger
histograms should allow us to put more faith in edge values.
Tom Lane, reviewed by Aleksander Alekseev and Kuntal Ghosh
Discussion: https://postgr.es/m/12232.1499140410@sss.pgh.pa.us
Diffstat (limited to 'src/backend/utils')
-rw-r--r-- | src/backend/utils/adt/network.c | 4 | ||||
-rw-r--r-- | src/backend/utils/adt/selfuncs.c | 323 |
2 files changed, 205 insertions, 122 deletions
diff --git a/src/backend/utils/adt/network.c b/src/backend/utils/adt/network.c index ec4ac20bb7b..aac76217173 100644 --- a/src/backend/utils/adt/network.c +++ b/src/backend/utils/adt/network.c @@ -957,8 +957,8 @@ convert_network_to_scalar(Datum value, Oid typid) } /* - * Can't get here unless someone tries to use scalarltsel/scalargtsel on - * an operator with one network and one non-network operand. + * Can't get here unless someone tries to use scalarineqsel() on an + * operator with one network and one non-network operand. */ elog(ERROR, "unsupported type: %u", typid); return 0; diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 81b0bc37d27..db1792bf8d4 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -164,7 +164,7 @@ static double var_eq_non_const(VariableStatData *vardata, Oid operator, bool varonleft, bool negate); static double ineq_histogram_selectivity(PlannerInfo *root, VariableStatData *vardata, - FmgrInfo *opproc, bool isgt, + FmgrInfo *opproc, bool isgt, bool iseq, Datum constval, Oid consttype); static double eqjoinsel_inner(Oid operator, VariableStatData *vardata1, VariableStatData *vardata2); @@ -545,18 +545,21 @@ neqsel(PG_FUNCTION_ARGS) /* * scalarineqsel - Selectivity of "<", "<=", ">", ">=" for scalars. * - * This is the guts of both scalarltsel and scalargtsel. The caller has - * commuted the clause, if necessary, so that we can treat the variable as - * being on the left. The caller must also make sure that the other side - * of the clause is a non-null Const, and dissect same into a value and - * datatype. + * This is the guts of scalarltsel/scalarlesel/scalargtsel/scalargesel. + * The isgt and iseq flags distinguish which of the four cases apply. + * + * The caller has commuted the clause, if necessary, so that we can treat + * the variable as being on the left. The caller must also make sure that + * the other side of the clause is a non-null Const, and dissect that into + * a value and datatype. (This definition simplifies some callers that + * want to estimate against a computed value instead of a Const node.) * * This routine works for any datatype (or pair of datatypes) known to * convert_to_scalar(). If it is applied to some other datatype, * it will return a default estimate. */ static double -scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, +scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, bool iseq, VariableStatData *vardata, Datum constval, Oid consttype) { Form_pg_statistic stats; @@ -588,7 +591,8 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, * If there is a histogram, determine which bin the constant falls in, and * compute the resulting contribution to selectivity. */ - hist_selec = ineq_histogram_selectivity(root, vardata, &opproc, isgt, + hist_selec = ineq_histogram_selectivity(root, vardata, + &opproc, isgt, iseq, constval, consttype); /* @@ -758,7 +762,8 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc, * ineq_histogram_selectivity - Examine the histogram for scalarineqsel * * Determine the fraction of the variable's histogram population that - * satisfies the inequality condition, ie, VAR < CONST or VAR > CONST. + * satisfies the inequality condition, ie, VAR < (or <=, >, >=) CONST. + * The isgt and iseq flags distinguish which of the four cases apply. * * Returns -1 if there is no histogram (valid results will always be >= 0). * @@ -769,7 +774,7 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc, static double ineq_histogram_selectivity(PlannerInfo *root, VariableStatData *vardata, - FmgrInfo *opproc, bool isgt, + FmgrInfo *opproc, bool isgt, bool iseq, Datum constval, Oid consttype) { double hist_selec; @@ -796,11 +801,17 @@ ineq_histogram_selectivity(PlannerInfo *root, if (sslot.nvalues > 1) { /* - * Use binary search to find proper location, ie, the first slot - * at which the comparison fails. (If the given operator isn't - * actually sort-compatible with the histogram, you'll get garbage - * results ... but probably not any more garbage-y than you would - * from the old linear search.) + * Use binary search to find the desired location, namely the + * right end of the histogram bin containing the comparison value, + * which is the leftmost entry for which the comparison operator + * succeeds (if isgt) or fails (if !isgt). (If the given operator + * isn't actually sort-compatible with the histogram, you'll get + * garbage results ... but probably not any more garbage-y than + * you would have from the old linear search.) + * + * In this loop, we pay no attention to whether the operator iseq + * or not; that detail will be mopped up below. (We cannot tell, + * anyway, whether the operator thinks the values are equal.) * * If the binary search accesses the first or last histogram * entry, we try to replace that endpoint with the true column min @@ -865,25 +876,74 @@ ineq_histogram_selectivity(PlannerInfo *root, if (lobound <= 0) { - /* Constant is below lower histogram boundary. */ + /* + * Constant is below lower histogram boundary. More + * precisely, we have found that no entry in the histogram + * satisfies the inequality clause (if !isgt) or they all do + * (if isgt). We estimate that that's true of the entire + * table, so set histfrac to 0.0 (which we'll flip to 1.0 + * below, if isgt). + */ histfrac = 0.0; } else if (lobound >= sslot.nvalues) { - /* Constant is above upper histogram boundary. */ + /* + * Inverse case: constant is above upper histogram boundary. + */ histfrac = 1.0; } else { + /* We have values[i-1] <= constant <= values[i]. */ int i = lobound; + double eq_selec = 0; double val, high, low; double binfrac; /* - * We have values[i-1] <= constant <= values[i]. + * In the cases where we'll need it below, obtain an estimate + * of the selectivity of "x = constval". We use a calculation + * similar to what var_eq_const() does for a non-MCV constant, + * ie, estimate that all distinct non-MCV values occur equally + * often. But multiplication by "1.0 - sumcommon - nullfrac" + * will be done by our caller, so we shouldn't do that here. + * Therefore we can't try to clamp the estimate by reference + * to the least common MCV; the result would be too small. * + * Note: since this is effectively assuming that constval + * isn't an MCV, it's logically dubious if constval in fact is + * one. But we have to apply *some* correction for equality, + * and anyway we cannot tell if constval is an MCV, since we + * don't have a suitable equality operator at hand. + */ + if (i == 1 || isgt == iseq) + { + double otherdistinct; + bool isdefault; + AttStatsSlot mcvslot; + + /* Get estimated number of distinct values */ + otherdistinct = get_variable_numdistinct(vardata, + &isdefault); + + /* Subtract off the number of known MCVs */ + if (get_attstatsslot(&mcvslot, vardata->statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_NUMBERS)) + { + otherdistinct -= mcvslot.nnumbers; + free_attstatsslot(&mcvslot); + } + + /* If result doesn't seem sane, leave eq_selec at 0 */ + if (otherdistinct > 1) + eq_selec = 1.0 / otherdistinct; + } + + /* * Convert the constant and the two nearest bin boundary * values to a uniform comparison scale, and do a linear * interpolation within this bin. @@ -937,13 +997,54 @@ ineq_histogram_selectivity(PlannerInfo *root, */ histfrac = (double) (i - 1) + binfrac; histfrac /= (double) (sslot.nvalues - 1); + + /* + * At this point, histfrac is an estimate of the fraction of + * the population represented by the histogram that satisfies + * "x <= constval". Somewhat remarkably, this statement is + * true regardless of which operator we were doing the probes + * with, so long as convert_to_scalar() delivers reasonable + * results. If the probe constant is equal to some histogram + * entry, we would have considered the bin to the left of that + * entry if probing with "<" or ">=", or the bin to the right + * if probing with "<=" or ">"; but binfrac would have come + * out as 1.0 in the first case and 0.0 in the second, leading + * to the same histfrac in either case. For probe constants + * between histogram entries, we find the same bin and get the + * same estimate with any operator. + * + * The fact that the estimate corresponds to "x <= constval" + * and not "x < constval" is because of the way that ANALYZE + * constructs the histogram: each entry is, effectively, the + * rightmost value in its sample bucket. So selectivity + * values that are exact multiples of 1/(histogram_size-1) + * should be understood as estimates including a histogram + * entry plus everything to its left. + * + * However, that breaks down for the first histogram entry, + * which necessarily is the leftmost value in its sample + * bucket. That means the first histogram bin is slightly + * narrower than the rest, by an amount equal to eq_selec. + * Another way to say that is that we want "x <= leftmost" to + * be estimated as eq_selec not zero. So, if we're dealing + * with the first bin (i==1), rescale to make that true while + * adjusting the rest of that bin linearly. + */ + if (i == 1) + histfrac += eq_selec * (1.0 - binfrac); + + /* + * "x <= constval" is good if we want an estimate for "<=" or + * ">", but if we are estimating for "<" or ">=", we now need + * to decrease the estimate by eq_selec. + */ + if (isgt == iseq) + histfrac -= eq_selec; } /* - * Now histfrac = fraction of histogram entries below the - * constant. - * - * Account for "<" vs ">" + * Now the estimate is finished for "<" and "<=" cases. If we are + * estimating for ">" or ">=", flip it. */ hist_selec = isgt ? (1.0 - histfrac) : histfrac; @@ -951,16 +1052,21 @@ ineq_histogram_selectivity(PlannerInfo *root, * The histogram boundaries are only approximate to begin with, * and may well be out of date anyway. Therefore, don't believe * extremely small or large selectivity estimates --- unless we - * got actual current endpoint values from the table. + * got actual current endpoint values from the table, in which + * case just do the usual sanity clamp. Somewhat arbitrarily, we + * set the cutoff for other cases at a hundredth of the histogram + * resolution. */ if (have_end) CLAMP_PROBABILITY(hist_selec); else { - if (hist_selec < 0.0001) - hist_selec = 0.0001; - else if (hist_selec > 0.9999) - hist_selec = 0.9999; + double cutoff = 0.01 / (double) (sslot.nvalues - 1); + + if (hist_selec < cutoff) + hist_selec = cutoff; + else if (hist_selec > 1.0 - cutoff) + hist_selec = 1.0 - cutoff; } } @@ -971,10 +1077,11 @@ ineq_histogram_selectivity(PlannerInfo *root, } /* - * scalarltsel - Selectivity of "<" (also "<=") for scalars. + * Common wrapper function for the selectivity estimators that simply + * invoke scalarineqsel(). */ -Datum -scalarltsel(PG_FUNCTION_ARGS) +static Datum +scalarineqsel_wrapper(PG_FUNCTION_ARGS, bool isgt, bool iseq) { PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); Oid operator = PG_GETARG_OID(1); @@ -985,7 +1092,6 @@ scalarltsel(PG_FUNCTION_ARGS) bool varonleft; Datum constval; Oid consttype; - bool isgt; double selec; /* @@ -1020,14 +1126,8 @@ scalarltsel(PG_FUNCTION_ARGS) /* * Force the var to be on the left to simplify logic in scalarineqsel. */ - if (varonleft) + if (!varonleft) { - /* we have var < other */ - isgt = false; - } - else - { - /* we have other < var, commute to make var > other */ operator = get_commutator(operator); if (!operator) { @@ -1035,10 +1135,12 @@ scalarltsel(PG_FUNCTION_ARGS) ReleaseVariableStats(vardata); PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); } - isgt = true; + isgt = !isgt; } - selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype); + /* The rest of the work is done by scalarineqsel(). */ + selec = scalarineqsel(root, operator, isgt, iseq, + &vardata, constval, consttype); ReleaseVariableStats(vardata); @@ -1046,78 +1148,39 @@ scalarltsel(PG_FUNCTION_ARGS) } /* - * scalargtsel - Selectivity of ">" (also ">=") for integers. + * scalarltsel - Selectivity of "<" for scalars. */ Datum -scalargtsel(PG_FUNCTION_ARGS) +scalarltsel(PG_FUNCTION_ARGS) { - PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); - Oid operator = PG_GETARG_OID(1); - List *args = (List *) PG_GETARG_POINTER(2); - int varRelid = PG_GETARG_INT32(3); - VariableStatData vardata; - Node *other; - bool varonleft; - Datum constval; - Oid consttype; - bool isgt; - double selec; - - /* - * If expression is not variable op something or something op variable, - * then punt and return a default estimate. - */ - if (!get_restriction_variable(root, args, varRelid, - &vardata, &other, &varonleft)) - PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); - - /* - * Can't do anything useful if the something is not a constant, either. - */ - if (!IsA(other, Const)) - { - ReleaseVariableStats(vardata); - PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); - } - - /* - * If the constant is NULL, assume operator is strict and return zero, ie, - * operator will never return TRUE. - */ - if (((Const *) other)->constisnull) - { - ReleaseVariableStats(vardata); - PG_RETURN_FLOAT8(0.0); - } - constval = ((Const *) other)->constvalue; - consttype = ((Const *) other)->consttype; - - /* - * Force the var to be on the left to simplify logic in scalarineqsel. - */ - if (varonleft) - { - /* we have var > other */ - isgt = true; - } - else - { - /* we have other > var, commute to make var < other */ - operator = get_commutator(operator); - if (!operator) - { - /* Use default selectivity (should we raise an error instead?) */ - ReleaseVariableStats(vardata); - PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); - } - isgt = false; - } + return scalarineqsel_wrapper(fcinfo, false, false); +} - selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype); +/* + * scalarlesel - Selectivity of "<=" for scalars. + */ +Datum +scalarlesel(PG_FUNCTION_ARGS) +{ + return scalarineqsel_wrapper(fcinfo, false, true); +} - ReleaseVariableStats(vardata); +/* + * scalargtsel - Selectivity of ">" for scalars. + */ +Datum +scalargtsel(PG_FUNCTION_ARGS) +{ + return scalarineqsel_wrapper(fcinfo, true, false); +} - PG_RETURN_FLOAT8((float8) selec); +/* + * scalargesel - Selectivity of ">=" for scalars. + */ +Datum +scalargesel(PG_FUNCTION_ARGS) +{ + return scalarineqsel_wrapper(fcinfo, true, true); } /* @@ -2722,7 +2785,7 @@ neqjoinsel(PG_FUNCTION_ARGS) } /* - * scalarltjoinsel - Join selectivity of "<" and "<=" for scalars + * scalarltjoinsel - Join selectivity of "<" for scalars */ Datum scalarltjoinsel(PG_FUNCTION_ARGS) @@ -2731,7 +2794,16 @@ scalarltjoinsel(PG_FUNCTION_ARGS) } /* - * scalargtjoinsel - Join selectivity of ">" and ">=" for scalars + * scalarlejoinsel - Join selectivity of "<=" for scalars + */ +Datum +scalarlejoinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); +} + +/* + * scalargtjoinsel - Join selectivity of ">" for scalars */ Datum scalargtjoinsel(PG_FUNCTION_ARGS) @@ -2740,6 +2812,15 @@ scalargtjoinsel(PG_FUNCTION_ARGS) } /* + * scalargejoinsel - Join selectivity of ">=" for scalars + */ +Datum +scalargejoinsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); +} + +/* * patternjoinsel - Generic code for pattern-match join selectivity. */ static double @@ -3036,13 +3117,13 @@ mergejoinscansel(PlannerInfo *root, Node *clause, * fraction that's <= the right-side maximum value. But only believe * non-default estimates, else stick with our 1.0. */ - selec = scalarineqsel(root, leop, isgt, &leftvar, + selec = scalarineqsel(root, leop, isgt, true, &leftvar, rightmax, op_righttype); if (selec != DEFAULT_INEQ_SEL) *leftend = selec; /* And similarly for the right variable. */ - selec = scalarineqsel(root, revleop, isgt, &rightvar, + selec = scalarineqsel(root, revleop, isgt, true, &rightvar, leftmax, op_lefttype); if (selec != DEFAULT_INEQ_SEL) *rightend = selec; @@ -3066,13 +3147,13 @@ mergejoinscansel(PlannerInfo *root, Node *clause, * minimum value. But only believe non-default estimates, else stick with * our own default. */ - selec = scalarineqsel(root, ltop, isgt, &leftvar, + selec = scalarineqsel(root, ltop, isgt, false, &leftvar, rightmin, op_righttype); if (selec != DEFAULT_INEQ_SEL) *leftstart = selec; /* And similarly for the right variable. */ - selec = scalarineqsel(root, revltop, isgt, &rightvar, + selec = scalarineqsel(root, revltop, isgt, false, &rightvar, leftmin, op_lefttype); if (selec != DEFAULT_INEQ_SEL) *rightstart = selec; @@ -4029,8 +4110,8 @@ convert_numeric_to_scalar(Datum value, Oid typid) } /* - * Can't get here unless someone tries to use scalarltsel/scalargtsel on - * an operator with one numeric and one non-numeric operand. + * Can't get here unless someone tries to use scalarineqsel() on an + * operator with one numeric and one non-numeric operand. */ elog(ERROR, "unsupported type: %u", typid); return 0; @@ -4211,8 +4292,8 @@ convert_string_datum(Datum value, Oid typid) default: /* - * Can't get here unless someone tries to use scalarltsel on an - * operator with one string and one non-string operand. + * Can't get here unless someone tries to use scalarineqsel() on + * an operator with one string and one non-string operand. */ elog(ERROR, "unsupported type: %u", typid); return NULL; @@ -4416,8 +4497,8 @@ convert_timevalue_to_scalar(Datum value, Oid typid) } /* - * Can't get here unless someone tries to use scalarltsel/scalargtsel on - * an operator with one timevalue and one non-timevalue operand. + * Can't get here unless someone tries to use scalarineqsel() on an + * operator with one timevalue and one non-timevalue operand. */ elog(ERROR, "unsupported type: %u", typid); return 0; @@ -5806,7 +5887,8 @@ prefix_selectivity(PlannerInfo *root, VariableStatData *vardata, elog(ERROR, "no >= operator for opfamily %u", opfamily); fmgr_info(get_opcode(cmpopr), &opproc); - prefixsel = ineq_histogram_selectivity(root, vardata, &opproc, true, + prefixsel = ineq_histogram_selectivity(root, vardata, + &opproc, true, true, prefixcon->constvalue, prefixcon->consttype); @@ -5832,7 +5914,8 @@ prefix_selectivity(PlannerInfo *root, VariableStatData *vardata, { Selectivity topsel; - topsel = ineq_histogram_selectivity(root, vardata, &opproc, false, + topsel = ineq_histogram_selectivity(root, vardata, + &opproc, false, false, greaterstrcon->constvalue, greaterstrcon->consttype); |