diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2008-08-16 00:01:38 +0000 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2008-08-16 00:01:38 +0000 |
commit | d4af2a64816ea4639c9614112b0c8a5a3cb4b522 (patch) | |
tree | eaa243adfc187eecc1a96598cba3cf4a1d7d448f /src/backend/utils/adt/selfuncs.c | |
parent | 118461114e14f4fac5103df173df192771b5befc (diff) | |
download | postgresql-d4af2a64816ea4639c9614112b0c8a5a3cb4b522.tar.gz postgresql-d4af2a64816ea4639c9614112b0c8a5a3cb4b522.zip |
Clean up the loose ends in selectivity estimation left by my patch for semi
and anti joins. To do this, pass the SpecialJoinInfo struct for the current
join as an additional optional argument to operator join selectivity
estimation functions. This allows the estimator to tell not only what kind
of join is being formed, but which variable is on which side of the join;
a requirement long recognized but not dealt with till now. This also leaves
the door open for future improvements in the estimators, such as accounting
for the null-insertion effects of lower outer joins. I didn't do anything
about that in the current patch but the information is in principle deducible
from what's passed.
The patch also clarifies the definition of join selectivity for semi/anti
joins: it's the fraction of the left input that has (at least one) match
in the right input. This allows getting rid of some very fuzzy thinking
that I had committed in the original 7.4-era IN-optimization patch.
There's probably room to estimate this better than the present patch does,
but at least we know what to estimate.
Since I had to touch CREATE OPERATOR anyway to allow a variant signature
for join estimator functions, I took the opportunity to add a couple of
additional checks that were missing, per my recent message to -hackers:
* Check that estimator functions return float8;
* Require execute permission at the time of CREATE OPERATOR on the
operator's function as well as the estimator functions;
* Require ownership of any pre-existing operator that's modified by
the command.
I also moved the lookup of the functions out of OperatorCreate() and
into operatorcmds.c, since that seemed more consistent with most of
the other catalog object creation processes, eg CREATE TYPE.
Diffstat (limited to 'src/backend/utils/adt/selfuncs.c')
-rw-r--r-- | src/backend/utils/adt/selfuncs.c | 400 |
1 files changed, 323 insertions, 77 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 7d5609d447e..d484221232c 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -15,7 +15,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.251 2008/08/14 18:47:59 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.252 2008/08/16 00:01:36 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -55,19 +55,34 @@ * * float8 oprrest (internal, oid, internal, int4); * + * The result is a selectivity, that is, a fraction (0 to 1) of the rows + * of the relation that are expected to produce a TRUE result for the + * given operator. + * * The call convention for a join estimator (oprjoin function) is similar - * except that varRelid is not needed, and instead the join type is + * except that varRelid is not needed, and instead join information is * supplied: * * Selectivity oprjoin (PlannerInfo *root, * Oid operator, * List *args, - * JoinType jointype); + * JoinType jointype, + * SpecialJoinInfo *sjinfo); + * + * float8 oprjoin (internal, oid, internal, int2, internal); * - * float8 oprjoin (internal, oid, internal, int2); + * (Before Postgres 8.4, join estimators had only the first four of these + * parameters. That signature is still allowed, but deprecated.) The + * relationship between jointype and sjinfo is explained in the comments for + * clause_selectivity() --- the short version is that jointype is usually + * best ignored in favor of examining sjinfo. * - * (We deliberately make the SQL signature different to facilitate - * catching errors.) + * Join selectivity for regular inner and outer joins is defined as the + * fraction (0 to 1) of the cross product of the relations that is expected + * to produce a TRUE result for the given operator. For both semi and anti + * joins, however, the selectivity is defined as the fraction of the left-hand + * side relation's rows that are expected to have a match (ie, at least one + * row with a TRUE result) in the right-hand side. *---------- */ @@ -113,6 +128,10 @@ static double var_eq_non_const(VariableStatData *vardata, Oid operator, static double ineq_histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc, bool isgt, Datum constval, Oid consttype); +static double eqjoinsel_inner(Oid operator, + VariableStatData *vardata1, VariableStatData *vardata2); +static double eqjoinsel_semi(Oid operator, + VariableStatData *vardata1, VariableStatData *vardata2); static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue, Datum lobound, Datum hibound, Oid boundstypid, double *scaledlobound, double *scaledhibound); @@ -1579,7 +1598,6 @@ scalararraysel(PlannerInfo *root, Oid nominal_element_type; RegProcedure oprsel; FmgrInfo oprselproc; - Datum selarg4; Selectivity s1; /* @@ -1587,15 +1605,9 @@ scalararraysel(PlannerInfo *root, * it hasn't got one. */ if (is_join_clause) - { oprsel = get_oprjoin(operator); - selarg4 = Int16GetDatum(jointype); - } else - { oprsel = get_oprrest(operator); - selarg4 = Int32GetDatum(varRelid); - } if (!oprsel) return (Selectivity) 0.5; fmgr_info(oprsel, &oprselproc); @@ -1660,11 +1672,19 @@ scalararraysel(PlannerInfo *root, elem_values[i], elem_nulls[i], elmbyval)); - s2 = DatumGetFloat8(FunctionCall4(&oprselproc, - PointerGetDatum(root), - ObjectIdGetDatum(operator), - PointerGetDatum(args), - selarg4)); + if (is_join_clause) + s2 = DatumGetFloat8(FunctionCall5(&oprselproc, + PointerGetDatum(root), + ObjectIdGetDatum(operator), + PointerGetDatum(args), + Int16GetDatum(jointype), + PointerGetDatum(sjinfo))); + else + s2 = DatumGetFloat8(FunctionCall4(&oprselproc, + PointerGetDatum(root), + ObjectIdGetDatum(operator), + PointerGetDatum(args), + Int32GetDatum(varRelid))); if (useOr) s1 = s1 + s2 - s1 * s2; else @@ -1694,11 +1714,19 @@ scalararraysel(PlannerInfo *root, * estimation function would really care ... */ args = list_make2(leftop, elem); - s2 = DatumGetFloat8(FunctionCall4(&oprselproc, - PointerGetDatum(root), - ObjectIdGetDatum(operator), - PointerGetDatum(args), - selarg4)); + if (is_join_clause) + s2 = DatumGetFloat8(FunctionCall5(&oprselproc, + PointerGetDatum(root), + ObjectIdGetDatum(operator), + PointerGetDatum(args), + Int16GetDatum(jointype), + PointerGetDatum(sjinfo))); + else + s2 = DatumGetFloat8(FunctionCall4(&oprselproc, + PointerGetDatum(root), + ObjectIdGetDatum(operator), + PointerGetDatum(args), + Int32GetDatum(varRelid))); if (useOr) s1 = s1 + s2 - s1 * s2; else @@ -1721,11 +1749,19 @@ scalararraysel(PlannerInfo *root, dummyexpr->typeId = nominal_element_type; dummyexpr->typeMod = -1; args = list_make2(leftop, dummyexpr); - s2 = DatumGetFloat8(FunctionCall4(&oprselproc, - PointerGetDatum(root), - ObjectIdGetDatum(operator), - PointerGetDatum(args), - selarg4)); + if (is_join_clause) + s2 = DatumGetFloat8(FunctionCall5(&oprselproc, + PointerGetDatum(root), + ObjectIdGetDatum(operator), + PointerGetDatum(args), + Int16GetDatum(jointype), + PointerGetDatum(sjinfo))); + else + s2 = DatumGetFloat8(FunctionCall4(&oprselproc, + PointerGetDatum(root), + ObjectIdGetDatum(operator), + PointerGetDatum(args), + Int32GetDatum(varRelid))); s1 = useOr ? 0.0 : 1.0; /* @@ -1803,13 +1839,24 @@ rowcomparesel(PlannerInfo *root, /* Build equivalent arg list for single operator */ opargs = list_make2(linitial(clause->largs), linitial(clause->rargs)); - /* Decide if it's a join clause, same as for OpExpr */ + /* + * Decide if it's a join clause. This should match clausesel.c's + * treat_as_join_clause(), except that we intentionally consider only + * the leading columns and not the rest of the clause. + */ if (varRelid != 0) { /* - * If we are considering a nestloop join then all clauses are - * restriction clauses, since we are only interested in the one - * relation. + * Caller is forcing restriction mode (eg, because we are examining + * an inner indexscan qual). + */ + is_join_clause = false; + } + else if (sjinfo == NULL) + { + /* + * It must be a restriction clause, since it's being evaluated at + * a scan node. */ is_join_clause = false; } @@ -1817,7 +1864,6 @@ rowcomparesel(PlannerInfo *root, { /* * Otherwise, it's a join if there's more than one relation used. - * Notice we ignore the low-order columns here. */ is_join_clause = (NumRelids((Node *) opargs) > 1); } @@ -1827,7 +1873,8 @@ rowcomparesel(PlannerInfo *root, /* Estimate selectivity for a join clause. */ s1 = join_selectivity(root, opno, opargs, - jointype); + jointype, + sjinfo); } else { @@ -1849,10 +1896,60 @@ eqjoinsel(PG_FUNCTION_ARGS) PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); Oid operator = PG_GETARG_OID(1); List *args = (List *) PG_GETARG_POINTER(2); +#ifdef NOT_USED JoinType jointype = (JoinType) PG_GETARG_INT16(3); +#endif + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); double selec; VariableStatData vardata1; VariableStatData vardata2; + bool join_is_reversed; + + get_join_variables(root, args, sjinfo, + &vardata1, &vardata2, &join_is_reversed); + + switch (sjinfo->jointype) + { + case JOIN_INNER: + case JOIN_LEFT: + case JOIN_FULL: + selec = eqjoinsel_inner(operator, &vardata1, &vardata2); + break; + case JOIN_SEMI: + case JOIN_ANTI: + if (!join_is_reversed) + selec = eqjoinsel_semi(operator, &vardata1, &vardata2); + else + selec = eqjoinsel_semi(get_commutator(operator), + &vardata2, &vardata1); + break; + default: + /* other values not expected here */ + elog(ERROR, "unrecognized join type: %d", + (int) sjinfo->jointype); + selec = 0; /* keep compiler quiet */ + break; + } + + ReleaseVariableStats(vardata1); + ReleaseVariableStats(vardata2); + + CLAMP_PROBABILITY(selec); + + PG_RETURN_FLOAT8((float8) selec); +} + +/* + * eqjoinsel_inner --- eqjoinsel for normal inner join + * + * We also use this for LEFT/FULL outer joins; it's not presently clear + * that it's worth trying to distinguish them here. + */ +static double +eqjoinsel_inner(Oid operator, + VariableStatData *vardata1, VariableStatData *vardata2) +{ + double selec; double nd1; double nd2; Form_pg_statistic stats1 = NULL; @@ -1868,29 +1965,27 @@ eqjoinsel(PG_FUNCTION_ARGS) float4 *numbers2 = NULL; int nnumbers2 = 0; - get_join_variables(root, args, &vardata1, &vardata2); - - nd1 = get_variable_numdistinct(&vardata1); - nd2 = get_variable_numdistinct(&vardata2); + nd1 = get_variable_numdistinct(vardata1); + nd2 = get_variable_numdistinct(vardata2); - if (HeapTupleIsValid(vardata1.statsTuple)) + if (HeapTupleIsValid(vardata1->statsTuple)) { - stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple); - have_mcvs1 = get_attstatsslot(vardata1.statsTuple, - vardata1.atttype, - vardata1.atttypmod, + stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple); + have_mcvs1 = get_attstatsslot(vardata1->statsTuple, + vardata1->atttype, + vardata1->atttypmod, STATISTIC_KIND_MCV, InvalidOid, &values1, &nvalues1, &numbers1, &nnumbers1); } - if (HeapTupleIsValid(vardata2.statsTuple)) + if (HeapTupleIsValid(vardata2->statsTuple)) { - stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple); - have_mcvs2 = get_attstatsslot(vardata2.statsTuple, - vardata2.atttype, - vardata2.atttypmod, + stats2 = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple); + have_mcvs2 = get_attstatsslot(vardata2->statsTuple, + vardata2->atttype, + vardata2->atttypmod, STATISTIC_KIND_MCV, InvalidOid, &values2, &nvalues2, @@ -1933,25 +2028,6 @@ eqjoinsel(PG_FUNCTION_ARGS) hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool)); /* - * If we are doing any variant of JOIN_SEMI, pretend all the values of - * the righthand relation are unique (ie, act as if it's been - * DISTINCT'd). - * - * NOTE: it would be dangerous to try to be smart about JOIN_LEFT or - * JOIN_RIGHT here, because we do not have enough information to - * determine which var is really on which side of the join. Perhaps - * someday we should pass in more information. - */ - if (jointype == JOIN_SEMI) - { - float4 oneovern = 1.0 / nd2; - - for (i = 0; i < nvalues2; i++) - numbers2[i] = oneovern; - nullfrac2 = oneovern; - } - - /* * Note we assume that each MCV will match at most one member of the * other MCV list. If the operator isn't really equality, there could * be multiple matches --- but we don't look for them, both for speed @@ -2075,18 +2151,170 @@ eqjoinsel(PG_FUNCTION_ARGS) } if (have_mcvs1) - free_attstatsslot(vardata1.atttype, values1, nvalues1, + free_attstatsslot(vardata1->atttype, values1, nvalues1, numbers1, nnumbers1); if (have_mcvs2) - free_attstatsslot(vardata2.atttype, values2, nvalues2, + free_attstatsslot(vardata2->atttype, values2, nvalues2, numbers2, nnumbers2); - ReleaseVariableStats(vardata1); - ReleaseVariableStats(vardata2); + return selec; +} - CLAMP_PROBABILITY(selec); +/* + * eqjoinsel_semi --- eqjoinsel for semi join + * + * (Also used for anti join, which we are supposed to estimate the same way.) + * Caller has ensured that vardata1 is the LHS variable. + */ +static double +eqjoinsel_semi(Oid operator, + VariableStatData *vardata1, VariableStatData *vardata2) +{ + double selec; + double nd1; + double nd2; + Form_pg_statistic stats1 = NULL; + Form_pg_statistic stats2 = NULL; + bool have_mcvs1 = false; + Datum *values1 = NULL; + int nvalues1 = 0; + float4 *numbers1 = NULL; + int nnumbers1 = 0; + bool have_mcvs2 = false; + Datum *values2 = NULL; + int nvalues2 = 0; + float4 *numbers2 = NULL; + int nnumbers2 = 0; - PG_RETURN_FLOAT8((float8) selec); + nd1 = get_variable_numdistinct(vardata1); + nd2 = get_variable_numdistinct(vardata2); + + if (HeapTupleIsValid(vardata1->statsTuple)) + { + stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple); + have_mcvs1 = get_attstatsslot(vardata1->statsTuple, + vardata1->atttype, + vardata1->atttypmod, + STATISTIC_KIND_MCV, + InvalidOid, + &values1, &nvalues1, + &numbers1, &nnumbers1); + } + + if (HeapTupleIsValid(vardata2->statsTuple)) + { + stats2 = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple); + have_mcvs2 = get_attstatsslot(vardata2->statsTuple, + vardata2->atttype, + vardata2->atttypmod, + STATISTIC_KIND_MCV, + InvalidOid, + &values2, &nvalues2, + &numbers2, &nnumbers2); + } + + if (have_mcvs1 && have_mcvs2 && OidIsValid(operator)) + { + /* + * We have most-common-value lists for both relations. Run through + * the lists to see which MCVs actually join to each other with the + * given operator. This allows us to determine the exact join + * selectivity for the portion of the relations represented by the MCV + * lists. We still have to estimate for the remaining population, but + * in a skewed distribution this gives us a big leg up in accuracy. + */ + FmgrInfo eqproc; + bool *hasmatch1; + bool *hasmatch2; + double nullfrac1 = stats1->stanullfrac; + double matchfreq1; + int i, + nmatches; + + fmgr_info(get_opcode(operator), &eqproc); + hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool)); + hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool)); + + /* + * Note we assume that each MCV will match at most one member of the + * other MCV list. If the operator isn't really equality, there could + * be multiple matches --- but we don't look for them, both for speed + * and because the math wouldn't add up... + */ + nmatches = 0; + for (i = 0; i < nvalues1; i++) + { + int j; + + for (j = 0; j < nvalues2; j++) + { + if (hasmatch2[j]) + continue; + if (DatumGetBool(FunctionCall2(&eqproc, + values1[i], + values2[j]))) + { + hasmatch1[i] = hasmatch2[j] = true; + nmatches++; + break; + } + } + } + /* Sum up frequencies of matched MCVs */ + matchfreq1 = 0.0; + for (i = 0; i < nvalues1; i++) + { + if (hasmatch1[i]) + matchfreq1 += numbers1[i]; + } + CLAMP_PROBABILITY(matchfreq1); + pfree(hasmatch1); + pfree(hasmatch2); + + /* + * Now we need to estimate the fraction of relation 1 that has at + * least one join partner. We know for certain that the matched + * MCVs do, so that gives us a lower bound, but we're really in the + * dark about everything else. Our crude approach is: if nd1 <= nd2 + * then assume all non-null rel1 rows have join partners, else assume + * for the uncertain rows that a fraction nd2/nd1 have join partners. + * We can discount the known-matched MCVs from the distinct-values + * counts before doing the division. + */ + nd1 -= nmatches; + nd2 -= nmatches; + if (nd1 <= nd2 || nd2 <= 0) + selec = Max(matchfreq1, 1.0 - nullfrac1); + else + { + double uncertain = 1.0 - matchfreq1 - nullfrac1; + + CLAMP_PROBABILITY(uncertain); + selec = matchfreq1 + (nd2/nd1) * uncertain; + } + } + else + { + /* + * Without MCV lists for both sides, we can only use the heuristic + * about nd1 vs nd2. + */ + double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0; + + if (nd1 <= nd2 || nd2 <= 0) + selec = 1.0 - nullfrac1; + else + selec = (nd2/nd1) * (1.0 - nullfrac1); + } + + if (have_mcvs1) + free_attstatsslot(vardata1->atttype, values1, nvalues1, + numbers1, nnumbers1); + if (have_mcvs2) + free_attstatsslot(vardata2->atttype, values2, nvalues2, + numbers2, nnumbers2); + + return selec; } /* @@ -2099,6 +2327,7 @@ neqjoinsel(PG_FUNCTION_ARGS) Oid operator = PG_GETARG_OID(1); List *args = (List *) PG_GETARG_POINTER(2); JoinType jointype = (JoinType) PG_GETARG_INT16(3); + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); Oid eqop; float8 result; @@ -2109,11 +2338,12 @@ neqjoinsel(PG_FUNCTION_ARGS) eqop = get_negator(operator); if (eqop) { - result = DatumGetFloat8(DirectFunctionCall4(eqjoinsel, + result = DatumGetFloat8(DirectFunctionCall5(eqjoinsel, PointerGetDatum(root), ObjectIdGetDatum(eqop), PointerGetDatum(args), - Int16GetDatum(jointype))); + Int16GetDatum(jointype), + PointerGetDatum(sjinfo))); } else { @@ -3661,10 +3891,17 @@ get_restriction_variable(PlannerInfo *root, List *args, int varRelid, /* * get_join_variables * Apply examine_variable() to each side of a join clause. + * Also, attempt to identify whether the join clause has the same + * or reversed sense compared to the SpecialJoinInfo. + * + * We consider the join clause "normal" if it is "lhs_var OP rhs_var", + * or "reversed" if it is "rhs_var OP lhs_var". In complicated cases + * where we can't tell for sure, we default to assuming it's normal. */ void -get_join_variables(PlannerInfo *root, List *args, - VariableStatData *vardata1, VariableStatData *vardata2) +get_join_variables(PlannerInfo *root, List *args, SpecialJoinInfo *sjinfo, + VariableStatData *vardata1, VariableStatData *vardata2, + bool *join_is_reversed) { Node *left, *right; @@ -3677,6 +3914,15 @@ get_join_variables(PlannerInfo *root, List *args, examine_variable(root, left, 0, vardata1); examine_variable(root, right, 0, vardata2); + + if (vardata1->rel && + bms_is_subset(vardata1->rel->relids, sjinfo->syn_righthand)) + *join_is_reversed = true; /* var1 is on RHS */ + else if (vardata2->rel && + bms_is_subset(vardata2->rel->relids, sjinfo->syn_lefthand)) + *join_is_reversed = true; /* var2 is on LHS */ + else + *join_is_reversed = false; } /* |