aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/utils/adt/selfuncs.c242
1 files changed, 201 insertions, 41 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 07c4da115f5..1c9b3c60b9d 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.90 2001/05/20 20:28:19 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.91 2001/05/27 17:37:48 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -940,9 +940,7 @@ Datum
eqjoinsel(PG_FUNCTION_ARGS)
{
Query *root = (Query *) PG_GETARG_POINTER(0);
-#ifdef NOT_USED /* see neqjoinsel() before removing me! */
Oid operator = PG_GETARG_OID(1);
-#endif
List *args = (List *) PG_GETARG_POINTER(2);
Var *var1;
Var *var2;
@@ -958,73 +956,219 @@ eqjoinsel(PG_FUNCTION_ARGS)
HeapTuple statsTuple2 = NULL;
Form_pg_statistic stats1 = NULL;
Form_pg_statistic stats2 = NULL;
- double nd1,
- nd2;
-
- if (var1 == NULL)
- {
- nd1 = DEFAULT_NUM_DISTINCT;
- }
- else
+ double nd1 = DEFAULT_NUM_DISTINCT;
+ double nd2 = DEFAULT_NUM_DISTINCT;
+ bool have_mcvs1 = false;
+ Datum *values1 = NULL;
+ int nvalues1 = 0;
+ float4 *numbers1 = NULL;
+ int nnumbers1 = 0;
+ bool have_mcvs2 = false;
+ Datum *values2 = NULL;
+ int nvalues2 = 0;
+ float4 *numbers2 = NULL;
+ int nnumbers2 = 0;
+
+ if (var1 != NULL)
{
/* get stats for the attribute, if available */
Oid relid1 = getrelid(var1->varno, root->rtable);
- if (relid1 == InvalidOid)
- nd1 = DEFAULT_NUM_DISTINCT;
- else
+ if (relid1 != InvalidOid)
{
statsTuple1 = SearchSysCache(STATRELATT,
ObjectIdGetDatum(relid1),
Int16GetDatum(var1->varattno),
0, 0);
if (HeapTupleIsValid(statsTuple1))
+ {
stats1 = (Form_pg_statistic) GETSTRUCT(statsTuple1);
+ have_mcvs1 = get_attstatsslot(statsTuple1,
+ var1->vartype,
+ var1->vartypmod,
+ STATISTIC_KIND_MCV,
+ InvalidOid,
+ &values1, &nvalues1,
+ &numbers1, &nnumbers1);
+ }
nd1 = get_att_numdistinct(root, var1, stats1);
}
}
- if (var2 == NULL)
- {
- nd2 = DEFAULT_NUM_DISTINCT;
- }
- else
+ if (var2 != NULL)
{
/* get stats for the attribute, if available */
Oid relid2 = getrelid(var2->varno, root->rtable);
- if (relid2 == InvalidOid)
- nd2 = DEFAULT_NUM_DISTINCT;
- else
+ if (relid2 != InvalidOid)
{
statsTuple2 = SearchSysCache(STATRELATT,
ObjectIdGetDatum(relid2),
Int16GetDatum(var2->varattno),
0, 0);
if (HeapTupleIsValid(statsTuple2))
+ {
stats2 = (Form_pg_statistic) GETSTRUCT(statsTuple2);
+ have_mcvs2 = get_attstatsslot(statsTuple2,
+ var2->vartype,
+ var2->vartypmod,
+ STATISTIC_KIND_MCV,
+ InvalidOid,
+ &values2, &nvalues2,
+ &numbers2, &nnumbers2);
+ }
nd2 = get_att_numdistinct(root, var2, stats2);
}
}
- /*
- * Estimate the join selectivity as 1 / sqrt(nd1*nd2)
- * (can we produce any theory for this)?
- *
- * XXX possibility to do better: if both attributes have histograms
- * then we could determine the exact join selectivity between the
- * MCV sets, and only have to assume the join behavior of the non-MCV
- * values. This could be a big win when the MCVs cover a large part
- * of the population.
- *
- * XXX what about nulls?
- */
- selec = 1.0 / sqrt(nd1 * nd2);
- if (selec > 1.0)
- selec = 1.0;
+ if (have_mcvs1 && have_mcvs2)
+ {
+ /*
+ * We have most-common-value lists for both relations. Run
+ * through the lists to see which MCVs actually join to each
+ * other with the given operator. This allows us to determine
+ * the exact join selectivity for the portion of the relations
+ * represented by the MCV lists. We still have to estimate for
+ * the remaining population, but in a skewed distribution this
+ * gives us a big leg up in accuracy. For motivation see the
+ * analysis in Y. Ioannidis and S. Christodoulakis, "On the
+ * propagation of errors in the size of join results", Technical
+ * Report 1018, Computer Science Dept., University of Wisconsin,
+ * Madison, March 1991 (available from ftp.cs.wisc.edu).
+ */
+ FmgrInfo eqproc;
+ bool *hasmatch1;
+ bool *hasmatch2;
+ double matchprodfreq,
+ matchfreq1,
+ matchfreq2,
+ unmatchfreq1,
+ unmatchfreq2,
+ otherfreq1,
+ otherfreq2,
+ totalsel1,
+ totalsel2;
+ int i,
+ nmatches;
+
+ fmgr_info(get_opcode(operator), &eqproc);
+ hasmatch1 = (bool *) palloc(nvalues1 * sizeof(bool));
+ memset(hasmatch1, 0, nvalues1 * sizeof(bool));
+ hasmatch2 = (bool *) palloc(nvalues2 * sizeof(bool));
+ memset(hasmatch2, 0, nvalues2 * sizeof(bool));
+ /*
+ * Note we assume that each MCV will match at most one member of
+ * the other MCV list. If the operator isn't really equality,
+ * there could be multiple matches --- but we don't look for them,
+ * both for speed and because the math wouldn't add up...
+ */
+ matchprodfreq = 0.0;
+ nmatches = 0;
+ for (i = 0; i < nvalues1; i++)
+ {
+ int j;
+ for (j = 0; j < nvalues2; j++)
+ {
+ if (hasmatch2[j])
+ continue;
+ if (DatumGetBool(FunctionCall2(&eqproc,
+ values1[i],
+ values2[j])))
+ {
+ hasmatch1[i] = hasmatch2[j] = true;
+ matchprodfreq += numbers1[i] * numbers2[j];
+ nmatches++;
+ break;
+ }
+ }
+ }
+ /* Sum up frequencies of matched and unmatched MCVs */
+ matchfreq1 = unmatchfreq1 = 0.0;
+ for (i = 0; i < nvalues1; i++)
+ {
+ if (hasmatch1[i])
+ matchfreq1 += numbers1[i];
+ else
+ unmatchfreq1 += numbers1[i];
+ }
+ matchfreq2 = unmatchfreq2 = 0.0;
+ for (i = 0; i < nvalues2; i++)
+ {
+ if (hasmatch2[i])
+ matchfreq2 += numbers2[i];
+ else
+ unmatchfreq2 += numbers2[i];
+ }
+ pfree(hasmatch1);
+ pfree(hasmatch2);
+ /*
+ * Compute total frequency of non-null values that are not in
+ * the MCV lists.
+ */
+ otherfreq1 = 1.0 - stats1->stanullfrac - matchfreq1 - unmatchfreq1;
+ otherfreq2 = 1.0 - stats2->stanullfrac - matchfreq2 - unmatchfreq2;
+ /*
+ * We can estimate the total selectivity from the point of view
+ * of relation 1 as: the known selectivity for matched MCVs, plus
+ * unmatched MCVs that are assumed to match against random members
+ * of relation 2's non-MCV population, plus non-MCV values that
+ * are assumed to match against random members of relation 2's
+ * unmatched MCVs plus non-MCV values.
+ */
+ totalsel1 = matchprodfreq;
+ if (nd2 > nvalues2)
+ totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
+ if (nd2 > nmatches)
+ totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
+ (nd2 - nmatches);
+ /* Same estimate from the point of view of relation 2. */
+ totalsel2 = matchprodfreq;
+ if (nd1 > nvalues1)
+ totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
+ if (nd1 > nmatches)
+ totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
+ (nd1 - nmatches);
+ /*
+ * For robustness, we average the two estimates. (Can a case
+ * be made for taking the min or max instead?)
+ */
+ selec = (totalsel1 + totalsel2) * 0.5;
+ }
+ else
+ {
+ /*
+ * We do not have MCV lists for both sides. Estimate the
+ * join selectivity as MIN(1/nd1, 1/nd2). This is plausible
+ * if we assume that the values are about equally distributed:
+ * a given tuple of rel1 will join to either 0 or N2/nd2 rows
+ * of rel2, so total join rows are at most N1*N2/nd2 giving
+ * a join selectivity of not more than 1/nd2. By the same logic
+ * it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper
+ * bound. Using the MIN() means we estimate from the point of
+ * view of the relation with smaller nd (since the larger nd is
+ * determining the MIN). It is reasonable to assume that most
+ * tuples in this rel will have join partners, so the bound is
+ * probably reasonably tight and should be taken as-is.
+ *
+ * XXX Can we be smarter if we have an MCV list for just one side?
+ * It seems that if we assume equal distribution for the other
+ * side, we end up with the same answer anyway.
+ */
+ if (nd1 > nd2)
+ selec = 1.0 / nd1;
+ else
+ selec = 1.0 / nd2;
+ }
+
+ if (have_mcvs1)
+ free_attstatsslot(var1->vartype, values1, nvalues1,
+ numbers1, nnumbers1);
+ if (have_mcvs2)
+ free_attstatsslot(var2->vartype, values2, nvalues2,
+ numbers2, nnumbers2);
if (HeapTupleIsValid(statsTuple1))
ReleaseSysCache(statsTuple1);
if (HeapTupleIsValid(statsTuple2))
@@ -1039,14 +1183,30 @@ eqjoinsel(PG_FUNCTION_ARGS)
Datum
neqjoinsel(PG_FUNCTION_ARGS)
{
+ Query *root = (Query *) PG_GETARG_POINTER(0);
+ Oid operator = PG_GETARG_OID(1);
+ List *args = (List *) PG_GETARG_POINTER(2);
+ Oid eqop;
float8 result;
/*
- * XXX we skip looking up the negator operator here because we know
- * eqjoinsel() won't look at it anyway. If eqjoinsel() ever does
- * look, this routine will need to look more like neqsel() does.
+ * We want 1 - eqjoinsel() where the equality operator is the one
+ * associated with this != operator, that is, its negator.
*/
- result = DatumGetFloat8(eqjoinsel(fcinfo));
+ eqop = get_negator(operator);
+ if (eqop)
+ {
+ result = DatumGetFloat8(DirectFunctionCall3(eqjoinsel,
+ PointerGetDatum(root),
+ ObjectIdGetDatum(eqop),
+ PointerGetDatum(args)));
+
+ }
+ else
+ {
+ /* Use default selectivity (should we raise an error instead?) */
+ result = DEFAULT_EQ_SEL;
+ }
result = 1.0 - result;
PG_RETURN_FLOAT8(result);
}