aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/selfuncs.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2003-04-15 05:18:12 +0000
committerTom Lane <tgl@sss.pgh.pa.us>2003-04-15 05:18:12 +0000
commit5ab15591d9bb5274f2937d5c5524e1b90b5734ed (patch)
tree3e846af0c0eb88df7fb0ab606007b84cd3af874b /src/backend/utils/adt/selfuncs.c
parent49c3cf5fd1acf61520676f65ce468b088d75df68 (diff)
downloadpostgresql-5ab15591d9bb5274f2937d5c5524e1b90b5734ed.tar.gz
postgresql-5ab15591d9bb5274f2937d5c5524e1b90b5734ed.zip
eqjoinsel's logic for case where MCV lists are not present should
account for NULLs; in hindsight this is obvious since the code for the MCV-lists case would reduce to this when there are zero entries in both lists. Per example from Alec Mitchell.
Diffstat (limited to 'src/backend/utils/adt/selfuncs.c')
-rw-r--r--src/backend/utils/adt/selfuncs.c36
1 files changed, 21 insertions, 15 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 2a5ceb767f4..ca502aa448b 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.134 2003/03/23 05:14:36 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.135 2003/04/15 05:18:12 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1591,27 +1591,33 @@ eqjoinsel(PG_FUNCTION_ARGS)
{
/*
* We do not have MCV lists for both sides. Estimate the join
- * selectivity as MIN(1/nd1, 1/nd2). This is plausible if we
- * assume that the values are about equally distributed: a
- * given tuple of rel1 will join to either 0 or N2/nd2 rows of
- * rel2, so total join rows are at most N1*N2/nd2 giving a
- * join selectivity of not more than 1/nd2. By the same logic
- * it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper
- * bound. Using the MIN() means we estimate from the point of
- * view of the relation with smaller nd (since the larger nd
- * is determining the MIN). It is reasonable to assume that
- * most tuples in this rel will have join partners, so the
- * bound is probably reasonably tight and should be taken
- * as-is.
+ * selectivity as MIN(1/nd1,1/nd2)*(1-nullfrac1)*(1-nullfrac2).
+ * This is plausible if we assume that the join operator is
+ * strict and the non-null values are about equally distributed:
+ * a given non-null tuple of rel1 will join to either zero or
+ * N2*(1-nullfrac2)/nd2 rows of rel2, so total join rows are at
+ * most N1*(1-nullfrac1)*N2*(1-nullfrac2)/nd2 giving a join
+ * selectivity of not more than (1-nullfrac1)*(1-nullfrac2)/nd2.
+ * By the same logic it is not more than
+ * (1-nullfrac1)*(1-nullfrac2)/nd1, so the expression with MIN()
+ * is an upper bound. Using the MIN() means we estimate from the
+ * point of view of the relation with smaller nd (since the larger
+ * nd is determining the MIN). It is reasonable to assume that
+ * most tuples in this rel will have join partners, so the bound
+ * is probably reasonably tight and should be taken as-is.
*
* XXX Can we be smarter if we have an MCV list for just one
* side? It seems that if we assume equal distribution for the
* other side, we end up with the same answer anyway.
*/
+ double nullfrac1 = stats1->stanullfrac;
+ double nullfrac2 = stats2->stanullfrac;
+
+ selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
if (nd1 > nd2)
- selec = 1.0 / nd1;
+ selec /= nd1;
else
- selec = 1.0 / nd2;
+ selec /= nd2;
}
if (have_mcvs1)