aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/selfuncs.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2016-08-07 18:52:02 -0400
committerTom Lane <tgl@sss.pgh.pa.us>2016-08-07 18:52:02 -0400
commit95bee941be4c009ebbc29162a0dc9664f40de12f (patch)
tree97557258e66828f2075dbf6281d416c53cde4621 /src/backend/utils/adt/selfuncs.c
parent8a8c6b53810026641a1e12f60f873a7bd3cea5e3 (diff)
downloadpostgresql-95bee941be4c009ebbc29162a0dc9664f40de12f.tar.gz
postgresql-95bee941be4c009ebbc29162a0dc9664f40de12f.zip
Fix misestimation of n_distinct for a nearly-unique column with many nulls.
If ANALYZE found no repeated non-null entries in its sample, it set the column's stadistinct value to -1.0, intending to indicate that the entries are all distinct. But what this value actually means is that the number of distinct values is 100% of the table's rowcount, and thus it was overestimating the number of distinct values by however many nulls there are. This could lead to very poor selectivity estimates, as for example in a recent report from Andreas Joseph Krogh. We should discount the stadistinct value by whatever we've estimated the nulls fraction to be. (That is what will happen if we choose to use a negative stadistinct for a column that does have repeated entries, so this code path was just inconsistent.) In addition to fixing the stadistinct entries stored by several different ANALYZE code paths, adjust the logic where get_variable_numdistinct() forces an "all distinct" estimate on the basis of finding a relevant unique index. Unique indexes don't reject nulls, so there's no reason to assume that the null fraction doesn't apply. Back-patch to all supported branches. Back-patching is a bit of a judgment call, but this problem seems to affect only a few users (else we'd have identified it long ago), and it's bad enough when it does happen that destabilizing plan choices in a worse direction seems unlikely. Patch by me, with documentation wording suggested by Dean Rasheed Report: <VisenaEmail.26.df42f82acae38a58.156463942b8@tc7-visena> Discussion: <16143.1470350371@sss.pgh.pa.us>
Diffstat (limited to 'src/backend/utils/adt/selfuncs.c')
-rw-r--r--src/backend/utils/adt/selfuncs.c12
1 files changed, 8 insertions, 4 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index cc2a9a1b6c5..56943f2a87a 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -4738,6 +4738,7 @@ double
get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
{
double stadistinct;
+ double stanullfrac = 0.0;
double ntuples;
*isdefault = false;
@@ -4745,7 +4746,8 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
/*
* Determine the stadistinct value to use. There are cases where we can
* get an estimate even without a pg_statistic entry, or can get a better
- * value than is in pg_statistic.
+ * value than is in pg_statistic. Grab stanullfrac too if we can find it
+ * (otherwise, assume no nulls, for lack of any better idea).
*/
if (HeapTupleIsValid(vardata->statsTuple))
{
@@ -4754,6 +4756,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
stadistinct = stats->stadistinct;
+ stanullfrac = stats->stanullfrac;
}
else if (vardata->vartype == BOOLOID)
{
@@ -4777,7 +4780,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
{
case ObjectIdAttributeNumber:
case SelfItemPointerAttributeNumber:
- stadistinct = -1.0; /* unique */
+ stadistinct = -1.0; /* unique (and all non null) */
break;
case TableOidAttributeNumber:
stadistinct = 1.0; /* only 1 value */
@@ -4799,10 +4802,11 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
* If there is a unique index or DISTINCT clause for the variable, assume
* it is unique no matter what pg_statistic says; the statistics could be
* out of date, or we might have found a partial unique index that proves
- * the var is unique for this query.
+ * the var is unique for this query. However, we'd better still believe
+ * the null-fraction statistic.
*/
if (vardata->isunique)
- stadistinct = -1.0;
+ stadistinct = -1.0 * (1.0 - stanullfrac);
/*
* If we had an absolute estimate, use that.