aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/commands/analyze.c66
1 files changed, 45 insertions, 21 deletions
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index e6d9a574e2b..0dc75cb4d4b 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/commands/analyze.c,v 1.25 2002/01/06 00:37:44 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/commands/analyze.c,v 1.26 2002/02/18 16:04:14 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1009,10 +1009,15 @@ compute_minimal_stats(VacAttrStats *stats,
{
/*----------
* Estimate the number of distinct values using the estimator
- * proposed by Chaudhuri et al (see citation above). This is
- * sqrt(n/r) * max(f1,1) + f2 + f3 + ...
- * where fk is the number of distinct values that occurred
- * exactly k times in our sample of r rows (from a total of n).
+ * proposed by Haas and Stokes in IBM Research Report RJ 10025:
+ * n*d / (n - f1 + f1*n/N)
+ * where f1 is the number of distinct values that occurred
+ * exactly once in our sample of n rows (from a total of N),
+ * and d is the total number of distinct values in the sample.
+ * This is their Duj1 estimator; the other estimators they
+ * recommend are considerably more complex, and are numerically
+ * very unstable when n is much smaller than N.
+ *
* We assume (not very reliably!) that all the multiply-occurring
* values are reflected in the final track[] list, and the other
* nonnull values all appeared but once. (XXX this usually
@@ -1021,12 +1026,19 @@ compute_minimal_stats(VacAttrStats *stats,
*----------
*/
int f1 = nonnull_cnt - summultiple;
- double term1;
-
- if (f1 < 1)
- f1 = 1;
- term1 = sqrt(totalrows / (double) numrows) * f1;
- stats->stadistinct = floor(term1 + nmultiple + 0.5);
+ int d = f1 + nmultiple;
+ double numer, denom, stadistinct;
+
+ numer = (double) numrows * (double) d;
+ denom = (double) (numrows - f1) +
+ (double) f1 * (double) numrows / totalrows;
+ stadistinct = numer / denom;
+ /* Clamp to sane range in case of roundoff error */
+ if (stadistinct < (double) d)
+ stadistinct = (double) d;
+ if (stadistinct > totalrows)
+ stadistinct = totalrows;
+ stats->stadistinct = floor(stadistinct + 0.5);
}
/*
@@ -1313,20 +1325,32 @@ compute_scalar_stats(VacAttrStats *stats,
{
/*----------
* Estimate the number of distinct values using the estimator
- * proposed by Chaudhuri et al (see citation above). This is
- * sqrt(n/r) * max(f1,1) + f2 + f3 + ...
- * where fk is the number of distinct values that occurred
- * exactly k times in our sample of r rows (from a total of n).
+ * proposed by Haas and Stokes in IBM Research Report RJ 10025:
+ * n*d / (n - f1 + f1*n/N)
+ * where f1 is the number of distinct values that occurred
+ * exactly once in our sample of n rows (from a total of N),
+ * and d is the total number of distinct values in the sample.
+ * This is their Duj1 estimator; the other estimators they
+ * recommend are considerably more complex, and are numerically
+ * very unstable when n is much smaller than N.
+ *
* Overwidth values are assumed to have been distinct.
*----------
*/
int f1 = ndistinct - nmultiple + toowide_cnt;
- double term1;
-
- if (f1 < 1)
- f1 = 1;
- term1 = sqrt(totalrows / (double) numrows) * f1;
- stats->stadistinct = floor(term1 + nmultiple + 0.5);
+ int d = f1 + nmultiple;
+ double numer, denom, stadistinct;
+
+ numer = (double) numrows * (double) d;
+ denom = (double) (numrows - f1) +
+ (double) f1 * (double) numrows / totalrows;
+ stadistinct = numer / denom;
+ /* Clamp to sane range in case of roundoff error */
+ if (stadistinct < (double) d)
+ stadistinct = (double) d;
+ if (stadistinct > totalrows)
+ stadistinct = totalrows;
+ stats->stadistinct = floor(stadistinct + 0.5);
}
/*