aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Tom Lane <tgl@sss.pgh.pa.us> 2016-04-04 17:06:33 -0400
committer: Tom Lane <tgl@sss.pgh.pa.us> 2016-04-04 17:06:33 -0400
commit: 3c69b33f459f62fe6db66c386ef12620ea697f74 (patch)
tree: 26c427977d1af2e1c64b62c774376fb8d33b08aa
parent: 391159e03a8b69dd04a1432ceb800c7c4c3d608c (diff)
download: postgresql-3c69b33f459f62fe6db66c386ef12620ea697f74.tar.gz
postgresql-3c69b33f459f62fe6db66c386ef12620ea697f74.zip
Add a few comments about ANALYZE's strategy for collecting MCVs.
Alex Shulgin complained that the underlying strategy wasn't all that apparent, particularly not the fact that we intentionally have two code paths depending on whether we think the column has a limited set of possible values or not. Try to make it clearer.
-rw-r--r-- src/backend/commands/analyze.c 32
1 files changed, 30 insertions, 2 deletions
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 1283de03348..cf8c8164b7e 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -2055,7 +2055,11 @@ compute_distinct_stats(VacAttrStatsP stats,
/*
* Our track list includes every value in the sample, and every
* value appeared more than once. Assume the column has just
- * these values.
+ * these values. (This case is meant to address columns with
+ * small, fixed sets of possible values, such as boolean or enum
+ * columns. If there are any values that appear just once in the
+ * sample, including too-wide values, we should assume that that's
+ * not what we're dealing with.)
*/
stats->stadistinct = track_cnt;
}
@@ -2123,6 +2127,16 @@ compute_distinct_stats(VacAttrStatsP stats,
* significantly more common than the (estimated) average. We set the
* threshold rather arbitrarily at 25% more than average, with at
* least 2 instances in the sample.
+ *
+ * Note: the first of these cases is meant to address columns with
+ * small, fixed sets of possible values, such as boolean or enum
+ * columns. If we can *completely* represent the column population by
+ * an MCV list that will fit into the stats target, then we should do
+ * so and thus provide the planner with complete information. But if
+ * the MCV list is not complete, it's generally worth being more
+ * selective, and not just filling it all the way up to the stats
+ * target. So for an incomplete list, we try to take only MCVs that
+ * are significantly more common than average.
*/
if (track_cnt < track_max && toowide_cnt == 0 &&
stats->stadistinct > 0 &&
@@ -2416,7 +2430,11 @@ compute_scalar_stats(VacAttrStatsP stats,
{
/*
* Every value in the sample appeared more than once. Assume the
- * column has just these values.
+ * column has just these values. (This case is meant to address
+ * columns with small, fixed sets of possible values, such as
+ * boolean or enum columns. If there are any values that appear
+ * just once in the sample, including too-wide values, we should
+ * assume that that's not what we're dealing with.)
*/
stats->stadistinct = ndistinct;
}
@@ -2485,6 +2503,16 @@ compute_scalar_stats(VacAttrStatsP stats,
* emit duplicate histogram bin boundaries. (We might end up with
* duplicate histogram entries anyway, if the distribution is skewed;
* but we prefer to treat such values as MCVs if at all possible.)
+ *
+ * Note: the first of these cases is meant to address columns with
+ * small, fixed sets of possible values, such as boolean or enum
+ * columns. If we can *completely* represent the column population by
+ * an MCV list that will fit into the stats target, then we should do
+ * so and thus provide the planner with complete information. But if
+ * the MCV list is not complete, it's generally worth being more
+ * selective, and not just filling it all the way up to the stats
+ * target. So for an incomplete list, we try to take only MCVs that
+ * are significantly more common than average.
*/
if (track_cnt == ndistinct && toowide_cnt == 0 &&
stats->stadistinct > 0 &&