aboutsummaryrefslogtreecommitdiff
path: root/src/backend/commands/analyze.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/commands/analyze.c')
-rw-r--r-- src/backend/commands/analyze.c | 32
1 file changed, 30 insertions, 2 deletions
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 1283de03348..cf8c8164b7e 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -2055,7 +2055,11 @@ compute_distinct_stats(VacAttrStatsP stats,
/*
* Our track list includes every value in the sample, and every
* value appeared more than once. Assume the column has just
- * these values.
+ * these values. (This case is meant to address columns with
+ * small, fixed sets of possible values, such as boolean or enum
+ * columns. If there are any values that appear just once in the
+ * sample, including too-wide values, we should assume that that's
+ * not what we're dealing with.)
*/
stats->stadistinct = track_cnt;
}
@@ -2123,6 +2127,16 @@ compute_distinct_stats(VacAttrStatsP stats,
* significantly more common than the (estimated) average. We set the
* threshold rather arbitrarily at 25% more than average, with at
* least 2 instances in the sample.
+ *
+ * Note: the first of these cases is meant to address columns with
+ * small, fixed sets of possible values, such as boolean or enum
+ * columns. If we can *completely* represent the column population by
+ * an MCV list that will fit into the stats target, then we should do
+ * so and thus provide the planner with complete information. But if
+ * the MCV list is not complete, it's generally worth being more
+ * selective, and not just filling it all the way up to the stats
+ * target. So for an incomplete list, we try to take only MCVs that
+ * are significantly more common than average.
*/
if (track_cnt < track_max && toowide_cnt == 0 &&
stats->stadistinct > 0 &&
@@ -2416,7 +2430,11 @@ compute_scalar_stats(VacAttrStatsP stats,
{
/*
* Every value in the sample appeared more than once. Assume the
- * column has just these values.
+ * column has just these values. (This case is meant to address
+ * columns with small, fixed sets of possible values, such as
+ * boolean or enum columns. If there are any values that appear
+ * just once in the sample, including too-wide values, we should
+ * assume that that's not what we're dealing with.)
*/
stats->stadistinct = ndistinct;
}
@@ -2485,6 +2503,16 @@ compute_scalar_stats(VacAttrStatsP stats,
* emit duplicate histogram bin boundaries. (We might end up with
* duplicate histogram entries anyway, if the distribution is skewed;
* but we prefer to treat such values as MCVs if at all possible.)
+ *
+ * Note: the first of these cases is meant to address columns with
+ * small, fixed sets of possible values, such as boolean or enum
+ * columns. If we can *completely* represent the column population by
+ * an MCV list that will fit into the stats target, then we should do
+ * so and thus provide the planner with complete information. But if
+ * the MCV list is not complete, it's generally worth being more
+ * selective, and not just filling it all the way up to the stats
+ * target. So for an incomplete list, we try to take only MCVs that
+ * are significantly more common than average.
*/
if (track_cnt == ndistinct && toowide_cnt == 0 &&
stats->stadistinct > 0 &&