Diffstat (limited to 'src/backend/utils/adt/selfuncs.c')
-rw-r--r--  src/backend/utils/adt/selfuncs.c  |  395
1 file changed, 278 insertions(+), 117 deletions(-)
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index bd63377ada7..72c2c30ad40 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -201,6 +201,7 @@ static Selectivity regex_selectivity(const char *patt, int pattlen,
static Datum string_to_datum(const char *str, Oid datatype);
static Const *string_to_const(const char *str, Oid datatype);
static Const *string_to_bytea_const(const char *str, size_t str_len);
+static List *add_predicate_to_quals(IndexOptInfo *index, List *indexQuals);
/*
@@ -5916,76 +5917,55 @@ string_to_bytea_const(const char *str, size_t str_len)
*/
/*
- * If the index is partial, add its predicate to the given qual list.
+ * genericcostestimate is a general-purpose estimator that can be used for
+ * most index types. In some cases we use genericcostestimate as the base
+ * code and then incorporate additional index-type-specific knowledge in
+ * the type-specific calling function. To avoid code duplication, we make
+ * genericcostestimate return a number of intermediate values as well as
+ * its preliminary estimates of the output cost values. The GenericCosts
+ * struct includes all these values.
*
- * ANDing the index predicate with the explicitly given indexquals produces
- * a more accurate idea of the index's selectivity. However, we need to be
- * careful not to insert redundant clauses, because clauselist_selectivity()
- * is easily fooled into computing a too-low selectivity estimate. Our
- * approach is to add only the predicate clause(s) that cannot be proven to
- * be implied by the given indexquals. This successfully handles cases such
- * as a qual "x = 42" used with a partial index "WHERE x >= 40 AND x < 50".
- * There are many other cases where we won't detect redundancy, leading to a
- * too-low selectivity estimate, which will bias the system in favor of using
- * partial indexes where possible. That is not necessarily bad though.
- *
- * Note that indexQuals contains RestrictInfo nodes while the indpred
- * does not, so the output list will be mixed. This is OK for both
- * predicate_implied_by() and clauselist_selectivity(), but might be
- * problematic if the result were passed to other things.
+ * Callers should initialize all fields of GenericCosts to zero. In addition,
+ * they can set numIndexTuples to some positive value if they have a
+ * better-than-default way of estimating the number of leaf index tuples
+ * visited.
*/
-static List *
-add_predicate_to_quals(IndexOptInfo *index, List *indexQuals)
+typedef struct
{
- List *predExtraQuals = NIL;
- ListCell *lc;
-
- if (index->indpred == NIL)
- return indexQuals;
-
- foreach(lc, index->indpred)
- {
- Node *predQual = (Node *) lfirst(lc);
- List *oneQual = list_make1(predQual);
+ /* These are the values the cost estimator must return to the planner */
+ Cost indexStartupCost; /* index-related startup cost */
+ Cost indexTotalCost; /* total index-related scan cost */
+ Selectivity indexSelectivity; /* selectivity of index */
+ double indexCorrelation; /* order correlation of index */
+
+ /* Intermediate values we obtain along the way */
+ double numIndexPages; /* number of leaf pages visited */
+ double numIndexTuples; /* number of leaf tuples visited */
+ double spc_random_page_cost; /* relevant random_page_cost value */
+ double num_sa_scans; /* # indexscans from ScalarArrayOps */
+} GenericCosts;
- if (!predicate_implied_by(oneQual, indexQuals))
- predExtraQuals = list_concat(predExtraQuals, oneQual);
- }
- /* list_concat avoids modifying the passed-in indexQuals list */
- return list_concat(predExtraQuals, indexQuals);
-}
-
-/*
- * genericcostestimate is a general-purpose estimator for use when we
- * don't have any better idea about how to estimate. Index-type-specific
- * knowledge can be incorporated in the type-specific routines.
- *
- * One bit of index-type-specific knowledge we can relatively easily use
- * in genericcostestimate is the estimate of the number of index tuples
- * visited. If numIndexTuples is not 0 then it is used as the estimate,
- * otherwise we compute a generic estimate.
- */
static void
genericcostestimate(PlannerInfo *root,
IndexPath *path,
double loop_count,
- double numIndexTuples,
- Cost *indexStartupCost,
- Cost *indexTotalCost,
- Selectivity *indexSelectivity,
- double *indexCorrelation)
+ GenericCosts *costs)
{
IndexOptInfo *index = path->indexinfo;
List *indexQuals = path->indexquals;
List *indexOrderBys = path->indexorderbys;
+ Cost indexStartupCost;
+ Cost indexTotalCost;
+ Selectivity indexSelectivity;
+ double indexCorrelation;
double numIndexPages;
+ double numIndexTuples;
+ double spc_random_page_cost;
double num_sa_scans;
double num_outer_scans;
double num_scans;
QualCost index_qual_cost;
double qual_op_cost;
double qual_arg_cost;
- double spc_random_page_cost;
List *selectivityQuals;
ListCell *l;
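
(The calling convention the new struct implies can be seen in miniature below. This is a sketch distilled from the btcostestimate changes later in this patch, not additional patched code; it assumes the usual cost-estimator output pointers and local variables are in scope.)

    GenericCosts costs;

    /* zero all fields, then optionally seed a better-than-default estimate */
    MemSet(&costs, 0, sizeof(costs));
    costs.numIndexTuples = numIndexTuples;

    genericcostestimate(root, path, loop_count, &costs);

    /* ... add any index-type-specific charges to costs.indexStartupCost
       and costs.indexTotalCost here ... */

    *indexStartupCost = costs.indexStartupCost;
    *indexTotalCost = costs.indexTotalCost;
    *indexSelectivity = costs.indexSelectivity;
    *indexCorrelation = costs.indexCorrelation;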
@@ -6016,19 +5996,20 @@ genericcostestimate(PlannerInfo *root,
}
/* Estimate the fraction of main-table tuples that will be visited */
- *indexSelectivity = clauselist_selectivity(root, selectivityQuals,
- index->rel->relid,
- JOIN_INNER,
- NULL);
+ indexSelectivity = clauselist_selectivity(root, selectivityQuals,
+ index->rel->relid,
+ JOIN_INNER,
+ NULL);
/*
* If caller didn't give us an estimate, estimate the number of index
* tuples that will be visited. We do it in this rather peculiar-looking
* way in order to get the right answer for partial indexes.
*/
+ numIndexTuples = costs->numIndexTuples;
if (numIndexTuples <= 0.0)
{
- numIndexTuples = *indexSelectivity * index->rel->tuples;
+ numIndexTuples = indexSelectivity * index->rel->tuples;
/*
* The above calculation counts all the tuples visited across all
@@ -6055,9 +6036,12 @@ genericcostestimate(PlannerInfo *root,
*
* We use the simplistic method of taking a pro-rata fraction of the total
* number of index pages. In effect, this counts only leaf pages and not
- * any overhead such as index metapage or upper tree levels. In practice
- * this seems a better approximation than charging for access to the upper
- * levels, perhaps because those tend to stay in cache under load.
+ * any overhead such as index metapage or upper tree levels.
+ *
+ * In practice access to upper index levels is often nearly free because
+ * those tend to stay in cache under load; moreover, the cost involved is
+ * highly dependent on index type. We therefore ignore such costs here
+ * and leave it to the caller to add a suitable charge if needed.
*/
if (index->pages > 1 && index->tuples > 1)
numIndexPages = ceil(numIndexTuples * index->pages / index->tuples);
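
(To make the pro-rata estimate concrete, here is a worked example as a standalone C program; the index figures are invented for illustration.)

    #include <math.h>
    #include <stdio.h>

    int
    main(void)
    {
        double  index_pages = 1000.0;       /* hypothetical index pages */
        double  index_tuples = 100000.0;    /* hypothetical index tuples */
        double  numIndexTuples = 500.0;     /* tuples the scan will visit */

        /* pro-rata fraction of total index pages, as computed above */
        double  numIndexPages = ceil(numIndexTuples * index_pages / index_tuples);

        printf("leaf pages fetched: %g\n", numIndexPages);  /* prints 5 */
        return 0;
    }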
@@ -6107,7 +6091,7 @@ genericcostestimate(PlannerInfo *root,
* share for each outer scan. (Don't pro-rate for ScalarArrayOpExpr,
* since that's internal to the indexscan.)
*/
- *indexTotalCost = (pages_fetched * spc_random_page_cost)
+ indexTotalCost = (pages_fetched * spc_random_page_cost)
/ num_outer_scans;
}
else
@@ -6116,30 +6100,10 @@ genericcostestimate(PlannerInfo *root,
* For a single index scan, we just charge spc_random_page_cost per
* page touched.
*/
- *indexTotalCost = numIndexPages * spc_random_page_cost;
+ indexTotalCost = numIndexPages * spc_random_page_cost;
}
/*
- * A difficulty with the leaf-pages-only cost approach is that for small
- * selectivities (eg, single index tuple fetched) all indexes will look
- * equally attractive because we will estimate exactly 1 leaf page to be
- * fetched. All else being equal, we should prefer physically smaller
- * indexes over larger ones. (An index might be smaller because it is
- * partial or because it contains fewer columns; presumably the other
- * columns in the larger index aren't useful to the query, or the larger
- * index would have better selectivity.)
- *
- * We can deal with this by adding a very small "fudge factor" that
- * depends on the index size, so that indexes of different sizes won't
- * look exactly equally attractive. To ensure the fudge factor stays
- * small even for very large indexes, use a log function. (We previously
- * used a factor of one spc_random_page_cost per 10000 index pages, which
- * grew too large for large indexes. This expression has about the same
- * growth rate for small indexes, but tails off quickly.)
- */
- *indexTotalCost += log(1.0 + index->pages / 10000.0) * spc_random_page_cost;
-
- /*
* CPU cost: any complex expressions in the indexquals will need to be
* evaluated once at the start of the scan to reduce them to runtime keys
* to pass to the index AM (see nodeIndexscan.c). We model the per-tuple
@@ -6149,10 +6113,9 @@ genericcostestimate(PlannerInfo *root,
* for ScalarArrayOpExpr cases. Similarly add in costs for any index
* ORDER BY expressions.
*
- * Note: this neglects the possible costs of rechecking lossy operators
- * and OR-clause expressions. Detecting that that might be needed seems
- * more expensive than it's worth, though, considering all the other
- * inaccuracies here ...
+ * Note: this neglects the possible costs of rechecking lossy operators.
+ * Detecting that that might be needed seems more expensive than it's
+ * worth, though, considering all the other inaccuracies here ...
*/
cost_qual_eval(&index_qual_cost, indexQuals, root);
qual_arg_cost = index_qual_cost.startup + index_qual_cost.per_tuple;
@@ -6164,29 +6127,66 @@ genericcostestimate(PlannerInfo *root,
if (qual_arg_cost < 0) /* just in case... */
qual_arg_cost = 0;
- *indexStartupCost = qual_arg_cost;
- *indexTotalCost += qual_arg_cost;
- *indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost);
+ indexStartupCost = qual_arg_cost;
+ indexTotalCost += qual_arg_cost;
+ indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost);
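
(A worked instance of the per-tuple CPU charge just above, as a standalone C program. The constants are the default values of the cpu_index_tuple_cost and cpu_operator_cost GUCs; treating qual_op_cost as a single operator evaluation, with no ScalarArrayOpExpr arrays, is an assumption for illustration.)

    #include <stdio.h>

    int
    main(void)
    {
        double  numIndexTuples = 500.0;         /* leaf tuples visited */
        double  num_sa_scans = 1.0;             /* no ScalarArrayOpExpr arrays */
        double  cpu_index_tuple_cost = 0.005;   /* default GUC value */
        double  qual_op_cost = 0.0025;          /* one simple operator qual */

        /* same shape as the indexTotalCost increment above */
        double  cpu_charge = numIndexTuples * num_sa_scans *
                             (cpu_index_tuple_cost + qual_op_cost);

        printf("per-tuple CPU charge: %g\n", cpu_charge);   /* prints 3.75 */
        return 0;
    }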
/*
- * We also add a CPU-cost component to represent the general costs of
- * starting an indexscan, such as analysis of btree index keys and initial
- * tree descent. This is estimated at 100x cpu_operator_cost, which is a
- * bit arbitrary but seems the right order of magnitude. (As noted above,
- * we don't charge any I/O for touching upper tree levels, but charging
- * nothing at all has been found too optimistic.)
- *
- * Although this is startup cost with respect to any one scan, we add it
- * to the "total" cost component because it's only very interesting in the
- * many-ScalarArrayOpExpr-scan case, and there it will be paid over the
- * life of the scan node.
+ * Generic assumption about index correlation: there isn't any.
*/
- *indexTotalCost += num_sa_scans * 100.0 * cpu_operator_cost;
+ indexCorrelation = 0.0;
/*
- * Generic assumption about index correlation: there isn't any.
+ * Return everything to caller.
*/
- *indexCorrelation = 0.0;
+ costs->indexStartupCost = indexStartupCost;
+ costs->indexTotalCost = indexTotalCost;
+ costs->indexSelectivity = indexSelectivity;
+ costs->indexCorrelation = indexCorrelation;
+ costs->numIndexPages = numIndexPages;
+ costs->numIndexTuples = numIndexTuples;
+ costs->spc_random_page_cost = spc_random_page_cost;
+ costs->num_sa_scans = num_sa_scans;
+}
+
+/*
+ * If the index is partial, add its predicate to the given qual list.
+ *
+ * ANDing the index predicate with the explicitly given indexquals produces
+ * a more accurate idea of the index's selectivity. However, we need to be
+ * careful not to insert redundant clauses, because clauselist_selectivity()
+ * is easily fooled into computing a too-low selectivity estimate. Our
+ * approach is to add only the predicate clause(s) that cannot be proven to
+ * be implied by the given indexquals. This successfully handles cases such
+ * as a qual "x = 42" used with a partial index "WHERE x >= 40 AND x < 50".
+ * There are many other cases where we won't detect redundancy, leading to a
+ * too-low selectivity estimate, which will bias the system in favor of using
+ * partial indexes where possible. That is not necessarily bad though.
+ *
+ * Note that indexQuals contains RestrictInfo nodes while the indpred
+ * does not, so the output list will be mixed. This is OK for both
+ * predicate_implied_by() and clauselist_selectivity(), but might be
+ * problematic if the result were passed to other things.
+ */
+static List *
+add_predicate_to_quals(IndexOptInfo *index, List *indexQuals)
+{
+ List *predExtraQuals = NIL;
+ ListCell *lc;
+
+ if (index->indpred == NIL)
+ return indexQuals;
+
+ foreach(lc, index->indpred)
+ {
+ Node *predQual = (Node *) lfirst(lc);
+ List *oneQual = list_make1(predQual);
+
+ if (!predicate_implied_by(oneQual, indexQuals))
+ predExtraQuals = list_concat(predExtraQuals, oneQual);
+ }
+ /* list_concat avoids modifying the passed-in indexQuals list */
+ return list_concat(predExtraQuals, indexQuals);
}
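
(A conceptual trace of the "x = 42" example from the comment above; this is commentary, not patched code.)

    /*
     * indexQuals     = [ x = 42 ]              (RestrictInfo nodes)
     * index->indpred = [ x >= 40, x < 50 ]     (bare clauses)
     *
     * predicate_implied_by([x >= 40], [x = 42])  => true, clause skipped
     * predicate_implied_by([x < 50],  [x = 42])  => true, clause skipped
     *
     * result: [ x = 42 ]      (no redundant predicate clauses added, so
     *                          clauselist_selectivity sees the qual alone)
     */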
@@ -6201,10 +6201,12 @@ btcostestimate(PG_FUNCTION_ARGS)
Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
IndexOptInfo *index = path->indexinfo;
+ GenericCosts costs;
Oid relid;
AttrNumber colnum;
VariableStatData vardata;
double numIndexTuples;
+ Cost descentCost;
List *indexBoundQuals;
int indexcol;
bool eqQualHere;
@@ -6379,10 +6381,45 @@ btcostestimate(PG_FUNCTION_ARGS)
numIndexTuples = rint(numIndexTuples / num_sa_scans);
}
- genericcostestimate(root, path, loop_count,
- numIndexTuples,
- indexStartupCost, indexTotalCost,
- indexSelectivity, indexCorrelation);
+ /*
+ * Now do generic index cost estimation.
+ */
+ MemSet(&costs, 0, sizeof(costs));
+ costs.numIndexTuples = numIndexTuples;
+
+ genericcostestimate(root, path, loop_count, &costs);
+
+ /*
+ * Add a CPU-cost component to represent the costs of initial btree
+ * descent. We don't charge any I/O cost for touching upper btree levels,
+ * since they tend to stay in cache, but we still have to do about log2(N)
+ * comparisons to descend a btree of N leaf tuples. We charge one
+ * cpu_operator_cost per comparison.
+ *
+ * If there are ScalarArrayOpExprs, charge this once per SA scan. The
+ * ones after the first one are not startup cost so far as the overall
+ * plan is concerned, so add them only to "total" cost.
+ */
+ if (index->tuples > 1) /* avoid computing log(0) */
+ {
+ descentCost = ceil(log(index->tuples) / log(2.0)) * cpu_operator_cost;
+ costs.indexStartupCost += descentCost;
+ costs.indexTotalCost += costs.num_sa_scans * descentCost;
+ }
+
+ /*
+ * Even though we're not charging I/O cost for touching upper btree pages,
+ * it's still reasonable to charge some CPU cost per page descended
+ * through. Moreover, if we had no such charge at all, bloated indexes
+ * would appear to have the same search cost as unbloated ones, at least
+ * in cases where only a single leaf page is expected to be visited. This
+ * cost is somewhat arbitrarily set at 50x cpu_operator_cost per page
+ * touched. The number of such pages is btree tree height plus one (ie,
+ * we charge for the leaf page too). As above, charge once per SA scan.
+ */
+ descentCost = (index->tree_height + 1) * 50.0 * cpu_operator_cost;
+ costs.indexStartupCost += descentCost;
+ costs.indexTotalCost += costs.num_sa_scans * descentCost;
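
(A worked example of the two btree descent charges, as a standalone C program. The leaf tuple count and tree height are hypothetical; 0.0025 is the default cpu_operator_cost.)

    #include <math.h>
    #include <stdio.h>

    int
    main(void)
    {
        double  index_tuples = 1000000.0;   /* hypothetical leaf tuple count */
        int     tree_height = 2;            /* hypothetical btree height */
        double  cpu_operator_cost = 0.0025; /* default GUC value */
        double  descentCost;

        /* about log2(N) comparisons to descend to a leaf */
        descentCost = ceil(log(index_tuples) / log(2.0)) * cpu_operator_cost;
        printf("comparison charge: %g\n", descentCost);     /* 20 * 0.0025 = 0.05 */

        /* 50x cpu_operator_cost per page touched, leaf page included */
        descentCost = (tree_height + 1) * 50.0 * cpu_operator_cost;
        printf("per-page charge:   %g\n", descentCost);     /* 150 * 0.0025 = 0.375 */
        return 0;
    }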
/*
* If we can get an estimate of the first column's ordering correlation C
@@ -6478,9 +6515,9 @@ btcostestimate(PG_FUNCTION_ARGS)
varCorrelation = -varCorrelation;
if (index->ncolumns > 1)
- *indexCorrelation = varCorrelation * 0.75;
+ costs.indexCorrelation = varCorrelation * 0.75;
else
- *indexCorrelation = varCorrelation;
+ costs.indexCorrelation = varCorrelation;
free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers);
}
@@ -6488,6 +6525,11 @@ btcostestimate(PG_FUNCTION_ARGS)
ReleaseVariableStats(vardata);
+ *indexStartupCost = costs.indexStartupCost;
+ *indexTotalCost = costs.indexTotalCost;
+ *indexSelectivity = costs.indexSelectivity;
+ *indexCorrelation = costs.indexCorrelation;
+
PG_RETURN_VOID();
}
@@ -6501,10 +6543,41 @@ hashcostestimate(PG_FUNCTION_ARGS)
Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
+ GenericCosts costs;
+
+ MemSet(&costs, 0, sizeof(costs));
+
+ genericcostestimate(root, path, loop_count, &costs);
+
+ /*
+ * A hash index has no descent costs as such, since the index AM can go
+ * directly to the target bucket after computing the hash value. There
+ * are a couple of other hash-specific costs that we could conceivably add
+ * here, though:
+ *
+ * Ideally we'd charge spc_random_page_cost for each page in the target
+ * bucket, not just the numIndexPages pages that genericcostestimate
+ * thought we'd visit. However in most cases we don't know which bucket
+ * that will be. There's no point in considering the average bucket size
+ * because the hash AM makes sure that's always one page.
+ *
+ * Likewise, we could consider charging some CPU for each index tuple in
+ * the bucket, if we knew how many there were. But the per-tuple cost is
+ * just a hash value comparison, not a general datatype-dependent
+ * comparison, so any such charge ought to be quite a bit less than
+ * cpu_operator_cost, which makes it probably not worth worrying about.
+ *
+ * A bigger issue is that chance hash-value collisions will result in
+ * wasted probes into the heap. We don't currently attempt to model this
+ * cost on the grounds that it's rare, but maybe it's not rare enough.
+ * (Any fix for this ought to consider the generic lossy-operator problem,
+ * though; it's not entirely hash-specific.)
+ */
- genericcostestimate(root, path, loop_count, 0.0,
- indexStartupCost, indexTotalCost,
- indexSelectivity, indexCorrelation);
+ *indexStartupCost = costs.indexStartupCost;
+ *indexTotalCost = costs.indexTotalCost;
+ *indexSelectivity = costs.indexSelectivity;
+ *indexCorrelation = costs.indexCorrelation;
PG_RETURN_VOID();
}
@@ -6519,10 +6592,54 @@ gistcostestimate(PG_FUNCTION_ARGS)
Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
+ IndexOptInfo *index = path->indexinfo;
+ GenericCosts costs;
+ Cost descentCost;
+
+ MemSet(&costs, 0, sizeof(costs));
+
+ genericcostestimate(root, path, loop_count, &costs);
+
+ /*
+ * We model index descent costs similarly to those for btree, but to do
+ * that we first need an idea of the tree height. We somewhat arbitrarily
+ * assume that the fanout is 100, meaning the tree height is at most
+ * log100(index->pages).
+ *
+ * Although this computation isn't really expensive enough to require
+ * caching, we might as well use index->tree_height to cache it.
+ */
+ if (index->tree_height < 0) /* unknown? */
+ {
+ if (index->pages > 1) /* avoid computing log(0) */
+ index->tree_height = (int) (log(index->pages) / log(100.0));
+ else
+ index->tree_height = 0;
+ }
+
+ /*
+ * Add a CPU-cost component to represent the costs of initial descent.
+ * We just use log(N) here not log2(N) since the branching factor isn't
+ * necessarily two anyway. As for btree, charge once per SA scan.
+ */
+ if (index->tuples > 1) /* avoid computing log(0) */
+ {
+ descentCost = ceil(log(index->tuples)) * cpu_operator_cost;
+ costs.indexStartupCost += descentCost;
+ costs.indexTotalCost += costs.num_sa_scans * descentCost;
+ }
+
+ /*
+ * Likewise add a per-page charge, calculated the same as for btrees.
+ */
+ descentCost = (index->tree_height + 1) * 50.0 * cpu_operator_cost;
+ costs.indexStartupCost += descentCost;
+ costs.indexTotalCost += costs.num_sa_scans * descentCost;
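
(The fanout-100 height guess is easy to sanity-check. Here is a brief standalone C sketch over some hypothetical page counts, showing for instance that a 10,000-page index gets tree_height = (int) (log(10000) / log(100)) = 2.)

    #include <math.h>
    #include <stdio.h>

    int
    main(void)
    {
        double  pages[] = {1.0, 100.0, 10000.0, 1000000.0}; /* hypothetical */

        for (int i = 0; i < 4; i++)
        {
            int     tree_height;

            if (pages[i] > 1)   /* avoid computing log(0) */
                tree_height = (int) (log(pages[i]) / log(100.0));
            else
                tree_height = 0;
            printf("%8.0f pages -> height %d\n", pages[i], tree_height);
        }
        return 0;       /* prints heights 0, 1, 2, 3 */
    }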
- genericcostestimate(root, path, loop_count, 0.0,
- indexStartupCost, indexTotalCost,
- indexSelectivity, indexCorrelation);
+ *indexStartupCost = costs.indexStartupCost;
+ *indexTotalCost = costs.indexTotalCost;
+ *indexSelectivity = costs.indexSelectivity;
+ *indexCorrelation = costs.indexCorrelation;
PG_RETURN_VOID();
}
@@ -6537,10 +6654,54 @@ spgcostestimate(PG_FUNCTION_ARGS)
Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4);
Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5);
double *indexCorrelation = (double *) PG_GETARG_POINTER(6);
+ IndexOptInfo *index = path->indexinfo;
+ GenericCosts costs;
+ Cost descentCost;
+
+ MemSet(&costs, 0, sizeof(costs));
+
+ genericcostestimate(root, path, loop_count, &costs);
- genericcostestimate(root, path, loop_count, 0.0,
- indexStartupCost, indexTotalCost,
- indexSelectivity, indexCorrelation);
+ /*
+ * We model index descent costs similarly to those for btree, but to do
+ * that we first need an idea of the tree height. We somewhat arbitrarily
+ * assume that the fanout is 100, meaning the tree height is at most
+ * log100(index->pages).
+ *
+ * Although this computation isn't really expensive enough to require
+ * caching, we might as well use index->tree_height to cache it.
+ */
+ if (index->tree_height < 0) /* unknown? */
+ {
+ if (index->pages > 1) /* avoid computing log(0) */
+ index->tree_height = (int) (log(index->pages) / log(100.0));
+ else
+ index->tree_height = 0;
+ }
+
+ /*
+ * Add a CPU-cost component to represent the costs of initial descent.
+ * We just use log(N) here not log2(N) since the branching factor isn't
+ * necessarily two anyway. As for btree, charge once per SA scan.
+ */
+ if (index->tuples > 1) /* avoid computing log(0) */
+ {
+ descentCost = ceil(log(index->tuples)) * cpu_operator_cost;
+ costs.indexStartupCost += descentCost;
+ costs.indexTotalCost += costs.num_sa_scans * descentCost;
+ }
+
+ /*
+ * Likewise add a per-page charge, calculated the same as for btrees.
+ */
+ descentCost = (index->tree_height + 1) * 50.0 * cpu_operator_cost;
+ costs.indexStartupCost += descentCost;
+ costs.indexTotalCost += costs.num_sa_scans * descentCost;
+
+ *indexStartupCost = costs.indexStartupCost;
+ *indexTotalCost = costs.indexTotalCost;
+ *indexSelectivity = costs.indexSelectivity;
+ *indexCorrelation = costs.indexCorrelation;
PG_RETURN_VOID();
}