Diffstat (limited to 'src/backend')
-rw-r--r-- | src/backend/optimizer/path/costsize.c   | 223
-rw-r--r-- | src/backend/optimizer/path/indxpath.c   |  86
-rw-r--r-- | src/backend/optimizer/path/joinpath.c   |  16
-rw-r--r-- | src/backend/optimizer/path/orindxpath.c |   6
-rw-r--r-- | src/backend/optimizer/plan/planagg.c    |   4
-rw-r--r-- | src/backend/optimizer/util/pathnode.c   |  21
-rw-r--r-- | src/backend/utils/adt/selfuncs.c        | 108
7 files changed, 287 insertions, 177 deletions
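For reference while reading the costsize.c hunks below: the central addition in this commit is index_pages_fetched(), a cache-aware estimate of heap pages fetched. The following is a minimal standalone sketch of that Mackert-Lohman computation, not the committed function itself; the name ml_pages_fetched is illustrative, and effective_cache_size is passed as an explicit parameter here rather than read from the planner GUC, purely to keep the example self-contained.

    #include <math.h>

    /*
     * Sketch of the Mackert-Lohman estimate factored out as
     * index_pages_fetched() in this commit.  T = pages in the table,
     * b = buffer pages available to this table, tuples_fetched = Ns.
     */
    double
    ml_pages_fetched(double tuples_fetched, double pages, double other_pages,
                     double effective_cache_size)
    {
        double  T = (pages > 1) ? pages : 1.0;
        double  b;
        double  pages_fetched;

        /* pro-rate the cache between the table and its index */
        b = effective_cache_size * T / (T + other_pages);
        if (b <= 1.0)
            b = 1.0;
        else
            b = ceil(b);

        if (T <= b)
        {
            pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
            if (pages_fetched >= T)
                pages_fetched = T;
            else
                pages_fetched = ceil(pages_fetched);
        }
        else
        {
            double  lim = (2.0 * T * b) / (2.0 * T - b);

            if (tuples_fetched <= lim)
                pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
            else
                pages_fetched = b + (tuples_fetched - lim) * (T - b) / T;
            pages_fetched = ceil(pages_fetched);
        }
        return pages_fetched;
    }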
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 965574c579d..fa8d06707b8 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -54,7 +54,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.157 2006/06/05 20:56:33 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.158 2006/06/06 17:59:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -133,7 +133,7 @@ clamp_row_est(double nrows) * better and to avoid possible divide-by-zero when interpolating costs. * Make it an integer, too. */ - if (nrows < 1.0) + if (nrows <= 1.0) nrows = 1.0; else nrows = rint(nrows); @@ -181,8 +181,10 @@ cost_seqscan(Path *path, PlannerInfo *root, * * 'index' is the index to be used * 'indexQuals' is the list of applicable qual clauses (implicit AND semantics) - * 'is_injoin' is T if we are considering using the index scan as the inside - * of a nestloop join (hence, some of the indexQuals are join clauses) + * 'outer_rel' is the outer relation when we are considering using the index + * scan as the inside of a nestloop join (hence, some of the indexQuals + * are join clauses, and we should expect repeated scans of the index); + * NULL for a plain index scan * * cost_index() takes an IndexPath not just a Path, because it sets a few * additional fields of the IndexPath besides startup_cost and total_cost. @@ -200,7 +202,7 @@ void cost_index(IndexPath *path, PlannerInfo *root, IndexOptInfo *index, List *indexQuals, - bool is_injoin) + RelOptInfo *outer_rel) { RelOptInfo *baserel = index->rel; Cost startup_cost = 0; @@ -215,8 +217,6 @@ cost_index(IndexPath *path, PlannerInfo *root, Cost cpu_per_tuple; double tuples_fetched; double pages_fetched; - double T, - b; /* Should only be applied to base relations */ Assert(IsA(baserel, RelOptInfo) && @@ -233,10 +233,11 @@ cost_index(IndexPath *path, PlannerInfo *root, * the fraction of main-table tuples we will have to retrieve) and its * correlation to the main-table tuple order. */ - OidFunctionCall7(index->amcostestimate, + OidFunctionCall8(index->amcostestimate, PointerGetDatum(root), PointerGetDatum(index), PointerGetDatum(indexQuals), + PointerGetDatum(outer_rel), PointerGetDatum(&indexStartupCost), PointerGetDatum(&indexTotalCost), PointerGetDatum(&indexSelectivity), @@ -254,86 +255,75 @@ cost_index(IndexPath *path, PlannerInfo *root, startup_cost += indexStartupCost; run_cost += indexTotalCost - indexStartupCost; + /* estimate number of main-table tuples fetched */ + tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples); + /*---------- - * Estimate number of main-table tuples and pages fetched. + * Estimate number of main-table pages fetched, and compute I/O cost. * * When the index ordering is uncorrelated with the table ordering, - * we use an approximation proposed by Mackert and Lohman, "Index Scans - * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions - * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424. 
- * The Mackert and Lohman approximation is that the number of pages - * fetched is - * PF = - * min(2TNs/(2T+Ns), T) when T <= b - * 2TNs/(2T+Ns) when T > b and Ns <= 2Tb/(2T-b) - * b + (Ns - 2Tb/(2T-b))*(T-b)/T when T > b and Ns > 2Tb/(2T-b) - * where - * T = # pages in table - * N = # tuples in table - * s = selectivity = fraction of table to be scanned - * b = # buffer pages available (we include kernel space here) + * we use an approximation proposed by Mackert and Lohman (see + * index_pages_fetched() for details) to compute the number of pages + * fetched, and then charge random_page_cost per page fetched. * * When the index ordering is exactly correlated with the table ordering * (just after a CLUSTER, for example), the number of pages fetched should - * be just sT. What's more, these will be sequential fetches, not the - * random fetches that occur in the uncorrelated case. So, depending on - * the extent of correlation, we should estimate the actual I/O cost - * somewhere between s * T * seq_page_cost and PF * random_page_cost. - * We currently interpolate linearly between these two endpoints based on - * the correlation squared (XXX is that appropriate?). - * - * In any case the number of tuples fetched is Ns. + * be exactly selectivity * table_size. What's more, all but the first + * will be sequential fetches, not the random fetches that occur in the + * uncorrelated case. So if the number of pages is more than 1, we + * ought to charge + * random_page_cost + (pages_fetched - 1) * seq_page_cost + * For partially-correlated indexes, we ought to charge somewhere between + * these two estimates. We currently interpolate linearly between the + * estimates based on the correlation squared (XXX is that appropriate?). *---------- */ + if (outer_rel != NULL && outer_rel->rows > 1) + { + /* + * For repeated indexscans, scale up the number of tuples fetched + * in the Mackert and Lohman formula by the number of scans, so + * that we estimate the number of pages fetched by all the scans. + * Then pro-rate the costs for one scan. In this case we assume + * all the fetches are random accesses. XXX it'd be good to + * include correlation in this model, but it's not clear how to do + * that without double-counting cache effects. + */ + double num_scans = outer_rel->rows; - tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples); - - /* This part is the Mackert and Lohman formula */ + pages_fetched = index_pages_fetched(tuples_fetched * num_scans, + baserel->pages, + index->pages); - T = (baserel->pages > 1) ? (double) baserel->pages : 1.0; - b = (effective_cache_size > 1) ? effective_cache_size : 1.0; - - if (T <= b) - { - pages_fetched = - (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); - if (pages_fetched >= T) - pages_fetched = T; - else - pages_fetched = ceil(pages_fetched); + run_cost += (pages_fetched * random_page_cost) / num_scans; } else { - double lim; + /* + * Normal case: apply the Mackert and Lohman formula, and then + * interpolate between that and the correlation-derived result. 
+ */ + pages_fetched = index_pages_fetched(tuples_fetched, + baserel->pages, + index->pages); - lim = (2.0 * T * b) / (2.0 * T - b); - if (tuples_fetched <= lim) - { - pages_fetched = - (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); - } - else - { - pages_fetched = - b + (tuples_fetched - lim) * (T - b) / T; - } - pages_fetched = ceil(pages_fetched); - } + /* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */ + max_IO_cost = pages_fetched * random_page_cost; - /* - * min_IO_cost corresponds to the perfectly correlated case (csquared=1), - * max_IO_cost to the perfectly uncorrelated case (csquared=0). - */ - min_IO_cost = ceil(indexSelectivity * T) * seq_page_cost; - max_IO_cost = pages_fetched * random_page_cost; + /* min_IO_cost is for the perfectly correlated case (csquared=1) */ + pages_fetched = ceil(indexSelectivity * (double) baserel->pages); + min_IO_cost = random_page_cost; + if (pages_fetched > 1) + min_IO_cost += (pages_fetched - 1) * seq_page_cost; - /* - * Now interpolate based on estimated index order correlation to get total - * disk I/O cost for main table accesses. - */ - csquared = indexCorrelation * indexCorrelation; + /* + * Now interpolate based on estimated index order correlation to get + * total disk I/O cost for main table accesses. + */ + csquared = indexCorrelation * indexCorrelation; - run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost); + run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost); + } /* * Estimate CPU costs per tuple. @@ -348,7 +338,7 @@ cost_index(IndexPath *path, PlannerInfo *root, startup_cost += baserel->baserestrictcost.startup; cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple; - if (!is_injoin) + if (outer_rel == NULL) { QualCost index_qual_cost; @@ -364,18 +354,101 @@ cost_index(IndexPath *path, PlannerInfo *root, } /* + * index_pages_fetched + * Estimate the number of pages actually fetched after accounting for + * cache effects. + * + * We use an approximation proposed by Mackert and Lohman, "Index Scans + * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions + * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424. + * The Mackert and Lohman approximation is that the number of pages + * fetched is + * PF = + * min(2TNs/(2T+Ns), T) when T <= b + * 2TNs/(2T+Ns) when T > b and Ns <= 2Tb/(2T-b) + * b + (Ns - 2Tb/(2T-b))*(T-b)/T when T > b and Ns > 2Tb/(2T-b) + * where + * T = # pages in table + * N = # tuples in table + * s = selectivity = fraction of table to be scanned + * b = # buffer pages available (we include kernel space here) + * + * We assume that effective_cache_size is the total number of buffer pages + * available for both table and index, and pro-rate that space between the + * table and index. (Ideally other_pages should include all the other + * tables and indexes used by the query too; but we don't have a good way + * to get that number here.) + * + * The product Ns is the number of tuples fetched; we pass in that + * product rather than calculating it here. + * + * Caller is expected to have ensured that tuples_fetched is greater than zero + * and rounded to integer (see clamp_row_est). The result will likewise be + * greater than zero and integral. + */ +double +index_pages_fetched(double tuples_fetched, BlockNumber pages, + BlockNumber other_pages) +{ + double pages_fetched; + double T, + b; + + /* T is # pages in table, but don't allow it to be zero */ + T = (pages > 1) ? 
(double) pages : 1.0; + + /* b is pro-rated share of effective_cache_size */ + b = effective_cache_size * T / (T + (double) other_pages); + /* force it positive and integral */ + if (b <= 1.0) + b = 1.0; + else + b = ceil(b); + + /* This part is the Mackert and Lohman formula */ + if (T <= b) + { + pages_fetched = + (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); + if (pages_fetched >= T) + pages_fetched = T; + else + pages_fetched = ceil(pages_fetched); + } + else + { + double lim; + + lim = (2.0 * T * b) / (2.0 * T - b); + if (tuples_fetched <= lim) + { + pages_fetched = + (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); + } + else + { + pages_fetched = + b + (tuples_fetched - lim) * (T - b) / T; + } + pages_fetched = ceil(pages_fetched); + } + return pages_fetched; +} + +/* * cost_bitmap_heap_scan * Determines and returns the cost of scanning a relation using a bitmap * index-then-heap plan. * * 'baserel' is the relation to be scanned * 'bitmapqual' is a tree of IndexPaths, BitmapAndPaths, and BitmapOrPaths - * 'is_injoin' is T if we are considering using the scan as the inside - * of a nestloop join (hence, some of the quals are join clauses) + * + * Note: we take no explicit notice here of whether this is a join inner path. + * If it is, the component IndexPaths should have been costed accordingly. */ void cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, - Path *bitmapqual, bool is_injoin) + Path *bitmapqual) { Cost startup_cost = 0; Cost run_cost = 0; diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index ac6de3d82a5..eaf4e1cb337 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/indxpath.c,v 1.206 2006/05/18 19:56:46 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/indxpath.c,v 1.207 2006/06/06 17:59:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -47,13 +47,11 @@ static List *find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *outer_clauses, - bool istoplevel, bool isjoininner, - Relids outer_relids, + bool istoplevel, RelOptInfo *outer_rel, SaOpControl saop_control); static List *find_saop_paths(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *outer_clauses, - bool istoplevel, bool isjoininner, - Relids outer_relids); + bool istoplevel, RelOptInfo *outer_rel); static Path *choose_bitmap_and(PlannerInfo *root, RelOptInfo *rel, List *paths); static int bitmap_path_comparator(const void *a, const void *b); static Cost bitmap_and_cost_est(PlannerInfo *root, RelOptInfo *rel, List *paths); @@ -164,7 +162,7 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel) */ indexpaths = find_usable_indexes(root, rel, rel->baserestrictinfo, NIL, - true, false, NULL, SAOP_FORBID); + true, NULL, SAOP_FORBID); /* * We can submit them all to add_path. 
(This generates access paths for @@ -190,7 +188,7 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel) */ indexpaths = generate_bitmap_or_paths(root, rel, rel->baserestrictinfo, NIL, - false, NULL); + NULL); bitindexpaths = list_concat(bitindexpaths, indexpaths); /* @@ -199,7 +197,7 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel) */ indexpaths = find_saop_paths(root, rel, rel->baserestrictinfo, NIL, - true, false, NULL); + true, NULL); bitindexpaths = list_concat(bitindexpaths, indexpaths); /* @@ -247,10 +245,8 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel) * 'outer_clauses' is the list of additional upper-level clauses * 'istoplevel' is true if clauses are the rel's top-level restriction list * (outer_clauses must be NIL when this is true) - * 'isjoininner' is true if forming an inner indexscan (so some of the - * given clauses are join clauses) - * 'outer_relids' identifies the outer side of the join (pass NULL - * if not isjoininner) + * 'outer_rel' is the outer side of the join if forming an inner indexscan + * (so some of the given clauses are join clauses); NULL if not * 'saop_control' indicates whether ScalarArrayOpExpr clauses can be used * * Note: check_partial_indexes() must have been run previously. @@ -259,10 +255,10 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel) static List * find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *outer_clauses, - bool istoplevel, bool isjoininner, - Relids outer_relids, + bool istoplevel, RelOptInfo *outer_rel, SaOpControl saop_control) { + Relids outer_relids = outer_rel ? outer_rel->relids : NULL; List *result = NIL; List *all_clauses = NIL; /* not computed till needed */ ListCell *ilist; @@ -349,7 +345,7 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, * relevant unless we are at top level. */ index_is_ordered = OidIsValid(index->ordering[0]); - if (istoplevel && index_is_ordered && !isjoininner) + if (index_is_ordered && istoplevel && outer_rel == NULL) { index_pathkeys = build_index_pathkeys(root, index, ForwardScanDirection, @@ -374,7 +370,7 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, index_is_ordered ? ForwardScanDirection : NoMovementScanDirection, - isjoininner); + outer_rel); result = lappend(result, ipath); } @@ -383,7 +379,7 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, * that we failed to match, consider variant ways of achieving the * ordering. Again, this is only interesting at top level. 
*/ - if (istoplevel && index_is_ordered && !isjoininner && + if (index_is_ordered && istoplevel && outer_rel == NULL && root->query_pathkeys != NIL && pathkeys_useful_for_ordering(root, useful_pathkeys) == 0) { @@ -396,7 +392,7 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, restrictclauses, root->query_pathkeys, scandir, - false); + outer_rel); result = lappend(result, ipath); } } @@ -417,8 +413,7 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, static List * find_saop_paths(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *outer_clauses, - bool istoplevel, bool isjoininner, - Relids outer_relids) + bool istoplevel, RelOptInfo *outer_rel) { bool have_saop = false; ListCell *l; @@ -443,8 +438,7 @@ find_saop_paths(PlannerInfo *root, RelOptInfo *rel, return find_usable_indexes(root, rel, clauses, outer_clauses, - istoplevel, isjoininner, - outer_relids, + istoplevel, outer_rel, SAOP_REQUIRE); } @@ -457,13 +451,13 @@ find_saop_paths(PlannerInfo *root, RelOptInfo *rel, * * outer_clauses is a list of additional clauses that can be assumed true * for the purpose of generating indexquals, but are not to be searched for - * ORs. (See find_usable_indexes() for motivation.) + * ORs. (See find_usable_indexes() for motivation.) outer_rel is the outer + * side when we are considering a nestloop inner indexpath. */ List * generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *outer_clauses, - bool isjoininner, - Relids outer_relids) + RelOptInfo *outer_rel) { List *result = NIL; List *all_clauses; @@ -506,16 +500,14 @@ generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, andargs, all_clauses, false, - isjoininner, - outer_relids, + outer_rel, SAOP_ALLOW); /* Recurse in case there are sub-ORs */ indlist = list_concat(indlist, generate_bitmap_or_paths(root, rel, andargs, all_clauses, - isjoininner, - outer_relids)); + outer_rel)); } else { @@ -525,8 +517,7 @@ generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, list_make1(orarg), all_clauses, false, - isjoininner, - outer_relids, + outer_rel, SAOP_ALLOW); } @@ -724,7 +715,7 @@ bitmap_and_cost_est(PlannerInfo *root, RelOptInfo *rel, List *paths) cost_bitmap_and_node(&apath, root); /* Now we can do cost_bitmap_heap_scan */ - cost_bitmap_heap_scan(&bpath, root, rel, (Path *) &apath, false); + cost_bitmap_heap_scan(&bpath, root, rel, (Path *) &apath); return bpath.total_cost; } @@ -1338,7 +1329,7 @@ matches_any_index(RestrictInfo *rinfo, RelOptInfo *rel, Relids outer_relids) /* * best_inner_indexscan * Finds the best available inner indexscan for a nestloop join - * with the given rel on the inside and the given outer_relids outside. + * with the given rel on the inside and the given outer_rel outside. * May return NULL if there are no possible inner indexscans. 
* * We ignore ordering considerations (since a nestloop's inner scan's order @@ -1353,8 +1344,9 @@ matches_any_index(RestrictInfo *rinfo, RelOptInfo *rel, Relids outer_relids) */ Path * best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel, - Relids outer_relids, JoinType jointype) + RelOptInfo *outer_rel, JoinType jointype) { + Relids outer_relids; Path *cheapest; bool isouterjoin; List *clause_list; @@ -1396,11 +1388,11 @@ best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel, oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(rel)); /* - * Intersect the given outer_relids with index_outer_relids to find the + * Intersect the given outer relids with index_outer_relids to find the * set of outer relids actually relevant for this rel. If there are none, * again we can fail immediately. */ - outer_relids = bms_intersect(rel->index_outer_relids, outer_relids); + outer_relids = bms_intersect(rel->index_outer_relids, outer_rel->relids); if (bms_is_empty(outer_relids)) { bms_free(outer_relids); @@ -1413,6 +1405,13 @@ best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel, * outerrels. (We include the isouterjoin status in the cache lookup key * for safety. In practice I suspect this is not necessary because it * should always be the same for a given innerrel.) + * + * NOTE: because we cache on outer_relids rather than outer_rel->relids, + * we will report the same path and hence path cost for joins with + * different sets of irrelevant rels on the outside. Now that cost_index + * is sensitive to outer_rel->rows, this is not really right. However + * the error is probably not large. Is it worth establishing a separate + * cache entry for each distinct outer_rel->relids set to get this right? */ foreach(l, rel->index_inner_paths) { @@ -1433,9 +1432,9 @@ best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel, * that could be plain indexscans, ie, they don't require the join context * at all. This may seem redundant, but we need to include those scans in * the input given to choose_bitmap_and() to be sure we find optimal AND - * combinations of join and non-join scans. The worst case is that we - * might return a "best inner indexscan" that's really just a plain - * indexscan, causing some redundant effort in joinpath.c. + * combinations of join and non-join scans. Also, even if the "best + * inner indexscan" is just a plain indexscan, it will have a different + * cost estimate because of cache effects. */ clause_list = find_clauses_for_join(root, rel, outer_relids, isouterjoin); @@ -1445,8 +1444,7 @@ best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel, */ indexpaths = find_usable_indexes(root, rel, clause_list, NIL, - false, true, - outer_relids, + false, outer_rel, SAOP_FORBID); /* @@ -1455,8 +1453,7 @@ best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel, */ bitindexpaths = generate_bitmap_or_paths(root, rel, clause_list, NIL, - true, - outer_relids); + outer_rel); /* * Likewise, generate paths using ScalarArrayOpExpr clauses; these can't @@ -1465,8 +1462,7 @@ best_inner_indexscan(PlannerInfo *root, RelOptInfo *rel, bitindexpaths = list_concat(bitindexpaths, find_saop_paths(root, rel, clause_list, NIL, - false, true, - outer_relids)); + false, outer_rel)); /* * Include the regular index paths in bitindexpaths. 
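To summarize the cost_index() rework above in one place, here is a hedged sketch of the new heap-I/O logic. It is illustrative only: heap_io_cost and its parameter names are made up, ml_pages_fetched is the helper sketched after the diffstat, and the cost GUCs are passed as plain arguments. The repeated-scan branch scales the tuple count by the number of outer rows, estimates total pages with Mackert-Lohman, and pro-rates the random-access cost per scan; the plain-scan branch interpolates between the uncorrelated and perfectly correlated estimates by the squared index correlation.

    #include <math.h>

    extern double ml_pages_fetched(double tuples_fetched, double pages,
                                   double other_pages,
                                   double effective_cache_size);

    /* Hedged sketch of the heap-I/O cost model now used by cost_index(). */
    double
    heap_io_cost(double indexSelectivity, double baserel_tuples,
                 double baserel_pages, double index_pages,
                 double indexCorrelation, double outer_rows,
                 double random_page_cost, double seq_page_cost,
                 double effective_cache_size)
    {
        double  tuples_fetched = indexSelectivity * baserel_tuples;
        double  pages_fetched;

        /* mirrors the patched clamp_row_est(): at least 1, else rounded */
        tuples_fetched = (tuples_fetched <= 1.0) ? 1.0 : rint(tuples_fetched);

        if (outer_rows > 1)
        {
            /* repeated nestloop inner scans: amortize over all scans */
            pages_fetched = ml_pages_fetched(tuples_fetched * outer_rows,
                                             baserel_pages, index_pages,
                                             effective_cache_size);
            return (pages_fetched * random_page_cost) / outer_rows;
        }
        else
        {
            double  max_IO_cost;
            double  min_IO_cost;
            double  csquared = indexCorrelation * indexCorrelation;

            /* perfectly uncorrelated: Mackert-Lohman pages, all random */
            pages_fetched = ml_pages_fetched(tuples_fetched,
                                             baserel_pages, index_pages,
                                             effective_cache_size);
            max_IO_cost = pages_fetched * random_page_cost;

            /* perfectly correlated: one random fetch, the rest sequential */
            pages_fetched = ceil(indexSelectivity * baserel_pages);
            min_IO_cost = random_page_cost;
            if (pages_fetched > 1)
                min_IO_cost += (pages_fetched - 1) * seq_page_cost;

            /* interpolate on the squared correlation */
            return max_IO_cost + csquared * (min_IO_cost - max_IO_cost);
        }
    }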
diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index 9ae79cc19e4..30cc23b672d 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/joinpath.c,v 1.103 2006/03/05 15:58:28 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/joinpath.c,v 1.104 2006/06/06 17:59:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -36,7 +36,7 @@ static void hash_inner_and_outer(PlannerInfo *root, RelOptInfo *joinrel, RelOptInfo *outerrel, RelOptInfo *innerrel, List *restrictlist, JoinType jointype); static Path *best_appendrel_indexscan(PlannerInfo *root, RelOptInfo *rel, - Relids outer_relids, JoinType jointype); + RelOptInfo *outer_rel, JoinType jointype); static List *select_mergejoin_clauses(RelOptInfo *joinrel, RelOptInfo *outerrel, RelOptInfo *innerrel, @@ -401,12 +401,10 @@ match_unsorted_outer(PlannerInfo *root, { if (IsA(inner_cheapest_total, AppendPath)) bestinnerjoin = best_appendrel_indexscan(root, innerrel, - outerrel->relids, - jointype); + outerrel, jointype); else if (innerrel->rtekind == RTE_RELATION) bestinnerjoin = best_inner_indexscan(root, innerrel, - outerrel->relids, - jointype); + outerrel, jointype); } } @@ -791,13 +789,13 @@ hash_inner_and_outer(PlannerInfo *root, /* * best_appendrel_indexscan * Finds the best available set of inner indexscans for a nestloop join - * with the given append relation on the inside and the given outer_relids + * with the given append relation on the inside and the given outer_rel * outside. Returns an AppendPath comprising the best inner scans, or * NULL if there are no possible inner indexscans. */ static Path * best_appendrel_indexscan(PlannerInfo *root, RelOptInfo *rel, - Relids outer_relids, JoinType jointype) + RelOptInfo *outer_rel, JoinType jointype) { int parentRTindex = rel->relid; List *append_paths = NIL; @@ -832,7 +830,7 @@ best_appendrel_indexscan(PlannerInfo *root, RelOptInfo *rel, * Get the best innerjoin indexpath (if any) for this child rel. */ bestinnerjoin = best_inner_indexscan(root, childrel, - outer_relids, jointype); + outer_rel, jointype); /* * If no luck on an indexpath for this rel, we'll still consider diff --git a/src/backend/optimizer/path/orindxpath.c b/src/backend/optimizer/path/orindxpath.c index 252835c5fa7..988145ca185 100644 --- a/src/backend/optimizer/path/orindxpath.c +++ b/src/backend/optimizer/path/orindxpath.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/orindxpath.c,v 1.78 2006/03/05 15:58:28 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/orindxpath.c,v 1.79 2006/06/06 17:59:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -107,7 +107,7 @@ create_or_index_quals(PlannerInfo *root, RelOptInfo *rel) * Use the generate_bitmap_or_paths() machinery to estimate the * value of each OR clause. We can use regular restriction * clauses along with the OR clause contents to generate - * indexquals. We pass outer_relids = NULL so that sub-clauses + * indexquals. We pass outer_rel = NULL so that sub-clauses * that are actually joins will be ignored. 
*/ List *orpaths; @@ -116,7 +116,7 @@ create_or_index_quals(PlannerInfo *root, RelOptInfo *rel) orpaths = generate_bitmap_or_paths(root, rel, list_make1(rinfo), rel->baserestrictinfo, - false, NULL); + NULL); /* Locate the cheapest OR path */ foreach(k, orpaths) diff --git a/src/backend/optimizer/plan/planagg.c b/src/backend/optimizer/plan/planagg.c index c2a1199d6c7..c567f8b6b74 100644 --- a/src/backend/optimizer/plan/planagg.c +++ b/src/backend/optimizer/plan/planagg.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/plan/planagg.c,v 1.14 2006/04/28 20:57:49 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/plan/planagg.c,v 1.15 2006/06/06 17:59:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -373,7 +373,7 @@ build_minmax_path(PlannerInfo *root, RelOptInfo *rel, MinMaxAggInfo *info) restrictclauses, NIL, indexscandir, - false); + NULL); /* * Estimate actual cost of fetching just one row. diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 170f260bfbc..f8270074142 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.127 2006/03/05 15:58:31 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.128 2006/06/06 17:59:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -420,8 +420,8 @@ create_seqscan_path(PlannerInfo *root, RelOptInfo *rel) * 'indexscandir' is ForwardScanDirection or BackwardScanDirection * for an ordered index, or NoMovementScanDirection for * an unordered index. - * 'isjoininner' is TRUE if this is a join inner indexscan path. - * (pathkeys and indexscandir are ignored if so.) + * 'outer_rel' is the outer relation if this is a join inner indexscan path. + * (pathkeys and indexscandir are ignored if so.) NULL if not. * * Returns the new path node. */ @@ -431,7 +431,7 @@ create_index_path(PlannerInfo *root, List *clause_groups, List *pathkeys, ScanDirection indexscandir, - bool isjoininner) + RelOptInfo *outer_rel) { IndexPath *pathnode = makeNode(IndexPath); RelOptInfo *rel = index->rel; @@ -445,7 +445,7 @@ create_index_path(PlannerInfo *root, * don't really care what order it's scanned in. (We could expect the * caller to supply the correct values, but it's easier to force it here.) */ - if (isjoininner) + if (outer_rel != NULL) { pathkeys = NIL; indexscandir = NoMovementScanDirection; @@ -466,10 +466,10 @@ create_index_path(PlannerInfo *root, pathnode->indexclauses = allclauses; pathnode->indexquals = indexquals; - pathnode->isjoininner = isjoininner; + pathnode->isjoininner = (outer_rel != NULL); pathnode->indexscandir = indexscandir; - if (isjoininner) + if (outer_rel != NULL) { /* * We must compute the estimated number of output rows for the @@ -505,7 +505,7 @@ create_index_path(PlannerInfo *root, pathnode->rows = rel->rows; } - cost_index(pathnode, root, index, indexquals, isjoininner); + cost_index(pathnode, root, index, indexquals, outer_rel); return pathnode; } @@ -515,6 +515,9 @@ create_index_path(PlannerInfo *root, * Creates a path node for a bitmap scan. * * 'bitmapqual' is a tree of IndexPath, BitmapAndPath, and BitmapOrPath nodes. + * + * If this is a join inner indexscan path, the component IndexPaths should + * have been costed accordingly, and TRUE should be passed for isjoininner. 
*/ BitmapHeapPath * create_bitmap_heap_path(PlannerInfo *root, @@ -560,7 +563,7 @@ create_bitmap_heap_path(PlannerInfo *root, pathnode->rows = rel->rows; } - cost_bitmap_heap_scan(&pathnode->path, root, rel, bitmapqual, false); + cost_bitmap_heap_scan(&pathnode->path, root, rel, bitmapqual); return pathnode; } diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index c75ceef3732..8bf517e4208 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -15,7 +15,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.206 2006/06/05 02:49:58 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.207 2006/06/06 17:59:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -4465,6 +4465,7 @@ string_to_bytea_const(const char *str, size_t str_len) static void genericcostestimate(PlannerInfo *root, IndexOptInfo *index, List *indexQuals, + RelOptInfo *outer_rel, double numIndexTuples, Cost *indexStartupCost, Cost *indexTotalCost, @@ -4538,26 +4539,61 @@ genericcostestimate(PlannerInfo *root, /* * Estimate the number of index pages that will be retrieved. * - * For all currently-supported index types, the first page of the index is - * a metadata page, and we should figure on fetching that plus a pro-rated - * fraction of the remaining pages. + * We use the simplistic method of taking a pro-rata fraction of the + * total number of index pages. In effect, this counts only leaf pages + * and not any overhead such as index metapage or upper tree levels. + * In practice this seems a better approximation than charging for + * access to the upper levels, perhaps because those tend to stay in + * cache under load. */ - if (index->pages > 1 && index->tuples > 0) - { - numIndexPages = (numIndexTuples / index->tuples) * (index->pages - 1); - numIndexPages += 1; /* count the metapage too */ - numIndexPages = ceil(numIndexPages); - } + if (index->pages > 1 && index->tuples > 1) + numIndexPages = ceil(numIndexTuples * index->pages / index->tuples); else numIndexPages = 1.0; /* - * Compute the index access cost. + * Now compute the disk access costs. * - * Disk cost: our generic assumption is that the index pages will be read - * sequentially, so they cost seq_page_cost each, not random_page_cost. + * The above calculations are all per-index-scan. However, if we are + * in a nestloop inner scan, we can expect the scan to be repeated (with + * different search keys) for each row of the outer relation. This + * creates the potential for cache effects to reduce the number of + * disk page fetches needed. We want to estimate the average per-scan + * I/O cost in the presence of caching. + * + * We use the Mackert-Lohman formula (see costsize.c for details) to + * estimate the total number of page fetches that occur. While this + * wasn't what it was designed for, it seems a reasonable model anyway. + * Note that we are counting pages not tuples anymore, so we take + * N = T = index size, as if there were one "tuple" per page. 
*/ - *indexTotalCost = seq_page_cost * numIndexPages; + if (outer_rel != NULL && outer_rel->rows > 1) + { + double num_scans = outer_rel->rows; + double pages_fetched; + + /* total page fetches ignoring cache effects */ + pages_fetched = numIndexPages * num_scans; + + /* use Mackert and Lohman formula to adjust for cache effects */ + pages_fetched = index_pages_fetched(pages_fetched, + index->pages, + index->rel->pages); + + /* + * Now compute the total disk access cost, and then report a + * pro-rated share for one index scan. + */ + *indexTotalCost = (pages_fetched * random_page_cost) / num_scans; + } + else + { + /* + * For a single index scan, we just charge random_page_cost per page + * touched. + */ + *indexTotalCost = numIndexPages * random_page_cost; + } /* * CPU cost: any complex expressions in the indexquals will need to be @@ -4594,10 +4630,11 @@ btcostestimate(PG_FUNCTION_ARGS) PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1); List *indexQuals = (List *) PG_GETARG_POINTER(2); - Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3); - Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4); - Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5); - double *indexCorrelation = (double *) PG_GETARG_POINTER(6); + RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3); + Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(4); + Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(5); + Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6); + double *indexCorrelation = (double *) PG_GETARG_POINTER(7); Oid relid; AttrNumber colnum; HeapTuple tuple; @@ -4728,7 +4765,7 @@ btcostestimate(PG_FUNCTION_ARGS) numIndexTuples = btreeSelectivity * index->rel->tuples; } - genericcostestimate(root, index, indexQuals, numIndexTuples, + genericcostestimate(root, index, indexQuals, outer_rel, numIndexTuples, indexStartupCost, indexTotalCost, indexSelectivity, indexCorrelation); @@ -4800,12 +4837,13 @@ hashcostestimate(PG_FUNCTION_ARGS) PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1); List *indexQuals = (List *) PG_GETARG_POINTER(2); - Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3); - Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4); - Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5); - double *indexCorrelation = (double *) PG_GETARG_POINTER(6); + RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3); + Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(4); + Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(5); + Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6); + double *indexCorrelation = (double *) PG_GETARG_POINTER(7); - genericcostestimate(root, index, indexQuals, 0.0, + genericcostestimate(root, index, indexQuals, outer_rel, 0.0, indexStartupCost, indexTotalCost, indexSelectivity, indexCorrelation); @@ -4818,12 +4856,13 @@ gistcostestimate(PG_FUNCTION_ARGS) PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1); List *indexQuals = (List *) PG_GETARG_POINTER(2); - Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3); - Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4); - Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5); - double *indexCorrelation = (double *) PG_GETARG_POINTER(6); + RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3); + Cost *indexStartupCost = (Cost *) 
PG_GETARG_POINTER(4); + Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(5); + Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6); + double *indexCorrelation = (double *) PG_GETARG_POINTER(7); - genericcostestimate(root, index, indexQuals, 0.0, + genericcostestimate(root, index, indexQuals, outer_rel, 0.0, indexStartupCost, indexTotalCost, indexSelectivity, indexCorrelation); @@ -4836,12 +4875,13 @@ gincostestimate(PG_FUNCTION_ARGS) PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1); List *indexQuals = (List *) PG_GETARG_POINTER(2); - Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3); - Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4); - Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5); - double *indexCorrelation = (double *) PG_GETARG_POINTER(6); + RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(3); + Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(4); + Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(5); + Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(6); + double *indexCorrelation = (double *) PG_GETARG_POINTER(7); - genericcostestimate(root, index, indexQuals, 0.0, + genericcostestimate(root, index, indexQuals, outer_rel, 0.0, indexStartupCost, indexTotalCost, indexSelectivity, indexCorrelation); |
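On the selfuncs.c side, the new disk-access charging in genericcostestimate() boils down to the following sketch, again with illustrative names only (index_io_cost is not a real function; ml_pages_fetched is the helper sketched earlier): leaf pages are estimated pro rata from the fraction of index tuples visited, and for a nestloop inner scan the page fetches of all scans are pushed through Mackert-Lohman with N = T = index size, then pro-rated back to a single scan.

    #include <math.h>

    extern double ml_pages_fetched(double tuples_fetched, double pages,
                                   double other_pages,
                                   double effective_cache_size);

    /* Hedged sketch of the index-page I/O charge in genericcostestimate(). */
    double
    index_io_cost(double numIndexTuples, double index_pages, double index_tuples,
                  double heap_pages, double outer_rows,
                  double random_page_cost, double effective_cache_size)
    {
        double  numIndexPages;

        /* pro-rata share of the index: effectively counts only leaf pages */
        if (index_pages > 1 && index_tuples > 1)
            numIndexPages = ceil(numIndexTuples * index_pages / index_tuples);
        else
            numIndexPages = 1.0;

        if (outer_rows > 1)
        {
            /* total fetches over all scans, cache-adjusted, then pro-rated */
            double  pages_fetched = ml_pages_fetched(numIndexPages * outer_rows,
                                                     index_pages, heap_pages,
                                                     effective_cache_size);

            return (pages_fetched * random_page_cost) / outer_rows;
        }

        /* single scan: charge random_page_cost per index page touched */
        return numIndexPages * random_page_cost;
    }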