Diffstat (limited to 'src/backend')
-rw-r--r--  src/backend/nodes/outfuncs.c                3
-rw-r--r--  src/backend/optimizer/path/allpaths.c      15
-rw-r--r--  src/backend/optimizer/path/costsize.c     133
-rw-r--r--  src/backend/optimizer/plan/createplan.c    22
-rw-r--r--  src/backend/optimizer/util/pathnode.c      45
5 files changed, 108 insertions, 110 deletions
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 7abee7a2d84..4e1c96271a9 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.371 2009/10/28 14:55:38 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.372 2009/11/15 02:45:34 tgl Exp $
*
* NOTES
* Every node type that can appear in stored rules' parsetrees *must*
@@ -1501,6 +1501,7 @@ _outMergePath(StringInfo str, MergePath *node)
WRITE_NODE_FIELD(path_mergeclauses);
WRITE_NODE_FIELD(outersortkeys);
WRITE_NODE_FIELD(innersortkeys);
+ WRITE_BOOL_FIELD(materialize_inner);
}
static void
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c
index 4d402ca7202..c225d7e2887 100644
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/optimizer/path/allpaths.c,v 1.188 2009/10/26 02:26:33 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/optimizer/path/allpaths.c,v 1.189 2009/11/15 02:45:35 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1443,13 +1443,12 @@ print_path(PlannerInfo *root, Path *path, int indent)
{
MergePath *mp = (MergePath *) path;
- if (mp->outersortkeys || mp->innersortkeys)
- {
- for (i = 0; i < indent; i++)
- printf("\t");
- printf(" sortouter=%d sortinner=%d\n",
- ((mp->outersortkeys) ? 1 : 0),
- ((mp->innersortkeys) ? 1 : 0));
+ for (i = 0; i < indent; i++)
+ printf("\t");
+ printf(" sortouter=%d sortinner=%d materializeinner=%d\n",
+ ((mp->outersortkeys) ? 1 : 0),
+ ((mp->innersortkeys) ? 1 : 0),
+ ((mp->materialize_inner) ? 1 : 0));
}
}
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 6acc5ae34b0..ffbd9afbbae 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -54,7 +54,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.211 2009/09/12 22:12:03 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.212 2009/11/15 02:45:35 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1167,23 +1167,6 @@ cost_sort(Path *path, PlannerInfo *root,
}
/*
- * sort_exceeds_work_mem
- * Given a finished Sort plan node, detect whether it is expected to
- * spill to disk (ie, will need more than work_mem workspace)
- *
- * This assumes there will be no available LIMIT.
- */
-bool
-sort_exceeds_work_mem(Sort *sort)
-{
- double input_bytes = relation_byte_size(sort->plan.plan_rows,
- sort->plan.plan_width);
- long work_mem_bytes = work_mem * 1024L;
-
- return (input_bytes > work_mem_bytes);
-}
-
-/*
* cost_material
* Determines and returns the cost of materializing a relation, including
* the cost of reading the input data.
@@ -1543,7 +1526,18 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
* Determines and returns the cost of joining two relations using the
* merge join algorithm.
*
- * 'path' is already filled in except for the cost fields
+ * Unlike other costsize functions, this routine makes one actual decision:
+ * whether we should materialize the inner path. We do that either because
+ * the inner path can't support mark/restore, or because it's cheaper to
+ * use an interposed Material node to handle mark/restore. When the decision
+ * is cost-based it would be logically cleaner to build and cost two separate
+ * paths with and without that flag set; but that would require repeating most
+ * of the calculations here, which are not all that cheap. Since the choice
+ * will not affect output pathkeys or startup cost, only total cost, there is
+ * no possibility of wanting to keep both paths. So it seems best to make
+ * the decision here and record it in the path's materialize_inner field.
+ *
+ * 'path' is already filled in except for the cost fields and materialize_inner
* 'sjinfo' is extra info about the join for selectivity estimation
*
* Notes: path's mergeclauses should be a subset of the joinrestrictinfo list;
@@ -1561,7 +1555,10 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
List *innersortkeys = path->innersortkeys;
Cost startup_cost = 0;
Cost run_cost = 0;
- Cost cpu_per_tuple;
+ Cost cpu_per_tuple,
+ inner_run_cost,
+ bare_inner_cost,
+ mat_inner_cost;
QualCost merge_qual_cost;
QualCost qp_qual_cost;
double outer_path_rows = PATH_ROWS(outer_path);
@@ -1606,10 +1603,7 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
/*
* When there are equal merge keys in the outer relation, the mergejoin
* must rescan any matching tuples in the inner relation. This means
- * re-fetching inner tuples. Our cost model for this is that a re-fetch
- * costs the same as an original fetch, which is probably an overestimate;
- * but on the other hand we ignore the bookkeeping costs of mark/restore.
- * Not clear if it's worth developing a more refined model.
+ * re-fetching inner tuples; we have to estimate how often that happens.
*
* For regular inner and outer joins, the number of re-fetches can be
* estimated approximately as size of merge join output minus size of
@@ -1641,7 +1635,7 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
if (rescannedtuples < 0)
rescannedtuples = 0;
}
- /* We'll inflate inner run cost this much to account for rescanning */
+ /* We'll inflate various costs this much to account for rescanning */
rescanratio = 1.0 + (rescannedtuples / inner_path_rows);
/*
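
[Editorial illustration, not part of the patch.] A toy numeric example of the rescan inflation factor computed just above, using invented row counts; the real function derives the join output estimate from its own selectivity computations:

/* Hypothetical estimates, purely for illustration. */
static double
example_rescanratio(void)
{
	double		inner_path_rows = 500000.0;		/* rows in the inner input */
	double		mergejointuples = 800000.0;		/* estimated join output size */
	double		rescannedtuples = mergejointuples - inner_path_rows;	/* 300000 */

	if (rescannedtuples < 0)
		rescannedtuples = 0;

	/*
	 * 1.0 + 300000/500000 = 1.6: the inner-side cost (bare or materialized)
	 * is inflated by 60% to cover re-fetching tuples with duplicate merge keys.
	 */
	return 1.0 + (rescannedtuples / inner_path_rows);
}
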
@@ -1778,32 +1772,83 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
-1.0);
startup_cost += sort_path.startup_cost;
startup_cost += (sort_path.total_cost - sort_path.startup_cost)
- * innerstartsel * rescanratio;
- run_cost += (sort_path.total_cost - sort_path.startup_cost)
- * (innerendsel - innerstartsel) * rescanratio;
-
- /*
- * If the inner sort is expected to spill to disk, we want to add a
- * materialize node to shield it from the need to handle mark/restore.
- * This will allow it to perform the last merge pass on-the-fly, while
- * in most cases not requiring the materialize to spill to disk.
- * Charge an extra cpu_tuple_cost per tuple to account for the
- * materialize node. (Keep this estimate in sync with similar ones in
- * create_mergejoin_path and create_mergejoin_plan.)
- */
- if (relation_byte_size(inner_path_rows, inner_path->parent->width) >
- (work_mem * 1024L))
- run_cost += cpu_tuple_cost * inner_path_rows;
+ * innerstartsel;
+ inner_run_cost = (sort_path.total_cost - sort_path.startup_cost)
+ * (innerendsel - innerstartsel);
}
else
{
startup_cost += inner_path->startup_cost;
startup_cost += (inner_path->total_cost - inner_path->startup_cost)
- * innerstartsel * rescanratio;
- run_cost += (inner_path->total_cost - inner_path->startup_cost)
- * (innerendsel - innerstartsel) * rescanratio;
+ * innerstartsel;
+ inner_run_cost = (inner_path->total_cost - inner_path->startup_cost)
+ * (innerendsel - innerstartsel);
}
+ /*
+ * Decide whether we want to materialize the inner input to shield it from
+ * mark/restore and performing re-fetches. Our cost model for regular
+ * re-fetches is that a re-fetch costs the same as an original fetch,
+ * which is probably an overestimate; but on the other hand we ignore the
+ * bookkeeping costs of mark/restore. Not clear if it's worth developing
+ * a more refined model. So we just need to inflate the inner run cost
+ * by rescanratio.
+ */
+ bare_inner_cost = inner_run_cost * rescanratio;
+ /*
+ * When we interpose a Material node the re-fetch cost is assumed to be
+ * just cpu_tuple_cost per tuple, independently of the underlying plan's
+ * cost; but we have to charge an extra cpu_tuple_cost per original fetch
+ * as well. Note that we're assuming the materialize node will never
+ * spill to disk, since it only has to remember tuples back to the last
+ * mark. (If there are a huge number of duplicates, our other cost
+ * factors will make the path so expensive that it probably won't get
+ * chosen anyway.) So we don't use cost_rescan here.
+ *
+ * Note: keep this estimate in sync with create_mergejoin_plan's labeling
+ * of the generated Material node.
+ */
+ mat_inner_cost = inner_run_cost +
+ cpu_tuple_cost * inner_path_rows * rescanratio;
+
+ /* Prefer materializing if it looks cheaper */
+ if (mat_inner_cost < bare_inner_cost)
+ path->materialize_inner = true;
+ /*
+ * Even if materializing doesn't look cheaper, we *must* do it if the
+ * inner path is to be used directly (without sorting) and it doesn't
+ * support mark/restore.
+ *
+ * Since the inner side must be ordered, and only Sorts and IndexScans can
+ * create order to begin with, and they both support mark/restore, you
+ * might think there's no problem --- but you'd be wrong. Nestloop and
+ * merge joins can *preserve* the order of their inputs, so they can be
+ * selected as the input of a mergejoin, and they don't support
+ * mark/restore at present.
+ */
+ else if (innersortkeys == NIL &&
+ !ExecSupportsMarkRestore(inner_path->pathtype))
+ path->materialize_inner = true;
+ /*
+ * Also, force materializing if the inner path is to be sorted and the
+ * sort is expected to spill to disk. This is because the final merge
+ * pass can be done on-the-fly if it doesn't have to support mark/restore.
+ * We don't try to adjust the cost estimates for this consideration,
+ * though.
+ */
+ else if (innersortkeys != NIL &&
+ relation_byte_size(inner_path_rows, inner_path->parent->width) >
+ (work_mem * 1024L))
+ path->materialize_inner = true;
+ else
+ path->materialize_inner = false;
+
+ /* Charge the right incremental cost for the chosen case */
+ if (path->materialize_inner)
+ run_cost += mat_inner_cost;
+ else
+ run_cost += bare_inner_cost;
+
/* CPU costs */
/*
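
[Editorial illustration, not part of the patch.] The decision this hunk adds to cost_mergejoin() can be summarized as a standalone sketch. The function name choose_materialize_inner, the flattened boolean parameters, and the Cost typedef are inventions for illustration; the real code inspects the Path tree and calls ExecSupportsMarkRestore() and relation_byte_size() directly.

#include <stdbool.h>

typedef double Cost;

static bool
choose_materialize_inner(Cost inner_run_cost,		/* cost to read inner once */
						 double inner_path_rows,
						 double rescanratio,		/* 1.0 + rescans/rows */
						 double cpu_tuple_cost,
						 bool inner_needs_sort,		/* innersortkeys != NIL */
						 bool inner_supports_markrestore,
						 bool inner_sort_spills_to_disk,
						 Cost *charged_inner_cost)
{
	/* Bare inner: every re-fetch costs as much as an original fetch. */
	Cost		bare_inner_cost = inner_run_cost * rescanratio;

	/*
	 * Materialized inner: re-fetches cost cpu_tuple_cost per tuple, and each
	 * original fetch also pays an extra cpu_tuple_cost for the Material node.
	 */
	Cost		mat_inner_cost = inner_run_cost +
		cpu_tuple_cost * inner_path_rows * rescanratio;

	bool		materialize;

	if (mat_inner_cost < bare_inner_cost)
		materialize = true;		/* materializing looks cheaper */
	else if (!inner_needs_sort && !inner_supports_markrestore)
		materialize = true;		/* required: bare inner can't mark/restore */
	else if (inner_needs_sort && inner_sort_spills_to_disk)
		materialize = true;		/* lets the sort do its last merge on the fly */
	else
		materialize = false;

	*charged_inner_cost = materialize ? mat_inner_cost : bare_inner_cost;
	return materialize;
}
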
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index b068d2f3f83..41238b7c0e6 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -10,7 +10,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.266 2009/10/26 02:26:33 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.267 2009/11/15 02:45:35 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1664,9 +1664,8 @@ create_mergejoin_plan(PlannerInfo *root,
best_path->jpath.outerjoinpath->parent->relids);
/*
- * Create explicit sort nodes for the outer and inner join paths if
- * necessary. The sort cost was already accounted for in the path. Make
- * sure there are no excess columns in the inputs if sorting.
+ * Create explicit sort nodes for the outer and inner paths if necessary.
+ * Make sure there are no excess columns in the inputs if sorting.
*/
if (best_path->outersortkeys)
{
@@ -1695,23 +1694,17 @@ create_mergejoin_plan(PlannerInfo *root,
innerpathkeys = best_path->jpath.innerjoinpath->pathkeys;
/*
- * If inner plan is a sort that is expected to spill to disk, add a
- * materialize node to shield it from the need to handle mark/restore.
- * This will allow it to perform the last merge pass on-the-fly, while in
- * most cases not requiring the materialize to spill to disk.
- *
- * XXX really, Sort oughta do this for itself, probably, to avoid the
- * overhead of a separate plan node.
+ * If specified, add a materialize node to shield the inner plan from
+ * the need to handle mark/restore.
*/
- if (IsA(inner_plan, Sort) &&
- sort_exceeds_work_mem((Sort *) inner_plan))
+ if (best_path->materialize_inner)
{
Plan *matplan = (Plan *) make_material(inner_plan);
/*
* We assume the materialize will not spill to disk, and therefore
* charge just cpu_tuple_cost per tuple. (Keep this estimate in sync
- * with similar ones in cost_mergejoin and create_mergejoin_path.)
+ * with cost_mergejoin.)
*/
copy_plan_costsize(matplan, inner_plan);
matplan->total_cost += cpu_tuple_cost * matplan->plan_rows;
@@ -1887,6 +1880,7 @@ create_mergejoin_plan(PlannerInfo *root,
inner_plan,
best_path->jpath.jointype);
+ /* Costs of sort and material steps are included in path cost already */
copy_path_costsize(&join_plan->join.plan, &best_path->jpath.path);
return join_plan;
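
[Editorial illustration, not part of the patch.] When best_path->materialize_inner is set, the generated plan interposes a Material node above the (possibly sorted) inner input, and only the cpu_tuple_cost-per-row surcharge is added on top of the inner plan's own cost. Schematically, with invented relation and index names, the resulting plan tree looks like:

	Merge Join
	  Merge Cond: (o.k = i.k)
	  ->  Index Scan using o_k_idx on outer_tbl o
	  ->  Materialize
	        ->  Index Scan using i_k_idx on inner_tbl i
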
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index e9733892cc2..62169e589b9 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.154 2009/09/17 20:49:29 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.155 2009/11/15 02:45:35 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -17,7 +17,6 @@
#include <math.h>
#include "catalog/pg_operator.h"
-#include "executor/executor.h"
#include "miscadmin.h"
#include "optimizer/clauses.h"
#include "optimizer/cost.h"
@@ -1414,47 +1413,6 @@ create_mergejoin_path(PlannerInfo *root,
pathkeys_contained_in(innersortkeys, inner_path->pathkeys))
innersortkeys = NIL;
- /*
- * If we are not sorting the inner path, we may need a materialize node to
- * ensure it can be marked/restored.
- *
- * Since the inner side must be ordered, and only Sorts and IndexScans can
- * create order to begin with, and they both support mark/restore, you
- * might think there's no problem --- but you'd be wrong. Nestloop and
- * merge joins can *preserve* the order of their inputs, so they can be
- * selected as the input of a mergejoin, and they don't support
- * mark/restore at present.
- *
- * Note: Sort supports mark/restore, so no materialize is really needed in
- * that case; but one may be desirable anyway to optimize the sort.
- * However, since we aren't representing the sort step separately in the
- * Path tree, we can't explicitly represent the materialize either. So
- * that case is not handled here. Instead, cost_mergejoin has to factor
- * in the cost and create_mergejoin_plan has to add the plan node.
- */
- if (innersortkeys == NIL &&
- !ExecSupportsMarkRestore(inner_path->pathtype))
- {
- Path *mpath;
-
- mpath = (Path *) create_material_path(inner_path->parent, inner_path);
-
- /*
- * We expect the materialize won't spill to disk (it could only do so
- * if there were a whole lot of duplicate tuples, which is a case
- * cost_mergejoin will avoid choosing anyway). Therefore
- * cost_material's cost estimate is bogus and we should charge just
- * cpu_tuple_cost per tuple. (Keep this estimate in sync with similar
- * ones in cost_mergejoin and create_mergejoin_plan; also see
- * cost_rescan.)
- */
- mpath->startup_cost = inner_path->startup_cost;
- mpath->total_cost = inner_path->total_cost;
- mpath->total_cost += cpu_tuple_cost * inner_path->parent->rows;
-
- inner_path = mpath;
- }
-
pathnode->jpath.path.pathtype = T_MergeJoin;
pathnode->jpath.path.parent = joinrel;
pathnode->jpath.jointype = jointype;
@@ -1465,6 +1423,7 @@ create_mergejoin_path(PlannerInfo *root,
pathnode->path_mergeclauses = mergeclauses;
pathnode->outersortkeys = outersortkeys;
pathnode->innersortkeys = innersortkeys;
+ /* pathnode->materialize_inner will be set by cost_mergejoin */
cost_mergejoin(pathnode, root, sjinfo);