 src/backend/executor/nodeLimit.c        |  33
 src/backend/executor/nodeSort.c         |  12
 src/backend/optimizer/path/costsize.c   |  77
 src/backend/optimizer/plan/createplan.c |  31
 src/backend/optimizer/plan/planmain.c   |  13
 src/backend/optimizer/plan/planner.c    |  30
 src/backend/optimizer/util/pathnode.c   |   5
 src/backend/utils/misc/guc.c            |  25
 src/backend/utils/sort/tuplesort.c      | 275
 src/include/nodes/execnodes.h           |   6
 src/include/optimizer/cost.h            |   5
 src/include/optimizer/planmain.h        |   6
 src/include/utils/tuplesort.h           |   4
 13 files changed, 464 insertions(+), 58 deletions(-)
diff --git a/src/backend/executor/nodeLimit.c b/src/backend/executor/nodeLimit.c
index d2ecb3722f7..9d40952647d 100644
--- a/src/backend/executor/nodeLimit.c
+++ b/src/backend/executor/nodeLimit.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/executor/nodeLimit.c,v 1.29 2007/01/05 22:19:28 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/executor/nodeLimit.c,v 1.30 2007/05/04 01:13:43 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -280,6 +280,37 @@ recompute_limits(LimitState *node)
/* Reset position to start-of-scan */
node->position = 0;
node->subSlot = NULL;
+
+ /*
+ * If we have a COUNT, and our input is a Sort node, notify it that it can
+ * use bounded sort.
+ *
+ * This is a bit of a kluge, but we don't have any more-abstract way of
+ * communicating between the two nodes; and it doesn't seem worth trying
+ * to invent one without some more examples of special communication needs.
+ *
+ * Note: it is the responsibility of nodeSort.c to react properly to
+ * changes of these parameters. If we ever do redesign this, it'd be
+ * a good idea to integrate this signaling with the parameter-change
+ * mechanism.
+ */
+ if (IsA(outerPlanState(node), SortState))
+ {
+ SortState *sortState = (SortState *) outerPlanState(node);
+ int64 tuples_needed = node->count + node->offset;
+
+ /* negative test checks for overflow */
+ if (node->noCount || tuples_needed < 0)
+ {
+ /* make sure flag gets reset if needed upon rescan */
+ sortState->bounded = false;
+ }
+ else
+ {
+ sortState->bounded = true;
+ sortState->bound = tuples_needed;
+ }
+ }
}
/* ----------------------------------------------------------------
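[Note on the "negative test checks for overflow" trick above: count and offset are both non-negative int64 values, so a genuine sum can never be negative; a negative result can only mean the addition wrapped around. A minimal freestanding illustration (variable names are mine, not the executor's, and the wraparound behavior is an assumption, since ISO C leaves signed overflow undefined):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int64_t count = 10;                     /* e.g. LIMIT 10 ...  */
        int64_t offset = 5;                     /* ... OFFSET 5       */
        int64_t tuples_needed = count + offset;

        if (tuples_needed < 0)                  /* sum wrapped around */
            printf("overflow: fall back to an unbounded sort\n");
        else
            printf("bounded sort: only the first %lld tuples matter\n",
                   (long long) tuples_needed);
        return 0;
    }
]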
diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c
index 97b5c4ff150..5b18235258f 100644
--- a/src/backend/executor/nodeSort.c
+++ b/src/backend/executor/nodeSort.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/executor/nodeSort.c,v 1.60 2007/01/09 02:14:11 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/executor/nodeSort.c,v 1.61 2007/05/04 01:13:44 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -89,6 +89,8 @@ ExecSort(SortState *node)
plannode->nullsFirst,
work_mem,
node->randomAccess);
+ if (node->bounded)
+ tuplesort_set_bound(tuplesortstate, node->bound);
node->tuplesortstate = (void *) tuplesortstate;
/*
@@ -119,6 +121,8 @@ ExecSort(SortState *node)
* finally set the sorted flag to true
*/
node->sort_Done = true;
+ node->bounded_Done = node->bounded;
+ node->bound_Done = node->bound;
SO1_printf("ExecSort: %s\n", "sorting done");
}
@@ -167,6 +171,7 @@ ExecInitSort(Sort *node, EState *estate, int eflags)
EXEC_FLAG_BACKWARD |
EXEC_FLAG_MARK)) != 0;
+ sortstate->bounded = false;
sortstate->sort_Done = false;
sortstate->tuplesortstate = NULL;
@@ -307,11 +312,14 @@ ExecReScanSort(SortState *node, ExprContext *exprCtxt)
/*
* If subnode is to be rescanned then we forget previous sort results; we
- * have to re-read the subplan and re-sort.
+ * have to re-read the subplan and re-sort. Also must re-sort if the
+ * bounded-sort parameters changed or we didn't select randomAccess.
*
* Otherwise we can just rewind and rescan the sorted output.
*/
if (((PlanState *) node)->lefttree->chgParam != NULL ||
+ node->bounded != node->bounded_Done ||
+ node->bound != node->bound_Done ||
!node->randomAccess)
{
node->sort_Done = false;
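[Note: the ordering of this handshake matters. The Limit node sets the bound in recompute_limits() before ExecSort first runs, and ExecSort forwards it via tuplesort_set_bound() before loading any tuples, which is what tuplesort_set_bound()'s asserts demand (see tuplesort.c below). A condensed, self-contained mock of that ordering, using stand-in types rather than the real executor API:

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct
    {
        bool bounded;
        long bound;
        long ntuples;
    } MockSort;

    static void mock_set_bound(MockSort *ts, long bound)
    {
        /* mirrors tuplesort_set_bound's "no tuples loaded yet" precondition */
        if (ts->ntuples != 0)
            return;
        ts->bounded = true;
        ts->bound = bound;
    }

    static void mock_puttuple(MockSort *ts)
    {
        ts->ntuples++;
    }

    int main(void)
    {
        MockSort ts = {false, 0, 0};

        mock_set_bound(&ts, 15);        /* Limit pushed down COUNT+OFFSET = 15 */
        for (int i = 0; i < 100; i++)   /* Sort still reads its whole input */
            mock_puttuple(&ts);
        printf("bounded=%d bound=%ld loaded=%ld\n",
               (int) ts.bounded, ts.bound, ts.ntuples);
        return 0;
    }
]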
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 9cd3a51d897..82dfc77f065 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -54,7 +54,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.181 2007/04/21 21:01:44 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.182 2007/05/04 01:13:44 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -922,6 +922,10 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
* disk traffic = 2 * relsize * ceil(logM(p / (2*work_mem)))
* cpu = comparison_cost * t * log2(t)
*
+ * If the sort is bounded (i.e., only the first k result tuples are needed)
+ * and k tuples can fit into work_mem, we use a heap method that keeps only
+ * k tuples in the heap; this will require about t*log2(k) tuple comparisons.
+ *
* The disk traffic is assumed to be 3/4ths sequential and 1/4th random
* accesses (XXX can't we refine that guess?)
*
@@ -932,6 +936,7 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
* 'input_cost' is the total cost for reading the input data
* 'tuples' is the number of tuples in the relation
* 'width' is the average tuple width in bytes
+ * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
*
* NOTE: some callers currently pass NIL for pathkeys because they
* can't conveniently supply the sort keys. Since this routine doesn't
@@ -942,11 +947,14 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
*/
void
cost_sort(Path *path, PlannerInfo *root,
- List *pathkeys, Cost input_cost, double tuples, int width)
+ List *pathkeys, Cost input_cost, double tuples, int width,
+ double limit_tuples)
{
Cost startup_cost = input_cost;
Cost run_cost = 0;
- double nbytes = relation_byte_size(tuples, width);
+ double input_bytes = relation_byte_size(tuples, width);
+ double output_bytes;
+ double output_tuples;
long work_mem_bytes = work_mem * 1024L;
if (!enable_sort)
@@ -959,23 +967,39 @@ cost_sort(Path *path, PlannerInfo *root,
if (tuples < 2.0)
tuples = 2.0;
- /*
- * CPU costs
- *
- * Assume about two operator evals per tuple comparison and N log2 N
- * comparisons
- */
- startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
+ /* Do we have a useful LIMIT? */
+ if (limit_tuples > 0 && limit_tuples < tuples)
+ {
+ output_tuples = limit_tuples;
+ output_bytes = relation_byte_size(output_tuples, width);
+ }
+ else
+ {
+ output_tuples = tuples;
+ output_bytes = input_bytes;
+ }
- /* disk costs */
- if (nbytes > work_mem_bytes)
+ if (output_bytes > work_mem_bytes)
{
- double npages = ceil(nbytes / BLCKSZ);
- double nruns = (nbytes / work_mem_bytes) * 0.5;
+ /*
+ * We'll have to use a disk-based sort of all the tuples
+ */
+ double npages = ceil(input_bytes / BLCKSZ);
+ double nruns = (input_bytes / work_mem_bytes) * 0.5;
double mergeorder = tuplesort_merge_order(work_mem_bytes);
double log_runs;
double npageaccesses;
+ /*
+ * CPU costs
+ *
+ * Assume about two operator evals per tuple comparison and N log2 N
+ * comparisons
+ */
+ startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
+
+ /* Disk costs */
+
/* Compute logM(r) as log(r) / log(M) */
if (nruns > mergeorder)
log_runs = ceil(log(nruns) / log(mergeorder));
@@ -986,10 +1010,27 @@ cost_sort(Path *path, PlannerInfo *root,
startup_cost += npageaccesses *
(seq_page_cost * 0.75 + random_page_cost * 0.25);
}
+ else if (tuples > 2 * output_tuples || input_bytes > work_mem_bytes)
+ {
+ /*
+ * We'll use a bounded heap-sort keeping just K tuples in memory,
+ * for a total number of tuple comparisons of N log2 K; but the
+ * constant factor is a bit higher than for quicksort. Tweak it
+ * so that the cost curve is continuous at the crossover point.
+ */
+ startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(2.0 * output_tuples);
+ }
+ else
+ {
+ /* We'll use plain quicksort on all the input tuples */
+ startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
+ }
/*
* Also charge a small amount (arbitrarily set equal to operator cost) per
- * extracted tuple.
+ * extracted tuple. Note it's correct to use tuples not output_tuples
+ * here --- the upper LIMIT will pro-rate the run cost so we'd be double
+ * counting the LIMIT otherwise.
*/
run_cost += cpu_operator_cost * tuples;
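[Note: the "continuous at the crossover point" remark above can be checked directly. Writing c = 2 * cpu_operator_cost, and using the document's own notation, the two in-memory branches charge

    heapsort:  c * N * log2(2*K)
    quicksort: c * N * log2(N)

and the heap branch is taken only when N > 2*K (or when the full input exceeds work_mem while the output fits). At the boundary N = 2*K we have log2(2*K) = log2(N), so both formulas charge the same c * N * log2(N) and the cost curve has no jump; the factor 2 inside the logarithm is exactly the advertised tweak.]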
@@ -1431,7 +1472,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
outersortkeys,
outer_path->total_cost,
outer_path_rows,
- outer_path->parent->width);
+ outer_path->parent->width,
+ -1.0);
startup_cost += sort_path.startup_cost;
run_cost += (sort_path.total_cost - sort_path.startup_cost)
* outerscansel;
@@ -1450,7 +1492,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root)
innersortkeys,
inner_path->total_cost,
inner_path_rows,
- inner_path->parent->width);
+ inner_path->parent->width,
+ -1.0);
startup_cost += sort_path.startup_cost;
run_cost += (sort_path.total_cost - sort_path.startup_cost)
* innerscansel * rescanratio;
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 74b89125665..be0162406bd 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -10,7 +10,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.229 2007/04/21 21:01:44 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.230 2007/05/04 01:13:44 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -122,7 +122,8 @@ static MergeJoin *make_mergejoin(List *tlist,
Plan *lefttree, Plan *righttree,
JoinType jointype);
static Sort *make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
- AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst);
+ AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst,
+ double limit_tuples);
/*
@@ -1579,7 +1580,8 @@ create_mergejoin_plan(PlannerInfo *root,
outer_plan = (Plan *)
make_sort_from_pathkeys(root,
outer_plan,
- best_path->outersortkeys);
+ best_path->outersortkeys,
+ -1.0);
outerpathkeys = best_path->outersortkeys;
}
else
@@ -1591,7 +1593,8 @@ create_mergejoin_plan(PlannerInfo *root,
inner_plan = (Plan *)
make_sort_from_pathkeys(root,
inner_plan,
- best_path->innersortkeys);
+ best_path->innersortkeys,
+ -1.0);
innerpathkeys = best_path->innersortkeys;
}
else
@@ -2589,11 +2592,13 @@ make_mergejoin(List *tlist,
* make_sort --- basic routine to build a Sort plan node
*
* Caller must have built the sortColIdx, sortOperators, and nullsFirst
- * arrays already.
+ * arrays already. limit_tuples is as for cost_sort (in particular, pass
+ * -1 if no limit)
*/
static Sort *
make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
- AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst)
+ AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst,
+ double limit_tuples)
{
Sort *node = makeNode(Sort);
Plan *plan = &node->plan;
@@ -2603,7 +2608,8 @@ make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
cost_sort(&sort_path, root, NIL,
lefttree->total_cost,
lefttree->plan_rows,
- lefttree->plan_width);
+ lefttree->plan_width,
+ limit_tuples);
plan->startup_cost = sort_path.startup_cost;
plan->total_cost = sort_path.total_cost;
plan->targetlist = lefttree->targetlist;
@@ -2664,6 +2670,8 @@ add_sort_column(AttrNumber colIdx, Oid sortOp, bool nulls_first,
*
* 'lefttree' is the node which yields input tuples
* 'pathkeys' is the list of pathkeys by which the result is to be sorted
+ * 'limit_tuples' is the bound on the number of output tuples;
+ * -1 if no bound
*
* We must convert the pathkey information into arrays of sort key column
* numbers and sort operator OIDs.
@@ -2675,7 +2683,8 @@ add_sort_column(AttrNumber colIdx, Oid sortOp, bool nulls_first,
* adding a Result node just to do the projection.
*/
Sort *
-make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys)
+make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys,
+ double limit_tuples)
{
List *tlist = lefttree->targetlist;
ListCell *i;
@@ -2810,7 +2819,7 @@ make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys)
Assert(numsortkeys > 0);
return make_sort(root, lefttree, numsortkeys,
- sortColIdx, sortOperators, nullsFirst);
+ sortColIdx, sortOperators, nullsFirst, limit_tuples);
}
/*
@@ -2859,7 +2868,7 @@ make_sort_from_sortclauses(PlannerInfo *root, List *sortcls, Plan *lefttree)
Assert(numsortkeys > 0);
return make_sort(root, lefttree, numsortkeys,
- sortColIdx, sortOperators, nullsFirst);
+ sortColIdx, sortOperators, nullsFirst, -1.0);
}
/*
@@ -2919,7 +2928,7 @@ make_sort_from_groupcols(PlannerInfo *root,
Assert(numsortkeys > 0);
return make_sort(root, lefttree, numsortkeys,
- sortColIdx, sortOperators, nullsFirst);
+ sortColIdx, sortOperators, nullsFirst, -1.0);
}
Material *
diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c
index f12e388d723..c31f7d5aa65 100644
--- a/src/backend/optimizer/plan/planmain.c
+++ b/src/backend/optimizer/plan/planmain.c
@@ -14,7 +14,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.100 2007/04/21 21:01:45 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.101 2007/05/04 01:13:44 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -47,6 +47,8 @@
* tlist is the target list the query should produce
* (this is NOT necessarily root->parse->targetList!)
* tuple_fraction is the fraction of tuples we expect will be retrieved
+ * limit_tuples is a hard limit on number of tuples to retrieve,
+ * or -1 if no limit
*
* Output parameters:
* *cheapest_path receives the overall-cheapest path for the query
@@ -74,9 +76,13 @@
* from the plan to be retrieved
* tuple_fraction >= 1: tuple_fraction is the absolute number of tuples
* expected to be retrieved (ie, a LIMIT specification)
+ * Note that a nonzero tuple_fraction could come from outer context; it is
+ * therefore not redundant with limit_tuples. We use limit_tuples to determine
+ * whether a bounded sort can be used at runtime.
*/
void
-query_planner(PlannerInfo *root, List *tlist, double tuple_fraction,
+query_planner(PlannerInfo *root, List *tlist,
+ double tuple_fraction, double limit_tuples,
Path **cheapest_path, Path **sorted_path,
double *num_groups)
{
@@ -354,7 +360,8 @@ query_planner(PlannerInfo *root, List *tlist, double tuple_fraction,
/* Figure cost for sorting */
cost_sort(&sort_path, root, root->query_pathkeys,
cheapestpath->total_cost,
- final_rel->rows, final_rel->width);
+ final_rel->rows, final_rel->width,
+ limit_tuples);
}
if (compare_fractional_path_costs(sortedpath, &sort_path,
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index d5dc6f7c7e6..366c50f2730 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.218 2007/04/27 22:05:47 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.219 2007/05/04 01:13:44 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -61,7 +61,8 @@ static double preprocess_limit(PlannerInfo *root,
double tuple_fraction,
int64 *offset_est, int64 *count_est);
static Oid *extract_grouping_ops(List *groupClause);
-static bool choose_hashed_grouping(PlannerInfo *root, double tuple_fraction,
+static bool choose_hashed_grouping(PlannerInfo *root,
+ double tuple_fraction, double limit_tuples,
Path *cheapest_path, Path *sorted_path,
Oid *groupOperators, double dNumGroups,
AggClauseCounts *agg_counts);
@@ -696,6 +697,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
List *tlist = parse->targetList;
int64 offset_est = 0;
int64 count_est = 0;
+ double limit_tuples = -1.0;
Plan *result_plan;
List *current_pathkeys;
List *sort_pathkeys;
@@ -703,8 +705,16 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
/* Tweak caller-supplied tuple_fraction if have LIMIT/OFFSET */
if (parse->limitCount || parse->limitOffset)
+ {
tuple_fraction = preprocess_limit(root, tuple_fraction,
&offset_est, &count_est);
+ /*
+ * If we have a known LIMIT, and don't have an unknown OFFSET,
+ * we can estimate the effects of using a bounded sort.
+ */
+ if (count_est > 0 && offset_est >= 0)
+ limit_tuples = (double) count_est + (double) offset_est;
+ }
if (parse->setOperations)
{
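[Note: a sketch of the limit_tuples derivation above, under my reading of preprocess_limit()'s conventions (an estimate of 0 meaning "clause not present" and a negative estimate meaning "present but not a constant"); the helper name is mine:

    #include <stdint.h>
    #include <stdio.h>

    static double limit_tuples_for(int64_t count_est, int64_t offset_est)
    {
        if (count_est > 0 && offset_est >= 0)
            return (double) count_est + (double) offset_est;
        return -1.0;                        /* no usable bound */
    }

    int main(void)
    {
        printf("%.0f\n", limit_tuples_for(10, 5));   /* 15: LIMIT 10 OFFSET 5 */
        printf("%.0f\n", limit_tuples_for(10, -1));  /* -1: OFFSET not a constant */
        printf("%.0f\n", limit_tuples_for(0, 5));    /* -1: no known LIMIT count */
        return 0;
    }
]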
@@ -850,7 +860,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
* estimate the number of groups in the query, and canonicalize all
* the pathkeys.
*/
- query_planner(root, sub_tlist, tuple_fraction,
+ query_planner(root, sub_tlist, tuple_fraction, limit_tuples,
&cheapest_path, &sorted_path, &dNumGroups);
group_pathkeys = root->group_pathkeys;
@@ -864,7 +874,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
{
groupOperators = extract_grouping_ops(parse->groupClause);
use_hashed_grouping =
- choose_hashed_grouping(root, tuple_fraction,
+ choose_hashed_grouping(root, tuple_fraction, limit_tuples,
cheapest_path, sorted_path,
groupOperators, dNumGroups,
&agg_counts);
@@ -1099,7 +1109,8 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
{
result_plan = (Plan *) make_sort_from_pathkeys(root,
result_plan,
- sort_pathkeys);
+ sort_pathkeys,
+ limit_tuples);
current_pathkeys = sort_pathkeys;
}
}
@@ -1414,7 +1425,8 @@ extract_grouping_ops(List *groupClause)
* choose_hashed_grouping - should we use hashed grouping?
*/
static bool
-choose_hashed_grouping(PlannerInfo *root, double tuple_fraction,
+choose_hashed_grouping(PlannerInfo *root,
+ double tuple_fraction, double limit_tuples,
Path *cheapest_path, Path *sorted_path,
Oid *groupOperators, double dNumGroups,
AggClauseCounts *agg_counts)
@@ -1499,7 +1511,7 @@ choose_hashed_grouping(PlannerInfo *root, double tuple_fraction,
/* Result of hashed agg is always unsorted */
if (root->sort_pathkeys)
cost_sort(&hashed_p, root, root->sort_pathkeys, hashed_p.total_cost,
- dNumGroups, cheapest_path_width);
+ dNumGroups, cheapest_path_width, limit_tuples);
if (sorted_path)
{
@@ -1516,7 +1528,7 @@ choose_hashed_grouping(PlannerInfo *root, double tuple_fraction,
if (!pathkeys_contained_in(root->group_pathkeys, current_pathkeys))
{
cost_sort(&sorted_p, root, root->group_pathkeys, sorted_p.total_cost,
- cheapest_path_rows, cheapest_path_width);
+ cheapest_path_rows, cheapest_path_width, -1.0);
current_pathkeys = root->group_pathkeys;
}
@@ -1533,7 +1545,7 @@ choose_hashed_grouping(PlannerInfo *root, double tuple_fraction,
if (root->sort_pathkeys &&
!pathkeys_contained_in(root->sort_pathkeys, current_pathkeys))
cost_sort(&sorted_p, root, root->sort_pathkeys, sorted_p.total_cost,
- dNumGroups, cheapest_path_width);
+ dNumGroups, cheapest_path_width, limit_tuples);
/*
* Now make the decision using the top-level tuple fraction. First we
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 6d440001d6b..bd95a0e0e23 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.139 2007/04/21 21:01:45 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.140 2007/05/04 01:13:44 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -842,7 +842,8 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
cost_sort(&sort_path, root, NIL,
subpath->total_cost,
rel->rows,
- rel->width);
+ rel->width,
+ -1.0);
/*
* Charge one cpu_operator_cost per comparison per input tuple. We assume
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index a3a8f79a5d1..9f9d1e1bcbf 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -10,7 +10,7 @@
* Written by Peter Eisentraut <peter_e@gmx.net>.
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.389 2007/04/26 16:13:12 neilc Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.390 2007/05/04 01:13:44 tgl Exp $
*
*--------------------------------------------------------------------
*/
@@ -108,6 +108,9 @@ extern bool fullPageWrites;
#ifdef TRACE_SORT
extern bool trace_sort;
#endif
+#ifdef DEBUG_BOUNDED_SORT
+extern bool optimize_bounded_sort;
+#endif
#ifdef USE_SSL
extern char *SSLCipherSuites;
@@ -966,6 +969,20 @@ static struct config_bool ConfigureNamesBool[] =
},
#endif
+#ifdef DEBUG_BOUNDED_SORT
+ /* this is undocumented because not exposed in a standard build */
+ {
+ {
+ "optimize_bounded_sort", PGC_USERSET, QUERY_TUNING_METHOD,
+ gettext_noop("Enable bounded sorting using heap sort."),
+ NULL,
+ GUC_NOT_IN_SAMPLE
+ },
+ &optimize_bounded_sort,
+ true, NULL, NULL
+ },
+#endif
+
#ifdef WAL_DEBUG
{
{"wal_debug", PGC_SUSET, DEVELOPER_OPTIONS,
@@ -1711,7 +1728,7 @@ static struct config_int ConfigureNamesInt[] =
&server_version_num,
PG_VERSION_NUM, PG_VERSION_NUM, PG_VERSION_NUM, NULL, NULL
},
-
+
{
{"log_temp_files", PGC_USERSET, LOGGING_WHAT,
gettext_noop("Log the use of temporary files larger than this number of kilobytes."),
@@ -2883,7 +2900,7 @@ InitializeGUCOptions(void)
PGC_S_DEFAULT))
elog(FATAL, "failed to initialize %s to %d",
conf->gen.name, conf->boot_val);
- *conf->variable = conf->reset_val = conf->boot_val;
+ *conf->variable = conf->reset_val = conf->boot_val;
break;
}
case PGC_REAL:
@@ -2897,7 +2914,7 @@ InitializeGUCOptions(void)
PGC_S_DEFAULT))
elog(FATAL, "failed to initialize %s to %g",
conf->gen.name, conf->boot_val);
- *conf->variable = conf->reset_val = conf->boot_val;
+ *conf->variable = conf->reset_val = conf->boot_val;
break;
}
case PGC_STRING:
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 76c59a42382..e4fc74b3e93 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -91,13 +91,15 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.74 2007/01/10 18:06:04 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.75 2007/05/04 01:13:44 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
+#include <limits.h>
+
#include "access/heapam.h"
#include "access/nbtree.h"
#include "catalog/pg_amop.h"
@@ -112,10 +114,13 @@
#include "utils/tuplesort.h"
-/* GUC variable */
+/* GUC variables */
#ifdef TRACE_SORT
bool trace_sort = false;
#endif
+#ifdef DEBUG_BOUNDED_SORT
+bool optimize_bounded_sort = true;
+#endif
/*
@@ -159,6 +164,7 @@ typedef struct
typedef enum
{
TSS_INITIAL, /* Loading tuples; still within memory limit */
+ TSS_BOUNDED, /* Loading tuples into bounded-size heap */
TSS_BUILDRUNS, /* Loading tuples; writing to tape */
TSS_SORTEDINMEM, /* Sort completed entirely in memory */
TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */
@@ -188,6 +194,9 @@ struct Tuplesortstate
TupSortStatus status; /* enumerated value as shown above */
int nKeys; /* number of columns in sort key */
bool randomAccess; /* did caller request random access? */
+ bool bounded; /* did caller specify a maximum number of
+ * tuples to return? */
+ int bound; /* if bounded, the maximum number of tuples */
long availMem; /* remaining memory available, in bytes */
long allowedMem; /* total memory allowed, in bytes */
int maxTapes; /* number of tapes (Knuth's T) */
@@ -235,6 +244,13 @@ struct Tuplesortstate
int tapenum, unsigned int len);
/*
+ * Function to reverse the sort direction from its current state.
+ * (We could dispense with this if we wanted to enforce that all variants
+ * represent the sort key information alike.)
+ */
+ void (*reversedirection) (Tuplesortstate *state);
+
+ /*
* This array holds the tuples now in sort memory. If we are in state
* INITIAL, the tuples are in no particular order; if we are in state
* SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS
@@ -347,6 +363,7 @@ struct Tuplesortstate
#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
#define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup))
#define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len))
+#define REVERSEDIRECTION(state) ((*(state)->reversedirection) (state))
#define LACKMEM(state) ((state)->availMem < 0)
#define USEMEM(state,amt) ((state)->availMem -= (amt))
#define FREEMEM(state,amt) ((state)->availMem += (amt))
@@ -403,6 +420,8 @@ static void beginmerge(Tuplesortstate *state);
static void mergepreread(Tuplesortstate *state);
static void mergeprereadone(Tuplesortstate *state, int srcTape);
static void dumptuples(Tuplesortstate *state, bool alltuples);
+static void make_bounded_heap(Tuplesortstate *state);
+static void sort_bounded_heap(Tuplesortstate *state);
static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
int tupleindex, bool checkIndex);
static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex);
@@ -415,6 +434,7 @@ static void writetup_heap(Tuplesortstate *state, int tapenum,
SortTuple *stup);
static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
+static void reversedirection_heap(Tuplesortstate *state);
static int comparetup_index(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
@@ -422,6 +442,7 @@ static void writetup_index(Tuplesortstate *state, int tapenum,
SortTuple *stup);
static void readtup_index(Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
+static void reversedirection_index(Tuplesortstate *state);
static int comparetup_datum(const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup);
@@ -429,6 +450,8 @@ static void writetup_datum(Tuplesortstate *state, int tapenum,
SortTuple *stup);
static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len);
+static void reversedirection_datum(Tuplesortstate *state);
+static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
/*
@@ -538,6 +561,7 @@ tuplesort_begin_heap(TupleDesc tupDesc,
state->copytup = copytup_heap;
state->writetup = writetup_heap;
state->readtup = readtup_heap;
+ state->reversedirection = reversedirection_heap;
state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
state->scanKeys = (ScanKey) palloc0(nkeys * sizeof(ScanKeyData));
@@ -601,6 +625,7 @@ tuplesort_begin_index(Relation indexRel,
state->copytup = copytup_index;
state->writetup = writetup_index;
state->readtup = readtup_index;
+ state->reversedirection = reversedirection_index;
state->indexRel = indexRel;
/* see comments below about btree dependence of this code... */
@@ -639,6 +664,7 @@ tuplesort_begin_datum(Oid datumType,
state->copytup = copytup_datum;
state->writetup = writetup_datum;
state->readtup = readtup_datum;
+ state->reversedirection = reversedirection_datum;
state->datumType = datumType;
@@ -665,6 +691,40 @@ tuplesort_begin_datum(Oid datumType,
}
/*
+ * tuplesort_set_bound
+ *
+ * Advise tuplesort that at most the first N result tuples are required.
+ *
+ * Must be called before inserting any tuples. (Actually, we could allow it
+ * as long as the sort hasn't spilled to disk, but there seems no need for
+ * delayed calls at the moment.)
+ *
+ * This is a hint only. The tuplesort may still return more tuples than
+ * requested.
+ */
+void
+tuplesort_set_bound(Tuplesortstate *state, int64 bound)
+{
+ /* Assert we're called before loading any tuples */
+ Assert(state->status == TSS_INITIAL);
+ Assert(state->memtupcount == 0);
+ Assert(!state->bounded);
+
+#ifdef DEBUG_BOUNDED_SORT
+ /* Honor GUC setting that disables the feature (for easy testing) */
+ if (!optimize_bounded_sort)
+ return;
+#endif
+
+ /* We want to be able to compute bound * 2, so limit the setting */
+ if (bound > (int64) (INT_MAX / 2))
+ return;
+
+ state->bounded = true;
+ state->bound = (int) bound;
+}
+
+/*
* tuplesort_end
*
* Release resources and clean up.
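[Note: the INT_MAX/2 cap above exists because puttuple_common() later computes state->bound * 2 in plain int arithmetic (see the switch-over heuristic below); capping the bound keeps that product representable. A minimal check:

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
        int bound = INT_MAX / 2;    /* the largest bound the patch accepts */

        /* INT_MAX is odd, so bound * 2 = INT_MAX - 1: no overflow */
        printf("INT_MAX = %d, bound * 2 = %d\n", INT_MAX, bound * 2);
        return 0;
    }
]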
@@ -863,6 +923,32 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
state->memtuples[state->memtupcount++] = *tuple;
/*
+ * Check if it's time to switch over to a bounded heapsort.
+ * We do so if the input tuple count exceeds twice the desired
+ * tuple count (this is a heuristic for where heapsort becomes
+ * cheaper than a quicksort), or if we've just filled workMem
+ * and have enough tuples to meet the bound.
+ *
+ * Note that once we enter TSS_BOUNDED state we will always try
+ * to complete the sort that way. In the worst case, if later
+ * input tuples are larger than earlier ones, this might cause
+ * us to exceed workMem significantly.
+ */
+ if (state->bounded &&
+ (state->memtupcount > state->bound * 2 ||
+ (state->memtupcount > state->bound && LACKMEM(state))))
+ {
+#ifdef TRACE_SORT
+ if (trace_sort)
+ elog(LOG, "switching to bounded heapsort at %d tuples: %s",
+ state->memtupcount,
+ pg_rusage_show(&state->ru_start));
+#endif
+ make_bounded_heap(state);
+ return;
+ }
+
+ /*
* Done if we still fit in available memory and have array slots.
*/
if (state->memtupcount < state->memtupsize && !LACKMEM(state))
@@ -878,6 +964,31 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
*/
dumptuples(state, false);
break;
+
+ case TSS_BOUNDED:
+ /*
+ * We don't want to grow the array here, so check whether the
+ * new tuple can be discarded before putting it in. This should
+ * be a good speed optimization, too, since when there are many
+ * more input tuples than the bound, most input tuples can be
+ * discarded with just this one comparison. Note that because
+ * we currently have the sort direction reversed, we must check
+ * for <= not >=.
+ */
+ if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0)
+ {
+ /* new tuple <= top of the heap, so we can discard it */
+ free_sort_tuple(state, tuple);
+ }
+ else
+ {
+ /* discard top of heap, sift up, insert new tuple */
+ free_sort_tuple(state, &state->memtuples[0]);
+ tuplesort_heap_siftup(state, false);
+ tuplesort_heap_insert(state, tuple, 0, false);
+ }
+ break;
+
case TSS_BUILDRUNS:
/*
@@ -904,6 +1015,7 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
*/
dumptuples(state, false);
break;
+
default:
elog(ERROR, "invalid tuplesort state");
break;
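[Note: the TSS_BOUNDED branch above is the heart of the technique: with the comparison direction reversed, the heap root is the largest tuple still in the running, so most input can be rejected after a single comparison. A freestanding sketch of the same Top-K scheme on plain ints (my own minimal heap code, not the SortTuple machinery; K and the input are arbitrary):

    #include <stdio.h>

    #define K 3                     /* the bound: keep the 3 smallest values */

    static int heap[K];
    static int nheap = 0;

    /* Restore the max-heap property from the root down */
    static void sift_down(void)
    {
        int i = 0;

        for (;;)
        {
            int child = 2 * i + 1;

            if (child >= nheap)
                break;
            if (child + 1 < nheap && heap[child + 1] > heap[child])
                child++;                    /* pick the larger child */
            if (heap[i] >= heap[child])
                break;
            int tmp = heap[i];
            heap[i] = heap[child];
            heap[child] = tmp;
            i = child;
        }
    }

    /* Bubble a freshly appended entry up toward the root */
    static void sift_up(int i)
    {
        while (i > 0 && heap[(i - 1) / 2] < heap[i])
        {
            int parent = (i - 1) / 2;
            int tmp = heap[parent];
            heap[parent] = heap[i];
            heap[i] = tmp;
            i = parent;
        }
    }

    static void put_value(int v)
    {
        if (nheap < K)
        {
            heap[nheap] = v;                /* heap not full yet: insert */
            sift_up(nheap++);
        }
        else if (v < heap[0])
        {
            heap[0] = v;                    /* beats the largest survivor: */
            sift_down();                    /* replace the root */
        }
        /* else: discarded after one comparison against the root */
    }

    int main(void)
    {
        int input[] = {42, 7, 19, 3, 88, 5, 64, 11};
        int i;

        for (i = 0; i < (int) (sizeof(input) / sizeof(input[0])); i++)
            put_value(input[i]);

        /* unheapify in place, as in sort_bounded_heap() below: repeatedly
         * swap the root (current maximum) into the slot freed at the end */
        for (i = K - 1; i > 0; i--)
        {
            int tmp = heap[0];
            heap[0] = heap[i];
            heap[i] = tmp;
            nheap = i;
            sift_down();
        }

        printf("%d %d %d\n", heap[0], heap[1], heap[2]);   /* prints: 3 5 7 */
        return 0;
    }

The final loop mirrors sort_bounded_heap(): moving the maximum into the slot freed at the shrinking end sorts the K survivors ascending, in place.]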
@@ -944,6 +1056,23 @@ tuplesort_performsort(Tuplesortstate *state)
state->markpos_eof = false;
state->status = TSS_SORTEDINMEM;
break;
+
+ case TSS_BOUNDED:
+
+ /*
+ * We were able to accumulate all the tuples required for output
+ * in memory, using a heap to eliminate excess tuples. Now we have
+ * to transform the heap to a properly-sorted array.
+ */
+ if (state->memtupcount > 1)
+ sort_bounded_heap(state);
+ state->current = 0;
+ state->eof_reached = false;
+ state->markpos_offset = 0;
+ state->markpos_eof = false;
+ state->status = TSS_SORTEDINMEM;
+ break;
+
case TSS_BUILDRUNS:
/*
@@ -959,6 +1088,7 @@ tuplesort_performsort(Tuplesortstate *state)
state->markpos_offset = 0;
state->markpos_eof = false;
break;
+
default:
elog(ERROR, "invalid tuplesort state");
break;
@@ -1004,6 +1134,15 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
return true;
}
state->eof_reached = true;
+
+ /*
+ * Complain if caller tries to retrieve more tuples than
+ * originally asked for in a bounded sort. This is because
+ * returning EOF here might be the wrong thing.
+ */
+ if (state->bounded && state->current >= state->bound)
+ elog(ERROR, "retrieved too many tuples in a bounded sort");
+
return false;
}
else
@@ -1988,6 +2127,98 @@ tuplesort_restorepos(Tuplesortstate *state)
COMPARETUP(state, tup1, tup2))
/*
+ * Convert the existing unordered array of SortTuples to a bounded heap,
+ * discarding all but the smallest "state->bound" tuples.
+ *
+ * When working with a bounded heap, we want to keep the largest entry
+ * at the root (array entry zero), instead of the smallest as in the normal
+ * sort case. This allows us to discard the largest entry cheaply.
+ * Therefore, we temporarily reverse the sort direction.
+ *
+ * We assume that all entries in a bounded heap will always have tupindex
+ * zero; it therefore doesn't matter that HEAPCOMPARE() doesn't reverse
+ * the direction of comparison for tupindexes.
+ */
+static void
+make_bounded_heap(Tuplesortstate *state)
+{
+ int tupcount = state->memtupcount;
+ int i;
+
+ Assert(state->status == TSS_INITIAL);
+ Assert(state->bounded);
+ Assert(tupcount >= state->bound);
+
+ /* Reverse sort direction so largest entry will be at root */
+ REVERSEDIRECTION(state);
+
+ state->memtupcount = 0; /* make the heap empty */
+ for (i = 0; i < tupcount; i++)
+ {
+ if (state->memtupcount >= state->bound &&
+ COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0)
+ {
+ /* New tuple would just get thrown out, so skip it */
+ free_sort_tuple(state, &state->memtuples[i]);
+ }
+ else
+ {
+ /* Insert next tuple into heap */
+ /* Must copy source tuple to avoid possible overwrite */
+ SortTuple stup = state->memtuples[i];
+
+ tuplesort_heap_insert(state, &stup, 0, false);
+
+ /* If heap too full, discard largest entry */
+ if (state->memtupcount > state->bound)
+ {
+ free_sort_tuple(state, &state->memtuples[0]);
+ tuplesort_heap_siftup(state, false);
+ }
+ }
+ }
+
+ Assert(state->memtupcount == state->bound);
+ state->status = TSS_BOUNDED;
+}
+
+/*
+ * Convert the bounded heap to a properly-sorted array
+ */
+static void
+sort_bounded_heap(Tuplesortstate *state)
+{
+ int tupcount = state->memtupcount;
+
+ Assert(state->status == TSS_BOUNDED);
+ Assert(state->bounded);
+ Assert(tupcount == state->bound);
+
+ /*
+ * We can unheapify in place because each sift-up will remove the largest
+ * entry, which we can promptly store in the newly freed slot at the end.
+ * Once we're down to a single-entry heap, we're done.
+ */
+ while (state->memtupcount > 1)
+ {
+ SortTuple stup = state->memtuples[0];
+
+ /* this sifts-up the next-largest entry and decreases memtupcount */
+ tuplesort_heap_siftup(state, false);
+ state->memtuples[state->memtupcount] = stup;
+ }
+ state->memtupcount = tupcount;
+
+ /*
+ * Reverse sort direction back to the original state. This is not
+ * actually necessary but seems like a good idea for tidiness.
+ */
+ REVERSEDIRECTION(state);
+
+ state->status = TSS_SORTEDINMEM;
+}
+
+/*
* Insert a new tuple into an empty or existing heap, maintaining the
* heap invariant. Caller is responsible for ensuring there's room.
*
@@ -2322,6 +2553,18 @@ readtup_heap(Tuplesortstate *state, SortTuple *stup,
&stup->isnull1);
}
+static void
+reversedirection_heap(Tuplesortstate *state)
+{
+ ScanKey scanKey = state->scanKeys;
+ int nkey;
+
+ for (nkey = 0; nkey < state->nKeys; nkey++, scanKey++)
+ {
+ scanKey->sk_flags ^= (SK_BT_DESC | SK_BT_NULLS_FIRST);
+ }
+}
+
/*
* Routines specialized for IndexTuple case
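[Note: reversedirection_heap() above flips both the DESC and NULLS FIRST bits of every sort key with one XOR. Because XOR is an involution, applying it twice restores the original direction, which is why sort_bounded_heap() can simply call REVERSEDIRECTION() a second time when it is done. A toy demonstration with stand-in flag bits (not the real SK_BT_* values):

    #include <stdio.h>

    #define DESC        0x01    /* stand-in for SK_BT_DESC */
    #define NULLS_FIRST 0x02    /* stand-in for SK_BT_NULLS_FIRST */

    int main(void)
    {
        unsigned flags = DESC;              /* one key sorted descending */

        flags ^= (DESC | NULLS_FIRST);      /* reverse the direction */
        printf("reversed: desc=%d nulls_first=%d\n",
               (flags & DESC) != 0, (flags & NULLS_FIRST) != 0);

        flags ^= (DESC | NULLS_FIRST);      /* reverse again: back to start */
        printf("restored: desc=%d nulls_first=%d\n",
               (flags & DESC) != 0, (flags & NULLS_FIRST) != 0);
        return 0;
    }
]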
@@ -2497,6 +2740,18 @@ readtup_index(Tuplesortstate *state, SortTuple *stup,
&stup->isnull1);
}
+static void
+reversedirection_index(Tuplesortstate *state)
+{
+ ScanKey scanKey = state->indexScanKey;
+ int nkey;
+
+ for (nkey = 0; nkey < state->nKeys; nkey++, scanKey++)
+ {
+ scanKey->sk_flags ^= (SK_BT_DESC | SK_BT_NULLS_FIRST);
+ }
+}
+
/*
* Routines specialized for DatumTuple case
@@ -2601,3 +2856,19 @@ readtup_datum(Tuplesortstate *state, SortTuple *stup,
sizeof(tuplen)) != sizeof(tuplen))
elog(ERROR, "unexpected end of data");
}
+
+static void
+reversedirection_datum(Tuplesortstate *state)
+{
+ state->sortFnFlags ^= (SK_BT_DESC | SK_BT_NULLS_FIRST);
+}
+
+/*
+ * Convenience routine to free a tuple previously loaded into sort memory
+ */
+static void
+free_sort_tuple(Tuplesortstate *state, SortTuple *stup)
+{
+ FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
+ pfree(stup->tuple);
+}
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 726ee5bdae6..12219b883eb 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.172 2007/04/26 23:24:44 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.173 2007/05/04 01:13:45 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1294,7 +1294,11 @@ typedef struct SortState
{
ScanState ss; /* its first field is NodeTag */
bool randomAccess; /* need random access to sort output? */
+ bool bounded; /* is the result set bounded? */
+ int64 bound; /* if bounded, how many tuples are needed */
bool sort_Done; /* sort completed yet? */
+ bool bounded_Done; /* value of bounded we did the sort with */
+ int64 bound_Done; /* value of bound we did the sort with */
void *tuplesortstate; /* private state of tuplesort.c */
} SortState;
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index ef92d685eb4..641555b7068 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/optimizer/cost.h,v 1.85 2007/02/22 22:00:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/optimizer/cost.h,v 1.86 2007/05/04 01:13:45 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -73,7 +73,8 @@ extern void cost_functionscan(Path *path, PlannerInfo *root,
extern void cost_valuesscan(Path *path, PlannerInfo *root,
RelOptInfo *baserel);
extern void cost_sort(Path *path, PlannerInfo *root,
- List *pathkeys, Cost input_cost, double tuples, int width);
+ List *pathkeys, Cost input_cost, double tuples, int width,
+ double limit_tuples);
extern void cost_material(Path *path,
Cost input_cost, double tuples, int width);
extern void cost_agg(Path *path, PlannerInfo *root,
diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h
index b5ad4b3c3b5..319c38c66b5 100644
--- a/src/include/optimizer/planmain.h
+++ b/src/include/optimizer/planmain.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/optimizer/planmain.h,v 1.100 2007/02/22 22:00:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/optimizer/planmain.h,v 1.101 2007/05/04 01:13:45 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -21,7 +21,7 @@
* prototypes for plan/planmain.c
*/
extern void query_planner(PlannerInfo *root, List *tlist,
- double tuple_fraction,
+ double tuple_fraction, double limit_tuples,
Path **cheapest_path, Path **sorted_path,
double *num_groups);
@@ -39,7 +39,7 @@ extern SubqueryScan *make_subqueryscan(List *qptlist, List *qpqual,
Index scanrelid, Plan *subplan, List *subrtable);
extern Append *make_append(List *appendplans, bool isTarget, List *tlist);
extern Sort *make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree,
- List *pathkeys);
+ List *pathkeys, double limit_tuples);
extern Sort *make_sort_from_sortclauses(PlannerInfo *root, List *sortcls,
Plan *lefttree);
extern Sort *make_sort_from_groupcols(PlannerInfo *root, List *groupcls,
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index cea50b4836b..c736896a9a9 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -13,7 +13,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/utils/tuplesort.h,v 1.25 2007/01/09 02:14:16 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/tuplesort.h,v 1.26 2007/05/04 01:13:45 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -55,6 +55,8 @@ extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
Oid sortOperator, bool nullsFirstFlag,
int workMem, bool randomAccess);
+extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
+
extern void tuplesort_puttupleslot(Tuplesortstate *state,
TupleTableSlot *slot);
extern void tuplesort_putindextuple(Tuplesortstate *state, IndexTuple tuple);