aboutsummaryrefslogtreecommitdiff
path: root/src/backend/commands/explain.c
diff options
context:
space:
mode:
authorTomas Vondra <tomas.vondra@postgresql.org>2020-04-06 21:33:28 +0200
committerTomas Vondra <tomas.vondra@postgresql.org>2020-04-06 21:35:10 +0200
commitd2d8a229bc58a2014dce1c7a4fcdb6c5ab9fb8da (patch)
tree66e2560c49ee43d13c29bd9f5760731001312738 /src/backend/commands/explain.c
parent3c8553547b1493c4afdb80393f4a47dbfa019a79 (diff)
downloadpostgresql-d2d8a229bc58a2014dce1c7a4fcdb6c5ab9fb8da.tar.gz
postgresql-d2d8a229bc58a2014dce1c7a4fcdb6c5ab9fb8da.zip
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when the input is already sorted by a prefix of the requested sort keys. For example when the relation is already sorted by (key1, key2) and we need to sort it by (key1, key2, key3) we can simply split the input rows into groups having equal values in (key1, key2), and only sort/compare the remaining column key3. This has a number of benefits: - Reduced memory consumption, because only a single group (determined by values in the sorted prefix) needs to be kept in memory. This may also eliminate the need to spill to disk. - Lower startup cost, because Incremental Sort produce results after each prefix group, which is beneficial for plans where startup cost matters (like for example queries with LIMIT clause). We consider both Sort and Incremental Sort, and decide based on costing. The implemented algorithm operates in two different modes: - Fetching a minimum number of tuples without check of equality on the prefix keys, and sorting on all columns when safe. - Fetching all tuples for a single prefix group and then sorting by comparing only the remaining (non-prefix) keys. We always start in the first mode, and employ a heuristic to switch into the second mode if we believe it's beneficial - the goal is to minimize the number of unnecessary comparions while keeping memory consumption below work_mem. This is a very old patch series. The idea was originally proposed by Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the patch was taken over by James Coleman, who wrote and rewrote most of the current code. There were many reviewers/contributors since 2013 - I've done my best to pick the most active ones, and listed them in this commit message. Author: James Coleman, Alexander Korotkov Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
Diffstat (limited to 'src/backend/commands/explain.c')
-rw-r--r--src/backend/commands/explain.c239
1 files changed, 232 insertions, 7 deletions
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index bb58f928518..62c86ecdc59 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -82,6 +82,8 @@ static void show_upper_qual(List *qual, const char *qlabel,
ExplainState *es);
static void show_sort_keys(SortState *sortstate, List *ancestors,
ExplainState *es);
+static void show_incremental_sort_keys(IncrementalSortState *incrsortstate,
+ List *ancestors, ExplainState *es);
static void show_merge_append_keys(MergeAppendState *mstate, List *ancestors,
ExplainState *es);
static void show_agg_keys(AggState *astate, List *ancestors,
@@ -95,7 +97,7 @@ static void show_grouping_set_keys(PlanState *planstate,
static void show_group_keys(GroupState *gstate, List *ancestors,
ExplainState *es);
static void show_sort_group_keys(PlanState *planstate, const char *qlabel,
- int nkeys, AttrNumber *keycols,
+ int nkeys, int nPresortedKeys, AttrNumber *keycols,
Oid *sortOperators, Oid *collations, bool *nullsFirst,
List *ancestors, ExplainState *es);
static void show_sortorder_options(StringInfo buf, Node *sortexpr,
@@ -103,6 +105,8 @@ static void show_sortorder_options(StringInfo buf, Node *sortexpr,
static void show_tablesample(TableSampleClause *tsc, PlanState *planstate,
List *ancestors, ExplainState *es);
static void show_sort_info(SortState *sortstate, ExplainState *es);
+static void show_incremental_sort_info(IncrementalSortState *incrsortstate,
+ ExplainState *es);
static void show_hash_info(HashState *hashstate, ExplainState *es);
static void show_hashagg_info(AggState *hashstate, ExplainState *es);
static void show_tidbitmap_info(BitmapHeapScanState *planstate,
@@ -1278,6 +1282,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
case T_Sort:
pname = sname = "Sort";
break;
+ case T_IncrementalSort:
+ pname = sname = "Incremental Sort";
+ break;
case T_Group:
pname = sname = "Group";
break;
@@ -1937,6 +1944,12 @@ ExplainNode(PlanState *planstate, List *ancestors,
show_sort_keys(castNode(SortState, planstate), ancestors, es);
show_sort_info(castNode(SortState, planstate), es);
break;
+ case T_IncrementalSort:
+ show_incremental_sort_keys(castNode(IncrementalSortState, planstate),
+ ancestors, es);
+ show_incremental_sort_info(castNode(IncrementalSortState, planstate),
+ es);
+ break;
case T_MergeAppend:
show_merge_append_keys(castNode(MergeAppendState, planstate),
ancestors, es);
@@ -2270,13 +2283,30 @@ show_sort_keys(SortState *sortstate, List *ancestors, ExplainState *es)
Sort *plan = (Sort *) sortstate->ss.ps.plan;
show_sort_group_keys((PlanState *) sortstate, "Sort Key",
- plan->numCols, plan->sortColIdx,
+ plan->numCols, 0, plan->sortColIdx,
plan->sortOperators, plan->collations,
plan->nullsFirst,
ancestors, es);
}
/*
+ * Show the sort keys for a IncrementalSort node.
+ */
+static void
+show_incremental_sort_keys(IncrementalSortState *incrsortstate,
+ List *ancestors, ExplainState *es)
+{
+ IncrementalSort *plan = (IncrementalSort *) incrsortstate->ss.ps.plan;
+
+ show_sort_group_keys((PlanState *) incrsortstate, "Sort Key",
+ plan->sort.numCols, plan->nPresortedCols,
+ plan->sort.sortColIdx,
+ plan->sort.sortOperators, plan->sort.collations,
+ plan->sort.nullsFirst,
+ ancestors, es);
+}
+
+/*
* Likewise, for a MergeAppend node.
*/
static void
@@ -2286,7 +2316,7 @@ show_merge_append_keys(MergeAppendState *mstate, List *ancestors,
MergeAppend *plan = (MergeAppend *) mstate->ps.plan;
show_sort_group_keys((PlanState *) mstate, "Sort Key",
- plan->numCols, plan->sortColIdx,
+ plan->numCols, 0, plan->sortColIdx,
plan->sortOperators, plan->collations,
plan->nullsFirst,
ancestors, es);
@@ -2310,7 +2340,7 @@ show_agg_keys(AggState *astate, List *ancestors,
show_grouping_sets(outerPlanState(astate), plan, ancestors, es);
else
show_sort_group_keys(outerPlanState(astate), "Group Key",
- plan->numCols, plan->grpColIdx,
+ plan->numCols, 0, plan->grpColIdx,
NULL, NULL, NULL,
ancestors, es);
@@ -2379,7 +2409,7 @@ show_grouping_set_keys(PlanState *planstate,
if (sortnode)
{
show_sort_group_keys(planstate, "Sort Key",
- sortnode->numCols, sortnode->sortColIdx,
+ sortnode->numCols, 0, sortnode->sortColIdx,
sortnode->sortOperators, sortnode->collations,
sortnode->nullsFirst,
ancestors, es);
@@ -2436,7 +2466,7 @@ show_group_keys(GroupState *gstate, List *ancestors,
/* The key columns refer to the tlist of the child plan */
ancestors = lcons(plan, ancestors);
show_sort_group_keys(outerPlanState(gstate), "Group Key",
- plan->numCols, plan->grpColIdx,
+ plan->numCols, 0, plan->grpColIdx,
NULL, NULL, NULL,
ancestors, es);
ancestors = list_delete_first(ancestors);
@@ -2449,13 +2479,14 @@ show_group_keys(GroupState *gstate, List *ancestors,
*/
static void
show_sort_group_keys(PlanState *planstate, const char *qlabel,
- int nkeys, AttrNumber *keycols,
+ int nkeys, int nPresortedKeys, AttrNumber *keycols,
Oid *sortOperators, Oid *collations, bool *nullsFirst,
List *ancestors, ExplainState *es)
{
Plan *plan = planstate->plan;
List *context;
List *result = NIL;
+ List *resultPresorted = NIL;
StringInfoData sortkeybuf;
bool useprefix;
int keyno;
@@ -2495,9 +2526,13 @@ show_sort_group_keys(PlanState *planstate, const char *qlabel,
nullsFirst[keyno]);
/* Emit one property-list item per sort key */
result = lappend(result, pstrdup(sortkeybuf.data));
+ if (keyno < nPresortedKeys)
+ resultPresorted = lappend(resultPresorted, exprstr);
}
ExplainPropertyList(qlabel, result, es);
+ if (nPresortedKeys > 0)
+ ExplainPropertyList("Presorted Key", resultPresorted, es);
}
/*
@@ -2712,6 +2747,196 @@ show_sort_info(SortState *sortstate, ExplainState *es)
}
/*
+ * Incremental sort nodes sort in (a potentially very large number of) batches,
+ * so EXPLAIN ANALYZE needs to roll up the tuplesort stats from each batch into
+ * an intelligible summary.
+ *
+ * This function is used for both a non-parallel node and each worker in a
+ * parallel incremental sort node.
+ */
+static void
+show_incremental_sort_group_info(IncrementalSortGroupInfo *groupInfo,
+ const char *groupLabel, bool indent, ExplainState *es)
+{
+ ListCell *methodCell;
+ List *methodNames = NIL;
+
+ /* Generate a list of sort methods used across all groups. */
+ for (int bit = 0; bit < sizeof(bits32); ++bit)
+ {
+ if (groupInfo->sortMethods & (1 << bit))
+ {
+ TuplesortMethod sortMethod = (1 << bit);
+ const char *methodName;
+
+ methodName = tuplesort_method_name(sortMethod);
+ methodNames = lappend(methodNames, unconstify(char *, methodName));
+ }
+ }
+
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ {
+ if (indent)
+ appendStringInfoSpaces(es->str, es->indent * 2);
+ appendStringInfo(es->str, "%s Groups: %ld Sort Method", groupLabel,
+ groupInfo->groupCount);
+ /* plural/singular based on methodNames size */
+ if (list_length(methodNames) > 1)
+ appendStringInfo(es->str, "s: ");
+ else
+ appendStringInfo(es->str, ": ");
+ foreach(methodCell, methodNames)
+ {
+ appendStringInfo(es->str, "%s", (char *) methodCell->ptr_value);
+ if (foreach_current_index(methodCell) < list_length(methodNames) - 1)
+ appendStringInfo(es->str, ", ");
+ }
+
+ if (groupInfo->maxMemorySpaceUsed > 0)
+ {
+ long avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount;
+ const char *spaceTypeName;
+
+ spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_MEMORY);
+ appendStringInfo(es->str, " %s: avg=%ldkB peak=%ldkB",
+ spaceTypeName, avgSpace,
+ groupInfo->maxMemorySpaceUsed);
+ }
+
+ if (groupInfo->maxDiskSpaceUsed > 0)
+ {
+ long avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount;
+
+ const char *spaceTypeName;
+
+ spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_DISK);
+ /* Add a semicolon separator only if memory stats were printed. */
+ if (groupInfo->maxMemorySpaceUsed > 0)
+ appendStringInfo(es->str, ";");
+ appendStringInfo(es->str, " %s: avg=%ldkB peak=%ldkB",
+ spaceTypeName, avgSpace,
+ groupInfo->maxDiskSpaceUsed);
+ }
+ }
+ else
+ {
+ StringInfoData groupName;
+
+ initStringInfo(&groupName);
+ appendStringInfo(&groupName, "%s Groups", groupLabel);
+ ExplainOpenGroup("Incremental Sort Groups", groupName.data, true, es);
+ ExplainPropertyInteger("Group Count", NULL, groupInfo->groupCount, es);
+
+ ExplainPropertyList("Sort Methods Used", methodNames, es);
+
+ if (groupInfo->maxMemorySpaceUsed > 0)
+ {
+ long avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount;
+ const char *spaceTypeName;
+ StringInfoData memoryName;
+
+ spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_MEMORY);
+ initStringInfo(&memoryName);
+ appendStringInfo(&memoryName, "Sort Space %s", spaceTypeName);
+ ExplainOpenGroup("Sort Space", memoryName.data, true, es);
+
+ ExplainPropertyInteger("Average Sort Space Used", "kB", avgSpace, es);
+ ExplainPropertyInteger("Maximum Sort Space Used", "kB",
+ groupInfo->maxMemorySpaceUsed, es);
+
+ ExplainCloseGroup("Sort Spaces", memoryName.data, true, es);
+ }
+ if (groupInfo->maxDiskSpaceUsed > 0)
+ {
+ long avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount;
+ const char *spaceTypeName;
+ StringInfoData diskName;
+
+ spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_DISK);
+ initStringInfo(&diskName);
+ appendStringInfo(&diskName, "Sort Space %s", spaceTypeName);
+ ExplainOpenGroup("Sort Space", diskName.data, true, es);
+
+ ExplainPropertyInteger("Average Sort Space Used", "kB", avgSpace, es);
+ ExplainPropertyInteger("Maximum Sort Space Used", "kB",
+ groupInfo->maxDiskSpaceUsed, es);
+
+ ExplainCloseGroup("Sort Spaces", diskName.data, true, es);
+ }
+
+ ExplainCloseGroup("Incremental Sort Groups", groupName.data, true, es);
+ }
+}
+
+/*
+ * If it's EXPLAIN ANALYZE, show tuplesort stats for a incremental sort node
+ */
+static void
+show_incremental_sort_info(IncrementalSortState *incrsortstate,
+ ExplainState *es)
+{
+ IncrementalSortGroupInfo *fullsortGroupInfo;
+ IncrementalSortGroupInfo *prefixsortGroupInfo;
+
+ fullsortGroupInfo = &incrsortstate->incsort_info.fullsortGroupInfo;
+
+ if (!(es->analyze && fullsortGroupInfo->groupCount > 0))
+ return;
+
+ show_incremental_sort_group_info(fullsortGroupInfo, "Full-sort", true, es);
+ prefixsortGroupInfo = &incrsortstate->incsort_info.prefixsortGroupInfo;
+ if (prefixsortGroupInfo->groupCount > 0)
+ {
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ appendStringInfo(es->str, " ");
+ show_incremental_sort_group_info(prefixsortGroupInfo, "Presorted", false, es);
+ }
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ appendStringInfo(es->str, "\n");
+
+ if (incrsortstate->shared_info != NULL)
+ {
+ int n;
+ bool indent_first_line;
+
+ for (n = 0; n < incrsortstate->shared_info->num_workers; n++)
+ {
+ IncrementalSortInfo *incsort_info =
+ &incrsortstate->shared_info->sinfo[n];
+
+ /*
+ * If a worker hasn't process any sort groups at all, then exclude
+ * it from output since it either didn't launch or didn't
+ * contribute anything meaningful.
+ */
+ fullsortGroupInfo = &incsort_info->fullsortGroupInfo;
+ prefixsortGroupInfo = &incsort_info->prefixsortGroupInfo;
+ if (fullsortGroupInfo->groupCount == 0 &&
+ prefixsortGroupInfo->groupCount == 0)
+ continue;
+
+ if (es->workers_state)
+ ExplainOpenWorker(n, es);
+
+ indent_first_line = es->workers_state == NULL || es->verbose;
+ show_incremental_sort_group_info(fullsortGroupInfo, "Full-sort",
+ indent_first_line, es);
+ if (prefixsortGroupInfo->groupCount > 0)
+ {
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ appendStringInfo(es->str, " ");
+ show_incremental_sort_group_info(prefixsortGroupInfo, "Presorted", false, es);
+ }
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ appendStringInfo(es->str, "\n");
+
+ if (es->workers_state)
+ ExplainCloseWorker(n, es);
+ }
+ }
+}
+
+/*
* Show information on hash buckets/batches.
*/
static void