author     Tomas Vondra <tomas.vondra@postgresql.org>   2020-04-06 21:33:28 +0200
committer  Tomas Vondra <tomas.vondra@postgresql.org>   2020-04-06 21:35:10 +0200
commit     d2d8a229bc58a2014dce1c7a4fcdb6c5ab9fb8da
tree       66e2560c49ee43d13c29bd9f5760731001312738 /src/backend/utils
parent     3c8553547b1493c4afdb80393f4a47dbfa019a79
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when
the input is already sorted by a prefix of the requested sort keys. For
example, when the relation is already sorted by (key1, key2) and we need
it sorted by (key1, key2, key3), we can simply split the input rows into
groups having equal values in (key1, key2), and sort/compare only the
remaining column, key3.
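To make the idea concrete, here is a minimal standalone sketch in plain C
(not PostgreSQL internals - the Row struct and the hard-coded data are
invented for illustration). Because the rows arrive already sorted by
(key1, key2), each run of rows sharing that prefix only needs a qsort() on
key3:

#include <stdio.h>
#include <stdlib.h>

typedef struct Row
{
	int			key1;
	int			key2;
	int			key3;
} Row;

/* compare on the one remaining (non-prefix) column */
static int
cmp_key3(const void *a, const void *b)
{
	const Row  *ra = a;
	const Row  *rb = b;

	return (ra->key3 > rb->key3) - (ra->key3 < rb->key3);
}

int
main(void)
{
	/* input already sorted by (key1, key2) */
	Row			rows[] = {{1, 1, 9}, {1, 1, 3}, {1, 2, 7}, {2, 1, 5}, {2, 1, 1}};
	size_t		n = sizeof(rows) / sizeof(rows[0]);
	size_t		start = 0;

	for (size_t i = 1; i <= n; i++)
	{
		/* at a prefix-group boundary (or end of input), sort the group */
		if (i == n || rows[i].key1 != rows[start].key1 ||
			rows[i].key2 != rows[start].key2)
		{
			qsort(rows + start, i - start, sizeof(Row), cmp_key3);
			start = i;
		}
	}

	for (size_t i = 0; i < n; i++)
		printf("(%d, %d, %d)\n", rows[i].key1, rows[i].key2, rows[i].key3);
	return 0;
}

Only the tuples of one prefix group are ever held for sorting, which is
exactly where the memory and startup-cost benefits below come from.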
Incremental Sort has a number of benefits:
- Reduced memory consumption, because only a single group (determined by
values in the sorted prefix) needs to be kept in memory. This may also
eliminate the need to spill to disk.
- Lower startup cost, because Incremental Sort produces results after each
prefix group, which is beneficial for plans where startup cost matters
(for example, queries with a LIMIT clause).
We consider both Sort and Incremental Sort, and decide based on costing.
The implemented algorithm operates in two different modes:
- Fetching a minimum number of tuples without checking equality of the
prefix keys, and sorting on all columns when safe.
- Fetching all tuples for a single prefix group and then sorting by
comparing only the remaining (non-prefix) keys.
We always start in the first mode, and employ a heuristic to switch into
the second mode if we believe it's beneficial - the goal is to minimize
the number of unnecessary comparisons while keeping memory consumption
below work_mem.
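The executor logic implementing this lives outside the diff shown below (in
src/backend/executor/nodeIncrementalSort.c). The following runnable toy
models the two modes and the switch heuristic; all names, the QUANTUM
constant, and the switch condition are invented, and to stay correct the toy
rounds each full-sort batch up to a prefix-group boundary, whereas the real
node retains the unfinished tail group so it can emit partial results early:

#include <stdio.h>
#include <stdlib.h>

typedef struct
{
	int			prefix;			/* stands in for the presorted keys */
	int			suffix;			/* the one remaining sort key */
} Row;

static int
cmp_int(int a, int b)
{
	return (a > b) - (a < b);
}

/* mode 1 comparator: all sort keys */
static int
cmp_full(const void *a, const void *b)
{
	const Row  *x = a;
	const Row  *y = b;
	int			c = cmp_int(x->prefix, y->prefix);

	return c ? c : cmp_int(x->suffix, y->suffix);
}

/* mode 2 comparator: the non-prefix key only */
static int
cmp_suffix(const void *a, const void *b)
{
	return cmp_int(((const Row *) a)->suffix, ((const Row *) b)->suffix);
}

#define QUANTUM 4				/* invented "minimum number of tuples" */

int
main(void)
{
	/* input already sorted by prefix */
	Row			in[] = {
		{1, 5}, {1, 2}, {2, 9}, {2, 4}, {3, 1},
		{3, 7}, {3, 3}, {3, 0}, {3, 6}, {4, 8}
	};
	size_t		n = sizeof(in) / sizeof(in[0]);
	size_t		pos = 0;
	int			prefix_mode = 0;

	while (pos < n)
	{
		size_t		end;

		if (!prefix_mode)
		{
			/* mode 1: grab a quantum of tuples (rounded up to a group
			 * boundary for this toy) and sort it on all columns */
			end = (pos + QUANTUM < n) ? pos + QUANTUM : n;
			while (end < n && in[end].prefix == in[end - 1].prefix)
				end++;
			qsort(in + pos, end - pos, sizeof(Row), cmp_full);

			/* heuristic: a whole quantum inside one prefix group means
			 * groups are large, so suffix-only sorting should win */
			if (in[pos].prefix == in[end - 1].prefix)
				prefix_mode = 1;
		}
		else
		{
			/* mode 2: collect exactly one prefix group, sort suffix only */
			end = pos + 1;
			while (end < n && in[end].prefix == in[pos].prefix)
				end++;
			qsort(in + pos, end - pos, sizeof(Row), cmp_suffix);
		}
		pos = end;
	}

	for (size_t i = 0; i < n; i++)
		printf("(%d, %d)\n", in[i].prefix, in[i].suffix);
	return 0;
}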
This is a very old patch series. The idea was originally proposed by
Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the
patch was taken over by James Coleman, who wrote and rewrote most of the
current code.
There have been many reviewers/contributors since 2013 - I've done my best
to pick the most active ones and list them in this commit message.
Author: James Coleman, Alexander Korotkov
Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov
Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com
Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
Diffstat (limited to 'src/backend/utils')
-rw-r--r--  src/backend/utils/misc/guc.c                   |   9
-rw-r--r--  src/backend/utils/misc/postgresql.conf.sample  |   1
-rw-r--r--  src/backend/utils/sort/tuplesort.c             | 306
3 files changed, 248 insertions(+), 68 deletions(-)
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 477af5d552a..03a22d71ac2 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -992,6 +992,15 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 	{
+		{"enable_incrementalsort", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables the planner's use of incremental sort steps."),
+			NULL
+		},
+		&enable_incrementalsort,
+		true,
+		NULL, NULL, NULL
+	},
+	{
 		{"enable_hashagg", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of hashed aggregation plans."),
 			NULL,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 91fa185053d..1ae8b77306c 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -360,6 +360,7 @@
 #enable_parallel_append = on
 #enable_seqscan = on
 #enable_sort = on
+#enable_incrementalsort = on
 #enable_tidscan = on
 #enable_partitionwise_join = off
 #enable_partitionwise_aggregate = off
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index d02e676aa33..cc33a857314 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -125,6 +125,16 @@
 #define PARALLEL_SORT(state)	((state)->shared == NULL ? 0 : \
 								 (state)->worker >= 0 ? 1 : 2)
 
+/*
+ * Initial size of memtuples array.  We're trying to select this size so that
+ * the array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the
+ * overhead of allocation might possibly be lowered.  However, we don't
+ * consider array sizes less than 1024.
+ */
+#define INITIAL_MEMTUPSIZE Max(1024, \
+							   ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1)
+
 /* GUC variables */
 #ifdef TRACE_SORT
 bool		trace_sort = false;
@@ -241,6 +251,14 @@ struct Tuplesortstate
 	int64		allowedMem;		/* total memory allowed, in bytes */
 	int			maxTapes;		/* number of tapes (Knuth's T) */
 	int			tapeRange;		/* maxTapes-1 (Knuth's P) */
+	int64		maxSpace;		/* maximum amount of space occupied among sort
+								 * groups, either in-memory or on-disk */
+	bool		isMaxSpaceDisk; /* true when maxSpace is value for on-disk
+								 * space, false when it's value for in-memory
+								 * space */
+	TupSortStatus maxSpaceStatus;	/* sort status when maxSpace was reached */
+	MemoryContext maincontext;	/* memory context for tuple sort metadata that
+								 * persists across multiple batches */
 	MemoryContext sortcontext;	/* memory context holding most sort data */
 	MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
 	LogicalTapeSet *tapeset;	/* logtape.c object for tapes in a temp file */
@@ -591,6 +609,7 @@ struct Sharedsort
 static Tuplesortstate *tuplesort_begin_common(int workMem,
											  SortCoordinate coordinate,
											  bool randomAccess);
+static void tuplesort_begin_batch(Tuplesortstate *state);
 static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
 static bool consider_abort_common(Tuplesortstate *state);
 static void inittapes(Tuplesortstate *state, bool mergeruns);
@@ -647,6 +666,8 @@ static void worker_freeze_result_tape(Tuplesortstate *state);
 static void worker_nomergeruns(Tuplesortstate *state);
 static void leader_takeover_tapes(Tuplesortstate *state);
 static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
+static void tuplesort_free(Tuplesortstate *state);
+static void tuplesort_updatemax(Tuplesortstate *state);
 
 /*
  * Special versions of qsort just for SortTuple objects.  qsort_tuple() sorts
@@ -682,8 +703,8 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate,
 					   bool randomAccess)
 {
 	Tuplesortstate *state;
+	MemoryContext maincontext;
 	MemoryContext sortcontext;
-	MemoryContext tuplecontext;
 	MemoryContext oldcontext;
 
 	/* See leader_takeover_tapes() remarks on randomAccess support */
@@ -691,31 +712,31 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate,
 		elog(ERROR, "random access disallowed under parallel sort");
 
 	/*
-	 * Create a working memory context for this sort operation. All data
-	 * needed by the sort will live inside this context.
+	 * Memory context surviving tuplesort_reset.  This memory context holds
+	 * data which is useful to keep while sorting multiple similar batches.
 	 */
-	sortcontext = AllocSetContextCreate(CurrentMemoryContext,
+	maincontext = AllocSetContextCreate(CurrentMemoryContext,
 										"TupleSort main",
 										ALLOCSET_DEFAULT_SIZES);
 
 	/*
-	 * Caller tuple (e.g. IndexTuple) memory context.
-	 *
-	 * A dedicated child context used exclusively for caller passed tuples
-	 * eases memory management.  Resetting at key points reduces
-	 * fragmentation. Note that the memtuples array of SortTuples is allocated
-	 * in the parent context, not this context, because there is no need to
-	 * free memtuples early.
+	 * Create a working memory context for one sort operation.  The content
+	 * of this context is deleted by tuplesort_reset.
+	 */
+	sortcontext = AllocSetContextCreate(maincontext,
+										"TupleSort sort",
+										ALLOCSET_DEFAULT_SIZES);
+
+	/*
+	 * Additionally a working memory context for tuples is set up in
+	 * tuplesort_begin_batch.
 	 */
-	tuplecontext = AllocSetContextCreate(sortcontext,
-										 "Caller tuples",
-										 ALLOCSET_DEFAULT_SIZES);
 
 	/*
-	 * Make the Tuplesortstate within the per-sort context. This way, we
+	 * Make the Tuplesortstate within the per-sortstate context.  This way, we
 	 * don't need a separate pfree() operation for it at shutdown.
 	 */
-	oldcontext = MemoryContextSwitchTo(sortcontext);
+	oldcontext = MemoryContextSwitchTo(maincontext);
 
 	state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate));
@@ -724,11 +745,8 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate,
 	pg_rusage_init(&state->ru_start);
 #endif
 
-	state->status = TSS_INITIAL;
 	state->randomAccess = randomAccess;
-	state->bounded = false;
 	state->tuples = true;
-	state->boundUsed = false;
 
 	/*
 	 * workMem is forced to be at least 64KB, the current minimum valid value
@@ -737,38 +755,21 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate,
 	 * with very little memory.
 	 */
 	state->allowedMem = Max(workMem, 64) * (int64) 1024;
-	state->availMem = state->allowedMem;
 	state->sortcontext = sortcontext;
-	state->tuplecontext = tuplecontext;
-	state->tapeset = NULL;
-
-	state->memtupcount = 0;
+	state->maincontext = maincontext;
 
 	/*
 	 * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
 	 * see comments in grow_memtuples().
 	 */
-	state->memtupsize = Max(1024,
-							ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1);
-
-	state->growmemtuples = true;
-	state->slabAllocatorUsed = false;
-	state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
-
-	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
-
-	/* workMem must be large enough for the minimal memtuples array */
-	if (LACKMEM(state))
-		elog(ERROR, "insufficient memory allowed for sort");
-
-	state->currentRun = 0;
+	state->memtupsize = INITIAL_MEMTUPSIZE;
+	state->memtuples = NULL;
 
 	/*
-	 * maxTapes, tapeRange, and Algorithm D variables will be initialized by
-	 * inittapes(), if needed
+	 * After all of the other non-parallel-related state, we set up all of
+	 * the state needed for each batch.
 	 */
-
-	state->result_tape = -1;	/* flag that result tape has not been formed */
+	tuplesort_begin_batch(state);
 
 	/*
 	 * Initialize parallel-related state based on coordination information
@@ -802,6 +803,77 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate,
 	return state;
 }
 
+/*
+ * tuplesort_begin_batch
+ *
+ * Setup, or reset, all state needed for processing a new set of tuples with
+ * this sort state.  Called both from tuplesort_begin_common (the first time
+ * sorting with this sort state) and tuplesort_reset (for subsequent usages).
+ */
+static void
+tuplesort_begin_batch(Tuplesortstate *state)
+{
+	MemoryContext oldcontext;
+
+	oldcontext = MemoryContextSwitchTo(state->maincontext);
+
+	/*
+	 * Caller tuple (e.g. IndexTuple) memory context.
+	 *
+	 * A dedicated child context used exclusively for caller passed tuples
+	 * eases memory management.  Resetting at key points reduces
+	 * fragmentation. Note that the memtuples array of SortTuples is allocated
+	 * in the parent context, not this context, because there is no need to
+	 * free memtuples early.
+	 */
+	state->tuplecontext = AllocSetContextCreate(state->sortcontext,
+												"Caller tuples",
+												ALLOCSET_DEFAULT_SIZES);
+
+	state->status = TSS_INITIAL;
+	state->bounded = false;
+	state->boundUsed = false;
+
+	state->availMem = state->allowedMem;
+
+	state->tapeset = NULL;
+
+	state->memtupcount = 0;
+
+	/*
+	 * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
+	 * see comments in grow_memtuples().
+	 */
+	state->growmemtuples = true;
+	state->slabAllocatorUsed = false;
+	if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE)
+	{
+		pfree(state->memtuples);
+		state->memtuples = NULL;
+		state->memtupsize = INITIAL_MEMTUPSIZE;
+	}
+	if (state->memtuples == NULL)
+	{
+		state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
+		USEMEM(state, GetMemoryChunkSpace(state->memtuples));
+	}
+
+	/* workMem must be large enough for the minimal memtuples array */
+	if (LACKMEM(state))
+		elog(ERROR, "insufficient memory allowed for sort");
+
+	state->currentRun = 0;
+
+	/*
+	 * maxTapes, tapeRange, and Algorithm D variables will be initialized by
+	 * inittapes(), if needed
+	 */
+
+	state->result_tape = -1;	/* flag that result tape has not been formed */
+
+	MemoryContextSwitchTo(oldcontext);
+}
+
 Tuplesortstate *
 tuplesort_begin_heap(TupleDesc tupDesc,
 					 int nkeys, AttrNumber *attNums,
@@ -814,7 +886,7 @@ tuplesort_begin_heap(TupleDesc tupDesc,
 	MemoryContext oldcontext;
 	int			i;
 
-	oldcontext = MemoryContextSwitchTo(state->sortcontext);
+	oldcontext = MemoryContextSwitchTo(state->maincontext);
 
 	AssertArg(nkeys > 0);
 
@@ -890,7 +962,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc,
 
 	Assert(indexRel->rd_rel->relam == BTREE_AM_OID);
 
-	oldcontext = MemoryContextSwitchTo(state->sortcontext);
+	oldcontext = MemoryContextSwitchTo(state->maincontext);
 
 #ifdef TRACE_SORT
 	if (trace_sort)
@@ -985,7 +1057,7 @@ tuplesort_begin_index_btree(Relation heapRel,
 	MemoryContext oldcontext;
 	int			i;
 
-	oldcontext = MemoryContextSwitchTo(state->sortcontext);
+	oldcontext = MemoryContextSwitchTo(state->maincontext);
 
 #ifdef TRACE_SORT
 	if (trace_sort)
@@ -1063,7 +1135,7 @@ tuplesort_begin_index_hash(Relation heapRel,
 													randomAccess);
 	MemoryContext oldcontext;
 
-	oldcontext = MemoryContextSwitchTo(state->sortcontext);
+	oldcontext = MemoryContextSwitchTo(state->maincontext);
 
 #ifdef TRACE_SORT
 	if (trace_sort)
@@ -1106,7 +1178,7 @@ tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
 	int16		typlen;
 	bool		typbyval;
 
-	oldcontext = MemoryContextSwitchTo(state->sortcontext);
+	oldcontext = MemoryContextSwitchTo(state->maincontext);
 
 #ifdef TRACE_SORT
 	if (trace_sort)
@@ -1224,16 +1296,23 @@ tuplesort_set_bound(Tuplesortstate *state, int64 bound)
 }
 
 /*
- * tuplesort_end
+ * tuplesort_used_bound
  *
- *	Release resources and clean up.
+ *	Allow callers to find out if the sort state was able to use a bound.
+ */
+bool
+tuplesort_used_bound(Tuplesortstate *state)
+{
+	return state->boundUsed;
+}
+
+/*
+ * tuplesort_free
+ *
- * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
- * pointing to garbage.  Be careful not to attempt to use or free such
- * pointers afterwards!
+ *	Internal routine for freeing resources of tuplesort.
  */
-void
-tuplesort_end(Tuplesortstate *state)
+static void
+tuplesort_free(Tuplesortstate *state)
 {
 	/* context swap probably not needed, but let's be safe */
 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
@@ -1291,10 +1370,104 @@ tuplesort_end(Tuplesortstate *state)
 	MemoryContextSwitchTo(oldcontext);
 
 	/*
-	 * Free the per-sort memory context, thereby releasing all working memory,
-	 * including the Tuplesortstate struct itself.
+	 * Free the per-sort memory context, thereby releasing all working
+	 * memory.
 	 */
-	MemoryContextDelete(state->sortcontext);
+	MemoryContextReset(state->sortcontext);
+}
+
+/*
+ * tuplesort_end
+ *
+ *	Release resources and clean up.
+ *
+ * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
+ * pointing to garbage.  Be careful not to attempt to use or free such
+ * pointers afterwards!
+ */
+void
+tuplesort_end(Tuplesortstate *state)
+{
+	tuplesort_free(state);
+
+	/*
+	 * Free the main memory context, including the Tuplesortstate struct
+	 * itself.
+	 */
+	MemoryContextDelete(state->maincontext);
+}
+
+/*
+ * tuplesort_updatemax
+ *
+ *	Update maximum resource usage statistics.
+ */
+static void
+tuplesort_updatemax(Tuplesortstate *state)
+{
+	int64		spaceUsed;
+	bool		isSpaceDisk;
+
+	/*
+	 * Note: it might seem we should provide both memory and disk usage for a
+	 * disk-based sort.  However, the current code doesn't track memory space
+	 * accurately once we have begun to return tuples to the caller (since we
+	 * don't account for pfree's the caller is expected to do), so we cannot
+	 * rely on availMem in a disk sort.  This does not seem worth the
+	 * overhead to fix.  Is it worth creating an API for the memory context
+	 * code to tell us how much is actually used in sortcontext?
+	 */
+	if (state->tapeset)
+	{
+		isSpaceDisk = true;
+		spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ;
+	}
+	else
+	{
+		isSpaceDisk = false;
+		spaceUsed = state->allowedMem - state->availMem;
+	}
+
+	/*
+	 * The sort spills data to disk when the data doesn't fit in main memory,
+	 * which is why we consider disk usage the more important number to track
+	 * for resource accounting.  Note that the same set of tuples may occupy
+	 * less space on disk than in memory, due to the more compact on-disk
+	 * representation.
+	 */
+	if ((isSpaceDisk && !state->isMaxSpaceDisk) ||
+		(isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace))
+	{
+		state->maxSpace = spaceUsed;
+		state->isMaxSpaceDisk = isSpaceDisk;
+		state->maxSpaceStatus = state->status;
+	}
+}
+
+/*
+ * tuplesort_reset
+ *
+ *	Reset the tuplesort.  Reset all the data in the tuplesort, but leave the
+ *	meta-information in.  After tuplesort_reset, tuplesort is ready to start
+ *	a new sort.  This avoids recreating tuple sort states (and saves
+ *	resources) when sorting multiple small batches.
+ */
+void
+tuplesort_reset(Tuplesortstate *state)
+{
+	tuplesort_updatemax(state);
+	tuplesort_free(state);
+
+	/*
+	 * After we've freed up per-batch memory, re-setup all of the state
+	 * common to both the first batch and any subsequent batch.
+	 */
+	tuplesort_begin_batch(state);
+
+	state->lastReturnedTuple = NULL;
+	state->slabMemoryBegin = NULL;
+	state->slabMemoryEnd = NULL;
+	state->slabFreeHead = NULL;
}
 
@@ -2591,8 +2764,7 @@ mergeruns(Tuplesortstate *state)
 	 * Reset tuple memory.  We've freed all the tuples that we previously
 	 * allocated.  We will use the slab allocator from now on.
 	 */
-	MemoryContextDelete(state->tuplecontext);
-	state->tuplecontext = NULL;
+	MemoryContextResetOnly(state->tuplecontext);
 
 	/*
 	 * We no longer need a large memtuples array.  (We will allocate a smaller
@@ -2642,7 +2814,8 @@ mergeruns(Tuplesortstate *state)
 	 * from each input tape.
 	 */
 	state->memtupsize = numInputTapes;
-	state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple));
+	state->memtuples = (SortTuple *) MemoryContextAlloc(state->maincontext,
+														numInputTapes * sizeof(SortTuple));
 	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
 
 	/*
@@ -3138,18 +3311,15 @@ tuplesort_get_stats(Tuplesortstate *state,
 	 * to fix.  Is it worth creating an API for the memory context code to
 	 * tell us how much is actually used in sortcontext?
 	 */
-	if (state->tapeset)
-	{
+	tuplesort_updatemax(state);
+
+	if (state->isMaxSpaceDisk)
 		stats->spaceType = SORT_SPACE_TYPE_DISK;
-		stats->spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024);
-	}
 	else
-	{
 		stats->spaceType = SORT_SPACE_TYPE_MEMORY;
-		stats->spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
-	}
+	stats->spaceUsed = (state->maxSpace + 1023) / 1024;
 
-	switch (state->status)
+	switch (state->maxSpaceStatus)
 	{
 		case TSS_SORTEDINMEM:
 			if (state->boundUsed)
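The new entry points above - tuplesort_used_bound(), tuplesort_reset(), and
the maincontext/per-batch split - exist so that a single Tuplesortstate can
be recycled across the many small per-group sorts an incremental sort
performs. Below is a schematic caller pattern, loosely modeled on the
incremental sort executor node; the surrounding executor plumbing is elided,
and load_group()/emit_tuple() are hypothetical stand-ins:

#include "postgres.h"

#include "miscadmin.h"
#include "utils/tuplesort.h"

extern int	load_group(Tuplesortstate *ts);		/* hypothetical: feeds one
												 * prefix group via
												 * tuplesort_puttupleslot() */
extern void emit_tuple(TupleTableSlot *slot);	/* hypothetical consumer */

static void
sort_groups_with_one_state(TupleDesc desc, int nkeys, AttrNumber *attNums,
						   Oid *sortOps, Oid *collations, bool *nullsFirst,
						   TupleTableSlot *slot)
{
	/* create the sort state once; its metadata lives in maincontext */
	Tuplesortstate *ts = tuplesort_begin_heap(desc, nkeys, attNums,
											  sortOps, collations,
											  nullsFirst, work_mem,
											  NULL, false);

	while (load_group(ts) > 0)
	{
		tuplesort_performsort(ts);
		while (tuplesort_gettupleslot(ts, true, false, slot, NULL))
			emit_tuple(slot);

		/*
		 * Recycle the state for the next group: tuplesort_reset() records
		 * peak space usage (tuplesort_updatemax) and frees per-batch memory
		 * (tuplesort_free), while the sort-key setup done by
		 * tuplesort_begin_heap() survives in maincontext.
		 */
		tuplesort_reset(ts);
	}

	tuplesort_end(ts);
}

Because tuplesort_get_stats() now reports maxSpace/maxSpaceStatus rather
than the final batch's numbers, EXPLAIN ANALYZE shows the peak across all
groups instead of whatever the last small group happened to use.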