diff options
Diffstat (limited to 'src/backend/utils')
-rw-r--r-- | src/backend/utils/misc/guc.c | 9 |
-rw-r--r-- | src/backend/utils/misc/postgresql.conf.sample | 1 |
-rw-r--r-- | src/backend/utils/sort/tuplesort.c | 306 |
3 files changed, 248 insertions, 68 deletions
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 477af5d552a..03a22d71ac2 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -992,6 +992,15 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, { + {"enable_incrementalsort", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of incremental sort steps."), + NULL + }, + &enable_incrementalsort, + true, + NULL, NULL, NULL + }, + { {"enable_hashagg", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of hashed aggregation plans."), NULL, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 91fa185053d..1ae8b77306c 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -360,6 +360,7 @@ #enable_parallel_append = on #enable_seqscan = on #enable_sort = on +#enable_incrementalsort = on #enable_tidscan = on #enable_partitionwise_join = off #enable_partitionwise_aggregate = off diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index d02e676aa33..cc33a857314 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -125,6 +125,16 @@ #define PARALLEL_SORT(state) ((state)->shared == NULL ? 0 : \ (state)->worker >= 0 ? 1 : 2) +/* + * Initial size of memtuples array. We're trying to select this size so that + * array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the overhead of + * allocation might possibly be lowered. However, we don't consider array sizes + * less than 1024. 
+ * + */ +#define INITIAL_MEMTUPSIZE Max(1024, \ + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1) + /* GUC variables */ #ifdef TRACE_SORT bool trace_sort = false; @@ -241,6 +251,14 @@ struct Tuplesortstate int64 allowedMem; /* total memory allowed, in bytes */ int maxTapes; /* number of tapes (Knuth's T) */ int tapeRange; /* maxTapes-1 (Knuth's P) */ + int64 maxSpace; /* maximum amount of space occupied among sort + * of groups, either in-memory or on-disk */ + bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk + * space, false when it's value for in-memory + * space */ + TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */ + MemoryContext maincontext; /* memory context for tuple sort metadata that + * persists across multiple batches */ MemoryContext sortcontext; /* memory context holding most sort data */ MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ @@ -591,6 +609,7 @@ struct Sharedsort static Tuplesortstate *tuplesort_begin_common(int workMem, SortCoordinate coordinate, bool randomAccess); +static void tuplesort_begin_batch(Tuplesortstate *state); static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); static bool consider_abort_common(Tuplesortstate *state); static void inittapes(Tuplesortstate *state, bool mergeruns); @@ -647,6 +666,8 @@ static void worker_freeze_result_tape(Tuplesortstate *state); static void worker_nomergeruns(Tuplesortstate *state); static void leader_takeover_tapes(Tuplesortstate *state); static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); +static void tuplesort_free(Tuplesortstate *state); +static void tuplesort_updatemax(Tuplesortstate *state); /* * Special versions of qsort just for SortTuple objects. 
qsort_tuple() sorts @@ -682,8 +703,8 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, bool randomAccess) { Tuplesortstate *state; + MemoryContext maincontext; MemoryContext sortcontext; - MemoryContext tuplecontext; MemoryContext oldcontext; /* See leader_takeover_tapes() remarks on randomAccess support */ @@ -691,31 +712,31 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, elog(ERROR, "random access disallowed under parallel sort"); /* - * Create a working memory context for this sort operation. All data - * needed by the sort will live inside this context. + * Memory context surviving tuplesort_reset. This memory context holds + * data which is useful to keep while sorting multiple similar batches. */ - sortcontext = AllocSetContextCreate(CurrentMemoryContext, + maincontext = AllocSetContextCreate(CurrentMemoryContext, "TupleSort main", ALLOCSET_DEFAULT_SIZES); /* - * Caller tuple (e.g. IndexTuple) memory context. - * - * A dedicated child context used exclusively for caller passed tuples - * eases memory management. Resetting at key points reduces - * fragmentation. Note that the memtuples array of SortTuples is allocated - * in the parent context, not this context, because there is no need to - * free memtuples early. + * Create a working memory context for one sort operation. The content of + * this context is deleted by tuplesort_reset. + */ + sortcontext = AllocSetContextCreate(maincontext, + "TupleSort sort", + ALLOCSET_DEFAULT_SIZES); + + /* + * Additionally a working memory context for tuples is setup in + * tuplesort_begin_batch. */ - tuplecontext = AllocSetContextCreate(sortcontext, - "Caller tuples", - ALLOCSET_DEFAULT_SIZES); /* - * Make the Tuplesortstate within the per-sort context. This way, we + * Make the Tuplesortstate within the per-sortstate context. This way, we * don't need a separate pfree() operation for it at shutdown. 
*/ - oldcontext = MemoryContextSwitchTo(sortcontext); + oldcontext = MemoryContextSwitchTo(maincontext); state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate)); @@ -724,11 +745,8 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, pg_rusage_init(&state->ru_start); #endif - state->status = TSS_INITIAL; state->randomAccess = randomAccess; - state->bounded = false; state->tuples = true; - state->boundUsed = false; /* * workMem is forced to be at least 64KB, the current minimum valid value @@ -737,38 +755,21 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, * with very little memory. */ state->allowedMem = Max(workMem, 64) * (int64) 1024; - state->availMem = state->allowedMem; state->sortcontext = sortcontext; - state->tuplecontext = tuplecontext; - state->tapeset = NULL; - - state->memtupcount = 0; + state->maincontext = maincontext; /* * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; * see comments in grow_memtuples(). */ - state->memtupsize = Max(1024, - ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1); - - state->growmemtuples = true; - state->slabAllocatorUsed = false; - state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); - - USEMEM(state, GetMemoryChunkSpace(state->memtuples)); - - /* workMem must be large enough for the minimal memtuples array */ - if (LACKMEM(state)) - elog(ERROR, "insufficient memory allowed for sort"); - - state->currentRun = 0; + state->memtupsize = INITIAL_MEMTUPSIZE; + state->memtuples = NULL; /* - * maxTapes, tapeRange, and Algorithm D variables will be initialized by - * inittapes(), if needed + * After all of the other non-parallel-related state, we setup all of the + * state needed for each batch. 
*/ - - state->result_tape = -1; /* flag that result tape has not been formed */ + tuplesort_begin_batch(state); /* * Initialize parallel-related state based on coordination information @@ -802,6 +803,77 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, return state; } +/* + * tuplesort_begin_batch + * + * Setup, or reset, all state needed for processing a new set of tuples with this + * sort state. Called both from tuplesort_begin_common (the first time sorting + * with this sort state) and tuplesort_reset (for subsequent usages). + */ +static void +tuplesort_begin_batch(Tuplesortstate *state) +{ + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(state->maincontext); + + /* + * Caller tuple (e.g. IndexTuple) memory context. + * + * A dedicated child context used exclusively for caller passed tuples + * eases memory management. Resetting at key points reduces + * fragmentation. Note that the memtuples array of SortTuples is allocated + * in the parent context, not this context, because there is no need to + * free memtuples early. + */ + state->tuplecontext = AllocSetContextCreate(state->sortcontext, + "Caller tuples", + ALLOCSET_DEFAULT_SIZES); + + state->status = TSS_INITIAL; + state->bounded = false; + state->boundUsed = false; + + state->availMem = state->allowedMem; + + state->tapeset = NULL; + + state->memtupcount = 0; + + /* + * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD; + * see comments in grow_memtuples(). 
+ */ + state->growmemtuples = true; + state->slabAllocatorUsed = false; + if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE) + { + pfree(state->memtuples); + state->memtuples = NULL; + state->memtupsize = INITIAL_MEMTUPSIZE; + } + if (state->memtuples == NULL) + { + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + } + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = 0; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + MemoryContextSwitchTo(oldcontext); +} + Tuplesortstate * tuplesort_begin_heap(TupleDesc tupDesc, int nkeys, AttrNumber *attNums, @@ -814,7 +886,7 @@ tuplesort_begin_heap(TupleDesc tupDesc, MemoryContext oldcontext; int i; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); AssertArg(nkeys > 0); @@ -890,7 +962,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc, Assert(indexRel->rd_rel->relam == BTREE_AM_OID); - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (trace_sort) @@ -985,7 +1057,7 @@ tuplesort_begin_index_btree(Relation heapRel, MemoryContext oldcontext; int i; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (trace_sort) @@ -1063,7 +1135,7 @@ tuplesort_begin_index_hash(Relation heapRel, randomAccess); MemoryContext oldcontext; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (trace_sort) @@ -1106,7 +1178,7 @@ tuplesort_begin_datum(Oid datumType, Oid 
sortOperator, Oid sortCollation, int16 typlen; bool typbyval; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (trace_sort) @@ -1224,16 +1296,23 @@ tuplesort_set_bound(Tuplesortstate *state, int64 bound) } /* - * tuplesort_end + * tuplesort_used_bound * - * Release resources and clean up. + * Allow callers to find out if the sort state was able to use a bound. + */ +bool +tuplesort_used_bound(Tuplesortstate *state) +{ + return state->boundUsed; +} + +/* + * tuplesort_free * - * NOTE: after calling this, any pointers returned by tuplesort_getXXX are - * pointing to garbage. Be careful not to attempt to use or free such - * pointers afterwards! + * Internal routine for freeing resources of tuplesort. */ -void -tuplesort_end(Tuplesortstate *state) +static void +tuplesort_free(Tuplesortstate *state) { /* context swap probably not needed, but let's be safe */ MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); @@ -1291,10 +1370,104 @@ tuplesort_end(Tuplesortstate *state) MemoryContextSwitchTo(oldcontext); /* - * Free the per-sort memory context, thereby releasing all working memory, - * including the Tuplesortstate struct itself. + * Free the per-sort memory context, thereby releasing all working memory. */ - MemoryContextDelete(state->sortcontext); + MemoryContextReset(state->sortcontext); +} + +/* + * tuplesort_end + * + * Release resources and clean up. + * + * NOTE: after calling this, any pointers returned by tuplesort_getXXX are + * pointing to garbage. Be careful not to attempt to use or free such + * pointers afterwards! + */ +void +tuplesort_end(Tuplesortstate *state) +{ + tuplesort_free(state); + + /* + * Free the main memory context, including the Tuplesortstate struct + * itself. + */ + MemoryContextDelete(state->maincontext); +} + +/* + * tuplesort_updatemax + * + * Update maximum resource usage statistics. 
+ */ +static void +tuplesort_updatemax(Tuplesortstate *state) +{ + int64 spaceUsed; + bool isSpaceDisk; + + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? + */ + if (state->tapeset) + { + isSpaceDisk = true; + spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ; + } + else + { + isSpaceDisk = false; + spaceUsed = state->allowedMem - state->availMem; + } + + /* + * Sort evicts data to the disk when it didn't manage to fit those data to + * the main memory. This is why we assume space used on the disk to be + * more important for tracking resource usage than space used in memory. + * Note that amount of space occupied by some tuple set on the disk might + * be less than amount of space occupied by the same tuple set in the + * memory due to more compact representation. + */ + if ((isSpaceDisk && !state->isMaxSpaceDisk) || + (isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace)) + { + state->maxSpace = spaceUsed; + state->isMaxSpaceDisk = isSpaceDisk; + state->maxSpaceStatus = state->status; + } +} + +/* + * tuplesort_reset + * + * Reset the tuplesort. Reset all the data in the tuplesort, but leave the + * meta-information in. After tuplesort_reset, tuplesort is ready to start + * a new sort. This allows avoiding recreation of tuple sort states (and + * save resources) when sorting multiple small batches. 
+ */ +void +tuplesort_reset(Tuplesortstate *state) +{ + tuplesort_updatemax(state); + tuplesort_free(state); + + /* + * After we've freed up per-batch memory, re-setup all of the state common + * to both the first batch and any subsequent batch. + */ + tuplesort_begin_batch(state); + + state->lastReturnedTuple = NULL; + state->slabMemoryBegin = NULL; + state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; } /* @@ -2591,8 +2764,7 @@ mergeruns(Tuplesortstate *state) * Reset tuple memory. We've freed all the tuples that we previously * allocated. We will use the slab allocator from now on. */ - MemoryContextDelete(state->tuplecontext); - state->tuplecontext = NULL; + MemoryContextResetOnly(state->tuplecontext); /* * We no longer need a large memtuples array. (We will allocate a smaller @@ -2642,7 +2814,8 @@ mergeruns(Tuplesortstate *state) * from each input tape. */ state->memtupsize = numInputTapes; - state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple)); + state->memtuples = (SortTuple *) MemoryContextAlloc(state->maincontext, + numInputTapes * sizeof(SortTuple)); USEMEM(state, GetMemoryChunkSpace(state->memtuples)); /* @@ -3138,18 +3311,15 @@ tuplesort_get_stats(Tuplesortstate *state, * to fix. Is it worth creating an API for the memory context code to * tell us how much is actually used in sortcontext? */ - if (state->tapeset) - { + tuplesort_updatemax(state); + + if (state->isMaxSpaceDisk) stats->spaceType = SORT_SPACE_TYPE_DISK; - stats->spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); - } else - { stats->spaceType = SORT_SPACE_TYPE_MEMORY; - stats->spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; - } + stats->spaceUsed = (state->maxSpace + 1023) / 1024; - switch (state->status) + switch (state->maxSpaceStatus) { case TSS_SORTEDINMEM: if (state->boundUsed) |