diff options
author | Jeff Davis <jdavis@postgresql.org> | 2025-03-24 22:05:33 -0700 |
---|---|---|
committer | Jeff Davis <jdavis@postgresql.org> | 2025-03-24 22:05:33 -0700 |
commit | cc721c459d3783bbdb4beb1bbfa802a5328d15a2 (patch) | |
tree | bdc6fd24da625fd7d96a40a59b08f241ae6208fe /src/backend/executor/nodeAgg.c | |
parent | cc4331605ad81ad3040704470ff56904624eef75 (diff) | |
download | postgresql-cc721c459d3783bbdb4beb1bbfa802a5328d15a2.tar.gz postgresql-cc721c459d3783bbdb4beb1bbfa802a5328d15a2.zip |
HashAgg: use Bump allocator for hash TupleHashTable entries.
The entries aren't freed until the entire hash table is destroyed, so
use the Bump allocator to improve allocation speed, avoid wasting
space on the chunk header, and avoid wasting space due to the
power-of-two allocations.
Discussion: https://postgr.es/m/CAApHDvqv1aNB4cM36FzRwivXrEvBO_LsG_eQ3nqDXTjECaatOQ@mail.gmail.com
Reviewed-by: David Rowley
Diffstat (limited to 'src/backend/executor/nodeAgg.c')
-rw-r--r-- | src/backend/executor/nodeAgg.c | 111 |
1 file changed, 93 insertions, 18 deletions
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index b4a7698a0b3..beccbfdc6fe 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -406,6 +406,7 @@ static void build_hash_tables(AggState *aggstate); static void build_hash_table(AggState *aggstate, int setno, long nbuckets); static void hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck); +static void hash_create_memory(AggState *aggstate); static long hash_choose_num_buckets(double hashentrysize, long ngroups, Size memory); static int hash_choose_num_partitions(double input_groups, @@ -1509,7 +1510,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets) { AggStatePerHash perhash = &aggstate->perhash[setno]; MemoryContext metacxt = aggstate->hash_metacxt; - MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory; + MemoryContext tablecxt = aggstate->hash_tablecxt; MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory; Size additionalsize; @@ -1535,7 +1536,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets) nbuckets, additionalsize, metacxt, - hashcxt, + tablecxt, tmpcxt, DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit)); } @@ -1706,15 +1707,19 @@ hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace) tupleWidth); Size pergroupSize = numTrans * sizeof(AggStatePerGroupData); - tupleChunkSize = CHUNKHDRSZ + tupleSize; - - if (pergroupSize > 0) - pergroupChunkSize = CHUNKHDRSZ + pergroupSize; - else - pergroupChunkSize = 0; + /* + * Entries use the Bump allocator, so the chunk sizes are the same as the + * requested sizes. + */ + tupleChunkSize = MAXALIGN(tupleSize); + pergroupChunkSize = pergroupSize; + /* + * Transition values use AllocSet, which has a chunk header and also uses + * power-of-two allocations. 
+ */ if (transitionSpace > 0) - transitionChunkSize = CHUNKHDRSZ + transitionSpace; + transitionChunkSize = CHUNKHDRSZ + pg_nextpower2_size_t(transitionSpace); else transitionChunkSize = 0; @@ -1864,8 +1869,11 @@ hash_agg_check_limits(AggState *aggstate) uint64 ngroups = aggstate->hash_ngroups_current; Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true); - Size hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, - true); + Size entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, + true); + Size tval_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, + true); + Size total_mem = meta_mem + entry_mem + tval_mem; bool do_spill = false; #ifdef USE_INJECTION_POINTS @@ -1884,7 +1892,7 @@ hash_agg_check_limits(AggState *aggstate) * can be sure to make progress even in edge cases. */ if (aggstate->hash_ngroups_current > 0 && - (meta_mem + hashkey_mem > aggstate->hash_mem_limit || + (total_mem > aggstate->hash_mem_limit || ngroups > aggstate->hash_ngroups_limit)) { do_spill = true; @@ -1939,6 +1947,7 @@ static void hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) { Size meta_mem; + Size entry_mem; Size hashkey_mem; Size buffer_mem; Size total_mem; @@ -1950,7 +1959,10 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) /* memory for the hash table itself */ meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true); - /* memory for the group keys and transition states */ + /* memory for hash entries */ + entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, true); + + /* memory for byref transition states */ hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true); /* memory for read/write tape buffers, if spilled */ @@ -1959,7 +1971,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) buffer_mem += HASHAGG_READ_BUFFER_SIZE; /* update peak mem */ - 
total_mem = meta_mem + hashkey_mem + buffer_mem; + total_mem = meta_mem + entry_mem + hashkey_mem + buffer_mem; if (total_mem > aggstate->hash_mem_peak) aggstate->hash_mem_peak = total_mem; @@ -1982,6 +1994,64 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) } /* + * Create memory contexts used for hash aggregation. + */ +static void +hash_create_memory(AggState *aggstate) +{ + Size maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE; + + /* + * The hashcontext's per-tuple memory will be used for byref transition + * values and returned by AggCheckCallContext(). + */ + aggstate->hashcontext = CreateWorkExprContext(aggstate->ss.ps.state); + + /* + * The meta context will be used for the bucket array of + * TupleHashEntryData (or arrays, in the case of grouping sets). As the + * hash table grows, the bucket array will double in size and the old one + * will be freed, so an AllocSet is appropriate. For large bucket arrays, + * the large allocation path will be used, so it's not worth worrying + * about wasting space due to power-of-two allocations. + */ + aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt, + "HashAgg meta context", + ALLOCSET_DEFAULT_SIZES); + + /* + * The hash entries themselves, which include the grouping key + * (firstTuple) and pergroup data, are stored in the table context. The + * bump allocator can be used because the entries are not freed until the + * entire hash table is reset. The bump allocator is faster for + * allocations and avoids wasting space on the chunk header or + * power-of-two allocations. + * + * Like CreateWorkExprContext(), use smaller sizings for smaller work_mem, + * to avoid large jumps in memory usage. + */ + + /* + * Like CreateWorkExprContext(), use smaller sizings for smaller work_mem, + * to avoid large jumps in memory usage. 
+ */ + maxBlockSize = pg_prevpower2_size_t(work_mem * (Size) 1024 / 16); + + /* But no bigger than ALLOCSET_DEFAULT_MAXSIZE */ + maxBlockSize = Min(maxBlockSize, ALLOCSET_DEFAULT_MAXSIZE); + + /* and no smaller than ALLOCSET_DEFAULT_INITSIZE */ + maxBlockSize = Max(maxBlockSize, ALLOCSET_DEFAULT_INITSIZE); + + aggstate->hash_tablecxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt, + "HashAgg table context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + maxBlockSize); + +} + +/* * Choose a reasonable number of buckets for the initial hash table size. */ static long @@ -2642,6 +2712,7 @@ agg_refill_hash_table(AggState *aggstate) /* free memory and reset hash tables */ ReScanExprContext(aggstate->hashcontext); + MemoryContextReset(aggstate->hash_tablecxt); for (int setno = 0; setno < aggstate->num_hashes; setno++) ResetTupleHashTable(aggstate->perhash[setno].hashtable); @@ -3326,7 +3397,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) } if (use_hashing) - aggstate->hashcontext = CreateWorkExprContext(estate); + hash_create_memory(aggstate); ExecAssignExprContext(estate, &aggstate->ss.ps); @@ -3621,9 +3692,6 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) Plan *outerplan = outerPlan(node); uint64 totalGroups = 0; - aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt, - "HashAgg meta context", - ALLOCSET_DEFAULT_SIZES); aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc, &TTSOpsMinimalTuple); aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc, @@ -4368,6 +4436,12 @@ ExecEndAgg(AggState *node) MemoryContextDelete(node->hash_metacxt); node->hash_metacxt = NULL; } + if (node->hash_tablecxt != NULL) + { + MemoryContextDelete(node->hash_tablecxt); + node->hash_tablecxt = NULL; + } + for (transno = 0; transno < node->numtrans; transno++) { @@ -4484,6 +4558,7 @@ ExecReScanAgg(AggState *node) node->hash_ngroups_current = 0; ReScanExprContext(node->hashcontext); + 
MemoryContextReset(node->hash_tablecxt); /* Rebuild an empty hash table */ build_hash_tables(node); node->table_filled = false; |