Diffstat (limited to 'src/backend/executor/nodeAgg.c')
-rw-r--r--  src/backend/executor/nodeAgg.c  111
1 file changed, 93 insertions(+), 18 deletions(-)
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index b4a7698a0b3..beccbfdc6fe 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -406,6 +406,7 @@ static void build_hash_tables(AggState *aggstate);
static void build_hash_table(AggState *aggstate, int setno, long nbuckets);
static void hashagg_recompile_expressions(AggState *aggstate, bool minslot,
bool nullcheck);
+static void hash_create_memory(AggState *aggstate);
static long hash_choose_num_buckets(double hashentrysize,
long ngroups, Size memory);
static int hash_choose_num_partitions(double input_groups,
@@ -1509,7 +1510,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
{
AggStatePerHash perhash = &aggstate->perhash[setno];
MemoryContext metacxt = aggstate->hash_metacxt;
- MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory;
+ MemoryContext tablecxt = aggstate->hash_tablecxt;
MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory;
Size additionalsize;
@@ -1535,7 +1536,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
nbuckets,
additionalsize,
metacxt,
- hashcxt,
+ tablecxt,
tmpcxt,
DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
}
@@ -1706,15 +1707,19 @@ hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
tupleWidth);
Size pergroupSize = numTrans * sizeof(AggStatePerGroupData);
- tupleChunkSize = CHUNKHDRSZ + tupleSize;
-
- if (pergroupSize > 0)
- pergroupChunkSize = CHUNKHDRSZ + pergroupSize;
- else
- pergroupChunkSize = 0;
+ /*
+ * Entries use the Bump allocator, so the chunk sizes are the same as the
+ * requested sizes.
+ */
+ tupleChunkSize = MAXALIGN(tupleSize);
+ pergroupChunkSize = pergroupSize;
+ /*
+ * Transition values use AllocSet, which has a chunk header and also uses
+ * power-of-two allocations.
+ */
if (transitionSpace > 0)
- transitionChunkSize = CHUNKHDRSZ + transitionSpace;
+ transitionChunkSize = CHUNKHDRSZ + pg_nextpower2_size_t(transitionSpace);
else
transitionChunkSize = 0;
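
[Note: to make the new chunk-size accounting concrete, here is a minimal standalone sketch (not part of the patch) of the arithmetic above. The CHUNKHDRSZ value, 8-byte alignment, and input sizes are illustrative assumptions; nextpower2() stands in for pg_nextpower2_size_t():]

#include <stdio.h>
#include <stddef.h>

/* Illustrative stand-ins; the real definitions live in the PostgreSQL tree. */
#define CHUNKHDRSZ 16                                   /* assumed chunk header size */
#define MAXALIGN(x) (((size_t) (x) + 7) & ~(size_t) 7)  /* assumes 8-byte alignment */

/* Round up to the next power of two, like pg_nextpower2_size_t(). */
static size_t
nextpower2(size_t v)
{
    size_t p = 1;

    while (p < v)
        p <<= 1;
    return p;
}

int
main(void)
{
    size_t tupleSize = 52;          /* hypothetical grouping-key tuple */
    size_t pergroupSize = 48;       /* hypothetical per-group states */
    size_t transitionSpace = 100;   /* hypothetical byref transition value */

    /* Bump allocator: no chunk header, only alignment padding. */
    size_t tupleChunkSize = MAXALIGN(tupleSize);        /* 56 */
    size_t pergroupChunkSize = pergroupSize;            /* 48 */

    /* AllocSet: chunk header plus power-of-two rounding. */
    size_t transitionChunkSize = CHUNKHDRSZ + nextpower2(transitionSpace); /* 16 + 128 */

    printf("%zu %zu %zu\n", tupleChunkSize, pergroupChunkSize, transitionChunkSize);
    return 0;
}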
@@ -1864,8 +1869,11 @@ hash_agg_check_limits(AggState *aggstate)
uint64 ngroups = aggstate->hash_ngroups_current;
Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
true);
- Size hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
- true);
+ Size entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt,
+ true);
+ Size tval_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
+ true);
+ Size total_mem = meta_mem + entry_mem + tval_mem;
bool do_spill = false;
#ifdef USE_INJECTION_POINTS
@@ -1884,7 +1892,7 @@ hash_agg_check_limits(AggState *aggstate)
* can be sure to make progress even in edge cases.
*/
if (aggstate->hash_ngroups_current > 0 &&
- (meta_mem + hashkey_mem > aggstate->hash_mem_limit ||
+ (total_mem > aggstate->hash_mem_limit ||
ngroups > aggstate->hash_ngroups_limit))
{
do_spill = true;
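
[Note: the spill test now sums all three contexts before comparing against hash_mem_limit. A minimal sketch of that decision (not part of the patch), with hypothetical numbers standing in for the MemoryContextMemAllocated() results:]

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Sketch of the decision in hash_agg_check_limits(): spill once the combined
 * memory of the metadata, entry, and transition-value contexts exceeds the
 * limit, or once the group count exceeds its limit.  All inputs here are
 * hypothetical.
 */
static bool
should_spill(size_t meta_mem, size_t entry_mem, size_t tval_mem,
             uint64_t ngroups, size_t mem_limit, uint64_t ngroups_limit)
{
    size_t total_mem = meta_mem + entry_mem + tval_mem;

    /* never spill with zero groups, so progress is always possible */
    if (ngroups == 0)
        return false;

    return total_mem > mem_limit || ngroups > ngroups_limit;
}

[For example, with a 4MB limit, 512kB of metadata plus 3MB of entries plus 1MB of transition values spills even though no single context exceeds the limit, which is the point of folding entry_mem into the total.]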
@@ -1939,6 +1947,7 @@ static void
hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
{
Size meta_mem;
+ Size entry_mem;
Size hashkey_mem;
Size buffer_mem;
Size total_mem;
@@ -1950,7 +1959,10 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
/* memory for the hash table itself */
meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true);
- /* memory for the group keys and transition states */
+ /* memory for hash entries */
+ entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, true);
+
+ /* memory for byref transition states */
hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true);
/* memory for read/write tape buffers, if spilled */
@@ -1959,7 +1971,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
buffer_mem += HASHAGG_READ_BUFFER_SIZE;
/* update peak mem */
- total_mem = meta_mem + hashkey_mem + buffer_mem;
+ total_mem = meta_mem + entry_mem + hashkey_mem + buffer_mem;
if (total_mem > aggstate->hash_mem_peak)
aggstate->hash_mem_peak = total_mem;
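
[Note: similarly, the peak update now has four components. A sketch (not part of the patch), with illustrative 8kB tape buffer sizes standing in for the HASHAGG_WRITE_BUFFER_SIZE / HASHAGG_READ_BUFFER_SIZE constants defined in nodeAgg.c:]

#include <stdbool.h>
#include <stddef.h>

#define WRITE_BUFFER_SIZE ((size_t) 8 * 1024)  /* illustrative value */
#define READ_BUFFER_SIZE  ((size_t) 8 * 1024)  /* illustrative value */

/*
 * Sketch of the peak tracking in hash_agg_update_metrics(): tape write
 * buffers are charged per spill partition, plus one read buffer when
 * refilling from a spilled partition.
 */
static size_t
update_peak(size_t peak, size_t meta_mem, size_t entry_mem,
            size_t hashkey_mem, int npartitions, bool from_tape)
{
    size_t buffer_mem = (size_t) npartitions * WRITE_BUFFER_SIZE;
    size_t total_mem;

    if (from_tape)
        buffer_mem += READ_BUFFER_SIZE;

    total_mem = meta_mem + entry_mem + hashkey_mem + buffer_mem;
    return total_mem > peak ? total_mem : peak;
}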
@@ -1982,6 +1994,64 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
}
/*
+ * Create memory contexts used for hash aggregation.
+ */
+static void
+hash_create_memory(AggState *aggstate)
+{
+ Size maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
+
+ /*
+ * The hashcontext's per-tuple memory will be used for byref transition
+ * values and returned by AggCheckCallContext().
+ */
+ aggstate->hashcontext = CreateWorkExprContext(aggstate->ss.ps.state);
+
+ /*
+ * The meta context will be used for the bucket array of
+ * TupleHashEntryData (or arrays, in the case of grouping sets). As the
+ * hash table grows, the bucket array will double in size and the old one
+ * will be freed, so an AllocSet is appropriate. For large bucket arrays,
+ * the large allocation path will be used, so it's not worth worrying
+ * about wasting space due to power-of-two allocations.
+ */
+ aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
+ "HashAgg meta context",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * The hash entries themselves, which include the grouping key
+ * (firstTuple) and pergroup data, are stored in the table context. The
+ * bump allocator can be used because the entries are not freed until the
+ * entire hash table is reset. The bump allocator is faster for
+ * allocations and avoids wasting space on the chunk header or
+ * power-of-two allocations.
+ *
+ * Like CreateWorkExprContext(), use smaller sizings for smaller work_mem,
+ * to avoid large jumps in memory usage.
+ */
+
+ maxBlockSize = pg_prevpower2_size_t(work_mem * (Size) 1024 / 16);
+
+ /* But no bigger than ALLOCSET_DEFAULT_MAXSIZE */
+ maxBlockSize = Min(maxBlockSize, ALLOCSET_DEFAULT_MAXSIZE);
+
+ /* and no smaller than ALLOCSET_DEFAULT_INITSIZE */
+ maxBlockSize = Max(maxBlockSize, ALLOCSET_DEFAULT_INITSIZE);
+
+ aggstate->hash_tablecxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt,
+ "HashAgg table context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ maxBlockSize);
+}
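
[Note: to see what block sizes this produces, here is a standalone sketch (not part of the patch) of the clamp above. The ALLOCSET_DEFAULT_* values are assumed to match the usual memutils.h definitions, and prevpower2() stands in for pg_prevpower2_size_t():]

#include <stdio.h>
#include <stddef.h>

/* Assumed to match the usual memutils.h values; check your tree. */
#define ALLOCSET_DEFAULT_INITSIZE ((size_t) 8 * 1024)
#define ALLOCSET_DEFAULT_MAXSIZE  ((size_t) 8 * 1024 * 1024)

/* Round down to a power of two, like pg_prevpower2_size_t(). */
static size_t
prevpower2(size_t v)
{
    size_t p = 1;

    while (p * 2 <= v)
        p <<= 1;
    return p;
}

/*
 * Sketch of the maxBlockSize computation in hash_create_memory(): one
 * sixteenth of work_mem (in kB), rounded down to a power of two, clamped
 * to [ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE].
 */
static size_t
choose_max_block_size(int work_mem_kb)
{
    size_t maxBlockSize = prevpower2((size_t) work_mem_kb * 1024 / 16);

    if (maxBlockSize > ALLOCSET_DEFAULT_MAXSIZE)
        maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
    if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE)
        maxBlockSize = ALLOCSET_DEFAULT_INITSIZE;
    return maxBlockSize;
}

int
main(void)
{
    /* work_mem = 4MB -> 256kB blocks; 64kB -> 8kB floor; 1GB -> 8MB cap */
    printf("%zu %zu %zu\n",
           choose_max_block_size(4096),
           choose_max_block_size(64),
           choose_max_block_size(1024 * 1024));
    return 0;
}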
+
+/*
* Choose a reasonable number of buckets for the initial hash table size.
*/
static long
@@ -2642,6 +2712,7 @@ agg_refill_hash_table(AggState *aggstate)
/* free memory and reset hash tables */
ReScanExprContext(aggstate->hashcontext);
+ MemoryContextReset(aggstate->hash_tablecxt);
for (int setno = 0; setno < aggstate->num_hashes; setno++)
ResetTupleHashTable(aggstate->perhash[setno].hashtable);
@@ -3326,7 +3397,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
}
if (use_hashing)
- aggstate->hashcontext = CreateWorkExprContext(estate);
+ hash_create_memory(aggstate);
ExecAssignExprContext(estate, &aggstate->ss.ps);
@@ -3621,9 +3692,6 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
Plan *outerplan = outerPlan(node);
uint64 totalGroups = 0;
- aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
- "HashAgg meta context",
- ALLOCSET_DEFAULT_SIZES);
aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc,
&TTSOpsMinimalTuple);
aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc,
@@ -4368,6 +4436,12 @@ ExecEndAgg(AggState *node)
MemoryContextDelete(node->hash_metacxt);
node->hash_metacxt = NULL;
}
+ if (node->hash_tablecxt != NULL)
+ {
+ MemoryContextDelete(node->hash_tablecxt);
+ node->hash_tablecxt = NULL;
+ }
+
for (transno = 0; transno < node->numtrans; transno++)
{
@@ -4484,6 +4558,7 @@ ExecReScanAgg(AggState *node)
node->hash_ngroups_current = 0;
ReScanExprContext(node->hashcontext);
+ MemoryContextReset(node->hash_tablecxt);
/* Rebuild an empty hash table */
build_hash_tables(node);
node->table_filled = false;
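
[Note: for context (not part of the patch), the reset-versus-delete split above follows the standard memory context lifecycle: ExecReScanAgg and agg_refill_hash_table reset hash_tablecxt so it can be refilled cheaply, while ExecEndAgg deletes it. A minimal stand-in illustration of that idiom:]

#include <stdlib.h>

/*
 * Hypothetical stand-in for a memory context, illustrating the idiom used
 * above: MemoryContextReset() frees the allocations but keeps the context
 * reusable (rescans, refilling from spill), while MemoryContextDelete()
 * destroys it entirely (ExecEndAgg).
 */
typedef struct SketchContext
{
    size_t allocated;
} SketchContext;

static void
sketch_reset(SketchContext *cxt)
{
    cxt->allocated = 0;         /* drop entries, keep the context */
}

static void
sketch_delete(SketchContext **cxt)
{
    free(*cxt);
    *cxt = NULL;                /* guarded delete-and-null, as in ExecEndAgg */
}

int
main(void)
{
    SketchContext *tablecxt = calloc(1, sizeof(SketchContext));

    tablecxt->allocated = 1024; /* entries built during one scan */
    sketch_reset(tablecxt);     /* rescan: table rebuilt into same context */

    if (tablecxt != NULL)
        sketch_delete(&tablecxt);
    return 0;
}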