aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/backend/executor/execUtils.c17
-rw-r--r--src/backend/executor/nodeAgg.c111
-rw-r--r--src/include/nodes/execnodes.h5
3 files changed, 104 insertions, 29 deletions
diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c
index 55ab18fb826..772c86e70e9 100644
--- a/src/backend/executor/execUtils.c
+++ b/src/backend/executor/execUtils.c
@@ -322,19 +322,18 @@ CreateExprContext(EState *estate)
ExprContext *
CreateWorkExprContext(EState *estate)
{
- Size minContextSize = ALLOCSET_DEFAULT_MINSIZE;
- Size initBlockSize = ALLOCSET_DEFAULT_INITSIZE;
Size maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
- /* choose the maxBlockSize to be no larger than 1/16 of work_mem */
- while (maxBlockSize > work_mem * (Size) 1024 / 16)
- maxBlockSize >>= 1;
+ maxBlockSize = pg_prevpower2_size_t(work_mem * (Size) 1024 / 16);
- if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE)
- maxBlockSize = ALLOCSET_DEFAULT_INITSIZE;
+ /* But no bigger than ALLOCSET_DEFAULT_MAXSIZE */
+ maxBlockSize = Min(maxBlockSize, ALLOCSET_DEFAULT_MAXSIZE);
- return CreateExprContextInternal(estate, minContextSize,
- initBlockSize, maxBlockSize);
+ /* and no smaller than ALLOCSET_DEFAULT_INITSIZE */
+ maxBlockSize = Max(maxBlockSize, ALLOCSET_DEFAULT_INITSIZE);
+
+ return CreateExprContextInternal(estate, ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE, maxBlockSize);
}
/* ----------------
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index b4a7698a0b3..beccbfdc6fe 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -406,6 +406,7 @@ static void build_hash_tables(AggState *aggstate);
static void build_hash_table(AggState *aggstate, int setno, long nbuckets);
static void hashagg_recompile_expressions(AggState *aggstate, bool minslot,
bool nullcheck);
+static void hash_create_memory(AggState *aggstate);
static long hash_choose_num_buckets(double hashentrysize,
long ngroups, Size memory);
static int hash_choose_num_partitions(double input_groups,
@@ -1509,7 +1510,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
{
AggStatePerHash perhash = &aggstate->perhash[setno];
MemoryContext metacxt = aggstate->hash_metacxt;
- MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory;
+ MemoryContext tablecxt = aggstate->hash_tablecxt;
MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory;
Size additionalsize;
@@ -1535,7 +1536,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
nbuckets,
additionalsize,
metacxt,
- hashcxt,
+ tablecxt,
tmpcxt,
DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
}
@@ -1706,15 +1707,19 @@ hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
tupleWidth);
Size pergroupSize = numTrans * sizeof(AggStatePerGroupData);
- tupleChunkSize = CHUNKHDRSZ + tupleSize;
-
- if (pergroupSize > 0)
- pergroupChunkSize = CHUNKHDRSZ + pergroupSize;
- else
- pergroupChunkSize = 0;
+ /*
+ * Entries use the Bump allocator, so the chunk sizes are the same as the
+ * requested sizes.
+ */
+ tupleChunkSize = MAXALIGN(tupleSize);
+ pergroupChunkSize = pergroupSize;
+ /*
+ * Transition values use AllocSet, which has a chunk header and also uses
+ * power-of-two allocations.
+ */
if (transitionSpace > 0)
- transitionChunkSize = CHUNKHDRSZ + transitionSpace;
+ transitionChunkSize = CHUNKHDRSZ + pg_nextpower2_size_t(transitionSpace);
else
transitionChunkSize = 0;
@@ -1864,8 +1869,11 @@ hash_agg_check_limits(AggState *aggstate)
uint64 ngroups = aggstate->hash_ngroups_current;
Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
true);
- Size hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
- true);
+ Size entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt,
+ true);
+ Size tval_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
+ true);
+ Size total_mem = meta_mem + entry_mem + tval_mem;
bool do_spill = false;
#ifdef USE_INJECTION_POINTS
@@ -1884,7 +1892,7 @@ hash_agg_check_limits(AggState *aggstate)
* can be sure to make progress even in edge cases.
*/
if (aggstate->hash_ngroups_current > 0 &&
- (meta_mem + hashkey_mem > aggstate->hash_mem_limit ||
+ (total_mem > aggstate->hash_mem_limit ||
ngroups > aggstate->hash_ngroups_limit))
{
do_spill = true;
@@ -1939,6 +1947,7 @@ static void
hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
{
Size meta_mem;
+ Size entry_mem;
Size hashkey_mem;
Size buffer_mem;
Size total_mem;
@@ -1950,7 +1959,10 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
/* memory for the hash table itself */
meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true);
- /* memory for the group keys and transition states */
+ /* memory for hash entries */
+ entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, true);
+
+ /* memory for byref transition states */
hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true);
/* memory for read/write tape buffers, if spilled */
@@ -1959,7 +1971,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
buffer_mem += HASHAGG_READ_BUFFER_SIZE;
/* update peak mem */
- total_mem = meta_mem + hashkey_mem + buffer_mem;
+ total_mem = meta_mem + entry_mem + hashkey_mem + buffer_mem;
if (total_mem > aggstate->hash_mem_peak)
aggstate->hash_mem_peak = total_mem;
@@ -1982,6 +1994,64 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
}
/*
+ * Create memory contexts used for hash aggregation.
+ */
+static void
+hash_create_memory(AggState *aggstate)
+{
+ Size maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
+
+ /*
+ * The hashcontext's per-tuple memory will be used for byref transition
+ * values and returned by AggCheckCallContext().
+ */
+ aggstate->hashcontext = CreateWorkExprContext(aggstate->ss.ps.state);
+
+ /*
+ * The meta context will be used for the bucket array of
+ * TupleHashEntryData (or arrays, in the case of grouping sets). As the
+ * hash table grows, the bucket array will double in size and the old one
+ * will be freed, so an AllocSet is appropriate. For large bucket arrays,
+ * the large allocation path will be used, so it's not worth worrying
+ * about wasting space due to power-of-two allocations.
+ */
+ aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
+ "HashAgg meta context",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * The hash entries themselves, which include the grouping key
+ * (firstTuple) and pergroup data, are stored in the table context. The
+ * bump allocator can be used because the entries are not freed until the
+ * entire hash table is reset. The bump allocator is faster for
+ * allocations and avoids wasting space on the chunk header or
+ * power-of-two allocations.
+ *
+ * Like CreateWorkExprContext(), use smaller sizings for smaller work_mem,
+ * to avoid large jumps in memory usage.
+ */
+
+ /*
+ * Like CreateWorkExprContext(), use smaller sizings for smaller work_mem,
+ * to avoid large jumps in memory usage.
+ */
+ maxBlockSize = pg_prevpower2_size_t(work_mem * (Size) 1024 / 16);
+
+ /* But no bigger than ALLOCSET_DEFAULT_MAXSIZE */
+ maxBlockSize = Min(maxBlockSize, ALLOCSET_DEFAULT_MAXSIZE);
+
+ /* and no smaller than ALLOCSET_DEFAULT_INITSIZE */
+ maxBlockSize = Max(maxBlockSize, ALLOCSET_DEFAULT_INITSIZE);
+
+ aggstate->hash_tablecxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt,
+ "HashAgg table context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ maxBlockSize);
+
+}
+
+/*
* Choose a reasonable number of buckets for the initial hash table size.
*/
static long
@@ -2642,6 +2712,7 @@ agg_refill_hash_table(AggState *aggstate)
/* free memory and reset hash tables */
ReScanExprContext(aggstate->hashcontext);
+ MemoryContextReset(aggstate->hash_tablecxt);
for (int setno = 0; setno < aggstate->num_hashes; setno++)
ResetTupleHashTable(aggstate->perhash[setno].hashtable);
@@ -3326,7 +3397,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
}
if (use_hashing)
- aggstate->hashcontext = CreateWorkExprContext(estate);
+ hash_create_memory(aggstate);
ExecAssignExprContext(estate, &aggstate->ss.ps);
@@ -3621,9 +3692,6 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
Plan *outerplan = outerPlan(node);
uint64 totalGroups = 0;
- aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
- "HashAgg meta context",
- ALLOCSET_DEFAULT_SIZES);
aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc,
&TTSOpsMinimalTuple);
aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc,
@@ -4368,6 +4436,12 @@ ExecEndAgg(AggState *node)
MemoryContextDelete(node->hash_metacxt);
node->hash_metacxt = NULL;
}
+ if (node->hash_tablecxt != NULL)
+ {
+ MemoryContextDelete(node->hash_tablecxt);
+ node->hash_tablecxt = NULL;
+ }
+
for (transno = 0; transno < node->numtrans; transno++)
{
@@ -4484,6 +4558,7 @@ ExecReScanAgg(AggState *node)
node->hash_ngroups_current = 0;
ReScanExprContext(node->hashcontext);
+ MemoryContextReset(node->hash_tablecxt);
/* Rebuild an empty hash table */
build_hash_tables(node);
node->table_filled = false;
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index d4d4e655180..b5539ddb41e 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2560,7 +2560,8 @@ typedef struct AggState
/* these fields are used in AGG_HASHED and AGG_MIXED modes: */
bool table_filled; /* hash table filled yet? */
int num_hashes;
- MemoryContext hash_metacxt; /* memory for hash table itself */
+ MemoryContext hash_metacxt; /* memory for hash table bucket array */
+ MemoryContext hash_tablecxt; /* memory for hash table entries */
struct LogicalTapeSet *hash_tapeset; /* tape set for hash spill tapes */
struct HashAggSpill *hash_spills; /* HashAggSpill for each grouping set,
* exists only during first pass */
@@ -2586,7 +2587,7 @@ typedef struct AggState
* per-group pointers */
/* support for evaluation of agg input expressions: */
-#define FIELDNO_AGGSTATE_ALL_PERGROUPS 53
+#define FIELDNO_AGGSTATE_ALL_PERGROUPS 54
AggStatePerGroup *all_pergroups; /* array of first ->pergroups, than
* ->hash_pergroup */
SharedAggInfo *shared_info; /* one entry per worker */