1 files changed, 170 insertions, 5 deletions
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 82acadf85ba..d8c82d4e7c0 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -15,7 +15,10 @@
 #define HASHJOIN_H
 
 #include "nodes/execnodes.h"
+#include "port/atomics.h"
+#include "storage/barrier.h"
 #include "storage/buffile.h"
+#include "storage/lwlock.h"
 
 /* ----------------------------------------------------------------
  *				hash-join hash table structures
@@ -63,7 +66,12 @@
 
 typedef struct HashJoinTupleData
 {
-	struct HashJoinTupleData *next; /* link to next tuple in same bucket */
+	/* link to next tuple in same bucket: raw pointer or dsa_pointer form */
+	union
+	{
+		struct HashJoinTupleData *unshared;
+		dsa_pointer shared;
+	}			next;
 	uint32		hashvalue;		/* tuple's hash code */
 	/* Tuple data, in MinimalTuple format, follows on a MAXALIGN boundary */
 }			HashJoinTupleData;
@@ -112,8 +120,12 @@ typedef struct HashMemoryChunkData
 	size_t		maxlen;			/* size of the buffer holding the tuples */
 	size_t		used;			/* number of buffer bytes already used */
 
-	struct HashMemoryChunkData *next;	/* pointer to the next chunk (linked
-										 * list) */
+	/* pointer to the next chunk (linked list) */
+	union
+	{
+		struct HashMemoryChunkData *unshared;
+		dsa_pointer shared;
+	}			next;
 
 	char		data[FLEXIBLE_ARRAY_MEMBER];	/* buffer allocated at the end */
 }			HashMemoryChunkData;
@@ -121,8 +133,148 @@ typedef struct HashMemoryChunkData
 typedef struct HashMemoryChunkData *HashMemoryChunk;
 
 #define HASH_CHUNK_SIZE			(32 * 1024L)
+#define HASH_CHUNK_HEADER_SIZE (offsetof(HashMemoryChunkData, data))	/* bytes in a chunk before data[] */
 #define HASH_CHUNK_THRESHOLD	(HASH_CHUNK_SIZE / 4)
 
+/*
+ * For each batch of a Parallel Hash Join, we have a ParallelHashJoinBatch
+ * object in shared memory to coordinate access to it.  Each one is followed
+ * by variable-sized objects, so they are laid out contiguously but must be
+ * located with NthParallelHashJoinBatch() rather than indexed as an array.
+ */
+typedef struct ParallelHashJoinBatch
+{
+	dsa_pointer buckets;		/* array of hash table buckets */
+	Barrier		batch_barrier;	/* synchronization for joining this batch */
+
+	dsa_pointer chunks;			/* chunks of tuples loaded */
+	size_t		size;			/* size of buckets + chunks in memory */
+	size_t		estimated_size; /* size of buckets + chunks while writing */
+	size_t		ntuples;		/* number of tuples loaded */
+	size_t		old_ntuples;	/* number of tuples before repartitioning */
+	bool		space_exhausted;	/* presumably: batch cannot fit in the memory budget -- TODO confirm */
+
+	/*
+	 * Variable-sized SharedTuplestore objects follow this struct in memory.
+	 * See the accessor macros below.
+	 */
+} ParallelHashJoinBatch;
+
+/* Accessor for inner batch tuplestore following a ParallelHashJoinBatch. */
+#define ParallelHashJoinBatchInner(batch)							\
+	((SharedTuplestore *)											\
+	 ((char *) (batch) + MAXALIGN(sizeof(ParallelHashJoinBatch))))
+
+/* Accessor for outer batch tuplestore following a ParallelHashJoinBatch. */
+#define ParallelHashJoinBatchOuter(batch, nparticipants) \
+	((SharedTuplestore *)												\
+	 ((char *) ParallelHashJoinBatchInner(batch) +						\
+	  MAXALIGN(sts_estimate(nparticipants))))
+
+/* Total size of a ParallelHashJoinBatch plus its inner and outer tuplestores. */
+#define EstimateParallelHashJoinBatch(hashtable)						\
+	(MAXALIGN(sizeof(ParallelHashJoinBatch)) +							\
+	 MAXALIGN(sts_estimate((hashtable)->parallel_state->nparticipants)) * 2)
+
+/* Accessor for the nth ParallelHashJoinBatch given the base ('hashtable' must be in scope). */
+#define NthParallelHashJoinBatch(base, n)								\
+	((ParallelHashJoinBatch *)											\
+	 ((char *) (base) +													\
+	  EstimateParallelHashJoinBatch(hashtable) *  (n)))
+
+/*
+ * Each backend requires a small amount of per-batch state to interact with
+ * each ParallelHashJoinBatch.
+ */
+typedef struct ParallelHashJoinBatchAccessor
+{
+	ParallelHashJoinBatch *shared;	/* pointer to shared state */
+
+	/* Per-backend partial counters to reduce contention. */
+	size_t		preallocated;	/* pre-allocated space for this backend */
+	size_t		ntuples;		/* number of tuples */
+	size_t		size;			/* size of partition in memory */
+	size_t		estimated_size; /* size of partition on disk */
+	size_t		old_ntuples;	/* how many tuples before repartitioning? */
+	bool		at_least_one_chunk; /* has this backend allocated a chunk? */
+
+	bool		done;			/* flag to remember that a batch is done */
+	SharedTuplestoreAccessor *inner_tuples;
+	SharedTuplestoreAccessor *outer_tuples;
+} ParallelHashJoinBatchAccessor;
+
+/*
+ * While hashing the inner relation, any participant might determine that it's
+ * time to increase the number of buckets to reduce the load factor or batches
+ * to reduce the memory size.  This is indicated by setting the 'growth' field
+ * of ParallelHashJoinState to one of these values.
+ */
+typedef enum ParallelHashGrowth
+{
+	/* The current dimensions are sufficient. */
+	PHJ_GROWTH_OK,
+	/* The load factor is too high, so we need to add buckets. */
+	PHJ_GROWTH_NEED_MORE_BUCKETS,
+	/* The memory budget would be exhausted, so we need to repartition. */
+	PHJ_GROWTH_NEED_MORE_BATCHES,
+	/* Repartitioning didn't help last time, so don't try to do that again. */
+	PHJ_GROWTH_DISABLED
+} ParallelHashGrowth;
+
+/*
+ * The shared state used to coordinate a Parallel Hash Join.  This is stored
+ * in the DSM segment.
+ */
+typedef struct ParallelHashJoinState
+{
+	dsa_pointer batches;		/* array of ParallelHashJoinBatch */
+	dsa_pointer old_batches;	/* previous generation during repartition */
+	int			nbatch;			/* number of batches now */
+	int			old_nbatch;		/* previous number of batches */
+	int			nbuckets;		/* number of buckets */
+	ParallelHashGrowth growth;	/* control batch/bucket growth */
+	dsa_pointer chunk_work_queue;	/* chunk work queue */
+	int			nparticipants;	/* passed to sts_estimate() to size batches */
+	size_t		space_allowed;	/* presumably the memory budget -- TODO confirm */
+	size_t		total_tuples;	/* total number of inner tuples */
+	LWLock		lock;			/* lock protecting the above */
+
+	Barrier		build_barrier;	/* synchronization for the build phases */
+	Barrier		grow_batches_barrier;	/* uses PHJ_GROW_BATCHES_* phases */
+	Barrier		grow_buckets_barrier;	/* uses PHJ_GROW_BUCKETS_* phases */
+	pg_atomic_uint32 distributor;	/* counter for load balancing */
+
+	SharedFileSet fileset;		/* space for shared temporary files */
+} ParallelHashJoinState;
+
+/* The phases for building batches, used by build_barrier. */
+#define PHJ_BUILD_ELECTING				0
+#define PHJ_BUILD_ALLOCATING			1
+#define PHJ_BUILD_HASHING_INNER			2
+#define PHJ_BUILD_HASHING_OUTER			3
+#define PHJ_BUILD_DONE					4
+
+/* The phases for probing each batch, used by batch_barrier. */
+#define PHJ_BATCH_ELECTING				0
+#define PHJ_BATCH_ALLOCATING			1
+#define PHJ_BATCH_LOADING				2
+#define PHJ_BATCH_PROBING				3
+#define PHJ_BATCH_DONE					4
+
+/* The phases of batch growth while hashing, for grow_batches_barrier. */
+#define PHJ_GROW_BATCHES_ELECTING		0
+#define PHJ_GROW_BATCHES_ALLOCATING		1
+#define PHJ_GROW_BATCHES_REPARTITIONING 2
+#define PHJ_GROW_BATCHES_DECIDING		3
+#define PHJ_GROW_BATCHES_FINISHING		4
+#define PHJ_GROW_BATCHES_PHASE(n)		((n) % 5)	/* circular phases */
+
+/* The phases of bucket growth while hashing, for grow_buckets_barrier. */
+#define PHJ_GROW_BUCKETS_ELECTING		0
+#define PHJ_GROW_BUCKETS_ALLOCATING		1
+#define PHJ_GROW_BUCKETS_REINSERTING	2
+#define PHJ_GROW_BUCKETS_PHASE(n)		((n) % 3)	/* circular phases */
+
 typedef struct HashJoinTableData
 {
 	int			nbuckets;		/* # buckets in the in-memory hash table */
@@ -133,8 +285,13 @@ typedef struct HashJoinTableData
 	int			log2_nbuckets_optimal;	/* log2(nbuckets_optimal) */
 
 	/* buckets[i] is head of list of tuples in i'th in-memory bucket */
-	struct HashJoinTupleData **buckets;
-	/* buckets array is per-batch storage, as are all the tuples */
+	union
+	{
+		/* unshared array is per-batch storage, as are all the tuples */
+		struct HashJoinTupleData **unshared;
+		/* shared array is per-query DSA area, as are all the tuples */
+		dsa_pointer_atomic *shared;
+	}			buckets;
 
 	bool		keepNulls;		/* true to store unmatchable NULL tuples */
 
@@ -153,6 +310,7 @@ typedef struct HashJoinTableData
 	bool		growEnabled;	/* flag to shut off nbatch increases */
 
 	double		totalTuples;	/* # tuples obtained from inner plan */
+	double		partialTuples;	/* # tuples obtained from inner plan by me */
 	double		skewTuples;		/* # tuples inserted into skew tuples */
 
 	/*
@@ -185,6 +343,13 @@ typedef struct HashJoinTableData
 
 	/* used for dense allocation of tuples (into linked chunks) */
 	HashMemoryChunk chunks;		/* one list for the whole batch */
+
+	/* Shared and private state for Parallel Hash. */
+	HashMemoryChunk current_chunk;	/* this backend's current chunk */
+	dsa_area   *area;			/* DSA area to allocate memory from */
+	ParallelHashJoinState *parallel_state;
+	ParallelHashJoinBatchAccessor *batches;
+	dsa_pointer current_chunk_shared;
 }			HashJoinTableData;
 
 #endif							/* HASHJOIN_H */