diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 1999-05-18 21:33:06 +0000 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 1999-05-18 21:33:06 +0000 |
commit | 26069a58e8e4e4f3bef27e52d2d5cad2baa46c9f (patch) | |
tree | c40f7d3e130df1fce0e1fc73520b9e465c02a607 /src/include/executor/hashjoin.h | |
parent | d261a5ec861c001f0331e36e01499d8dde7f5c67 (diff) | |
download | postgresql-26069a58e8e4e4f3bef27e52d2d5cad2baa46c9f.tar.gz postgresql-26069a58e8e4e4f3bef27e52d2d5cad2baa46c9f.zip |
Rewrite hash join to use simple linked lists instead of a
fixed-size hashtable. This should prevent 'hashtable out of memory' errors,
unless you really do run out of memory. Note: target size for hashtable
is now taken from -S postmaster switch, not -B, since it is local memory
in the backend rather than shared memory.
Diffstat (limited to 'src/include/executor/hashjoin.h')
-rw-r--r-- | src/include/executor/hashjoin.h | 121 |
1 file changed, 64 insertions, 57 deletions
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h index 62e2164df31..751b5efee10 100644 --- a/src/include/executor/hashjoin.h +++ b/src/include/executor/hashjoin.h @@ -1,85 +1,92 @@ /*------------------------------------------------------------------------- * * hashjoin.h - * internal structures for hash table and buckets + * internal structures for hash joins * * * Copyright (c) 1994, Regents of the University of California * - * $Id: hashjoin.h,v 1.10 1999/05/09 00:53:18 tgl Exp $ + * $Id: hashjoin.h,v 1.11 1999/05/18 21:33:04 tgl Exp $ * *------------------------------------------------------------------------- */ #ifndef HASHJOIN_H #define HASHJOIN_H -#include <storage/ipc.h> +#include "access/htup.h" +#include "storage/fd.h" +#include "utils/mcxt.h" -/* ----------------- - * have to use relative address as pointers in the hashtable - * because the hashtable may reallocate in different processes +/* ---------------------------------------------------------------- + * hash-join hash table structures + * + * Each active hashjoin has a HashJoinTable control block which is + * palloc'd in the executor's context. All other storage needed for + * the hashjoin is kept in a private "named portal", one for each hashjoin. + * This makes it easy and fast to release the storage when we don't need it + * anymore. + * + * The portal manager guarantees that portals will be discarded at end of + * transaction, so we have no problem with a memory leak if the join is + * aborted early by an error. (Likewise, any temporary files we make will + * be cleaned up by the virtual file manager in event of an error.) * - * XXX: this relative-address stuff is useless on all supported platforms - * and is a ever-dangerous source of bugs. Really ought to rip it out. 
- * ----------------- + * Storage that should live through the entire join is allocated from the + * portal's "variable context", while storage that is only wanted for the + * current batch is allocated in the portal's "heap context". By popping + * the portal's heap at the end of a batch, we free all the per-batch storage + * reliably and without tedium. + * ---------------------------------------------------------------- */ -typedef int RelativeAddr; -/* ------------------ - * The relative addresses are always relative to the head of the - * hashtable, the following macros convert them to/from absolute address. - * NULL is represented as -1 (CAUTION: RELADDR() doesn't handle that!). - * CAUTION: ABSADDR evaluates its arg twice!! - * ------------------ - */ -#define ABSADDR(X) ((X) < 0 ? (char*) NULL : (char*)hashtable + (X)) -#define RELADDR(X) ((RelativeAddr)((char*)(X) - (char*)hashtable)) +typedef struct HashJoinTupleData +{ + struct HashJoinTupleData *next; /* link to next tuple in same bucket */ + HeapTupleData htup; /* tuple header */ +} HashJoinTupleData; -typedef char **charPP; -typedef int *intP; +typedef HashJoinTupleData *HashJoinTuple; -/* ---------------------------------------------------------------- - * hash-join hash table structures - * ---------------------------------------------------------------- - */ typedef struct HashTableData { - int nbuckets; - int totalbuckets; - int bucketsize; - IpcMemoryId shmid; - RelativeAddr top; /* char* */ - RelativeAddr bottom; /* char* */ - RelativeAddr overflownext; /* char* */ - RelativeAddr batch; /* char* */ - RelativeAddr readbuf; /* char* */ - int nbatch; - RelativeAddr outerbatchPos; /* RelativeAddr* */ - RelativeAddr innerbatchPos; /* RelativeAddr* */ - RelativeAddr innerbatchSizes; /* int* */ - int curbatch; - int nprocess; - int pcount; -} HashTableData; /* real hash table follows here */ + int nbuckets; /* buckets in use during this batch */ + int totalbuckets; /* total number of (virtual) buckets 
*/ + HashJoinTuple *buckets; /* buckets[i] is head of list of tuples */ + /* buckets array is per-batch storage, as are all the tuples */ -typedef HashTableData *HashJoinTable; + int nbatch; /* number of batches; 0 means 1-pass join */ + int curbatch; /* current batch #, or 0 during 1st pass */ -typedef struct OverflowTupleData -{ - RelativeAddr tuple; /* HeapTuple */ - RelativeAddr next; /* struct OverflowTupleData * */ -} OverflowTupleData; /* real tuple follows here */ + /* all these arrays are allocated for the life of the hash join, + * but only if nbatch > 0: + */ + BufFile **innerBatchFile; /* buffered virtual temp file per batch */ + BufFile **outerBatchFile; /* buffered virtual temp file per batch */ + long *outerBatchSize; /* count of tuples in each outer batch file */ + long *innerBatchSize; /* count of tuples in each inner batch file */ -typedef OverflowTupleData *OverflowTuple; + /* During 1st scan of inner relation, we get tuples from executor. + * If nbatch > 0 then tuples that don't belong in first nbuckets logical + * buckets get dumped into inner-batch temp files. + * The same statements apply for the 1st scan of the outer relation, + * except we write tuples to outer-batch temp files. + * If nbatch > 0 then we do the following for each batch: + * 1. Read tuples from inner batch file, load into hash buckets. + * 2. Read tuples from outer batch file, match to hash buckets and output. + */ -typedef struct HashBucketData -{ - RelativeAddr top; /* HeapTuple */ - RelativeAddr bottom; /* HeapTuple */ - RelativeAddr firstotuple; /* OverflowTuple */ - RelativeAddr lastotuple; /* OverflowTuple */ -} HashBucketData; /* real bucket follows here */ + /* Ugly kluge: myPortal ought to be declared as type Portal (ie, PortalD*) + * but if we try to include utils/portal.h here, we end up with a + * circular dependency of include files! Until the various node.h files + * are restructured in a cleaner way, we have to fake it. 
The most + * reliable fake seems to be to declare myPortal as void * and then + * cast it to the right things in nodeHash.c. + */ + void *myPortal; /* where to keep working storage */ + MemoryContext hashCxt; /* context for whole-hash-join storage */ + MemoryContext batchCxt; /* context for this-batch-only storage */ +} HashTableData; -typedef HashBucketData *HashBucket; +typedef HashTableData *HashJoinTable; #endif /* HASHJOIN_H */ |