1 files changed, 318 insertions, 15 deletions
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index fddfbf1d8c6..d713d5cade9 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -19,6 +19,7 @@
 
 #include "postgres.h"
 
+#include "access/parallel.h"
 #include "access/visibilitymap.h"
 #include "access/xact.h"
 #include "access/xlog.h"
@@ -29,9 +30,13 @@
 #include "miscadmin.h"
 #include "storage/freespace.h"
 #include "storage/smgr.h"
+#include "utils/hsearch.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
+/* GUC variables */
+int			wal_skip_threshold = 2048;	/* in kilobytes */
+
 /*
  * We keep a list of all relations (represented as RelFileNode values)
  * that have been created or deleted in the current transaction.  When
@@ -61,7 +66,42 @@ typedef struct PendingRelDelete
 	struct PendingRelDelete *next;	/* linked-list link */
 } PendingRelDelete;
 
+typedef struct PendingRelSync
+{
+	RelFileNode rnode;			/* hash key: the relfilenode being tracked */
+	bool		is_truncated;	/* Has the file experienced truncation? */
+} PendingRelSync;
+
 static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
+HTAB	   *pendingSyncHash = NULL;	/* RelFileNode -> PendingRelSync */
+
+
+/*
+ * AddPendingSync
+ *		Start tracking a new relfilenode in pendingSyncHash, building the
+ *		hash lazily in TopTransactionContext on first use.
+ */
+static void
+AddPendingSync(const RelFileNode *rnode)
+{
+	PendingRelSync *entry;
+	bool		existed;
+
+	if (pendingSyncHash == NULL)
+	{
+		HASHCTL		hashinfo;
+
+		hashinfo.hcxt = TopTransactionContext;
+		hashinfo.keysize = sizeof(RelFileNode);
+		hashinfo.entrysize = sizeof(PendingRelSync);
+		pendingSyncHash = hash_create("pending sync hash", 16, &hashinfo,
+									  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+	}
+
+	entry = hash_search(pendingSyncHash, rnode, HASH_ENTER, &existed);
+	Assert(!existed);
+	entry->is_truncated = false;
+}
 
 /*
  * RelationCreateStorage
@@ -82,6 +122,8 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
 	BackendId	backend;
 	bool		needs_wal;
 
+	Assert(!IsInParallelMode());	/* couldn't update pendingSyncHash */
+
 	switch (relpersistence)
 	{
 		case RELPERSISTENCE_TEMP:
@@ -117,6 +159,12 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
 	pending->next = pendingDeletes;
 	pendingDeletes = pending;
 
+	if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
+	{
+		Assert(backend == InvalidBackendId);
+		AddPendingSync(&rnode);
+	}
+
 	return srel;
 }
 
@@ -275,6 +323,8 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 		}
 	}
 
+	RelationPreTruncate(rel);
+
 	/*
 	 * We WAL-log the truncation before actually truncating, which means
 	 * trouble if the truncation fails. If we then crash, the WAL replay
@@ -326,6 +376,28 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 }
 
 /*
+ * RelationPreTruncate
+ *		Record an upcoming physical truncation in the pending-sync state.
+ *
+ * An access method whose relation_nontransactional_truncate bypasses
+ * RelationTruncate() must call this before decreasing the table size.
+ */
+void
+RelationPreTruncate(Relation rel)
+{
+	PendingRelSync *entry;
+
+	if (pendingSyncHash == NULL)
+		return;					/* no relfilenode is being tracked */
+
+	RelationOpenSmgr(rel);
+	entry = hash_search(pendingSyncHash, &rel->rd_smgr->smgr_rnode.node,
+						HASH_FIND, NULL);
+	if (entry != NULL)
+		entry->is_truncated = true;
+}
+
+/*
  * Copy a fork's data, block by block.
  *
  * Note that this requires that there is no dirty data in shared buffers. If
@@ -355,7 +427,9 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 
 	/*
 	 * We need to log the copied data in WAL iff WAL archiving/streaming is
-	 * enabled AND it's a permanent relation.
+	 * enabled AND it's a permanent relation.  This gives the same answer as
+	 * "RelationNeedsWAL(rel) || copying_initfork", because we know the
+	 * current operation created a new relfilenode.
 	 */
 	use_wal = XLogIsNeeded() &&
 		(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
@@ -397,25 +471,116 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 	}
 
 	/*
-	 * If the rel is WAL-logged, must fsync before commit.  We use heap_sync
-	 * to ensure that the toast table gets fsync'd too.  (For a temp or
-	 * unlogged rel we don't care since the data will be gone after a crash
-	 * anyway.)
-	 *
-	 * It's obvious that we must do this when not WAL-logging the copy. It's
-	 * less obvious that we have to do it even if we did WAL-log the copied
-	 * pages. The reason is that since we're copying outside shared buffers, a
-	 * CHECKPOINT occurring during the copy has no way to flush the previously
-	 * written data to disk (indeed it won't know the new rel even exists).  A
-	 * crash later on would replay WAL from the checkpoint, therefore it
-	 * wouldn't replay our earlier WAL entries. If we do not fsync those pages
-	 * here, they might still not be on disk when the crash occurs.
+	 * When we WAL-logged rel pages, we must nonetheless fsync them.  The
+	 * reason is that since we're copying outside shared buffers, a CHECKPOINT
+	 * occurring during the copy has no way to flush the previously written
+	 * data to disk (indeed it won't know the new rel even exists).  A crash
+	 * later on would replay WAL from the checkpoint, therefore it wouldn't
+	 * replay our earlier WAL entries. If we do not fsync those pages here,
+	 * they might still not be on disk when the crash occurs.
 	 */
-	if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
+	if (use_wal || copying_initfork)
 		smgrimmedsync(dst, forkNum);
 }
 
 /*
+ * RelFileNodeSkippingWAL
+ *		Report whether changes to a BM_PERMANENT relfilenode skip WAL.
+ *
+ * Changes of certain relfilenodes must not write WAL; see "Skipping WAL for
+ * New RelFileNode" in src/backend/access/transam/README.  Code holding a
+ * Relation can learn this more cheaply from it; this function serves the
+ * code paths that have only the RelFileNode.
+ */
+bool
+RelFileNodeSkippingWAL(RelFileNode rnode)
+{
+	/* With no hash at all, nothing is skipping WAL. */
+	if (pendingSyncHash == NULL)
+		return false;
+
+	return hash_search(pendingSyncHash, &rnode, HASH_FIND, NULL) != NULL;
+}
+
+/*
+ * EstimatePendingSyncsSpace
+ *		Bytes needed to pass syncs to workers, terminator slot included.
+ */
+Size
+EstimatePendingSyncsSpace(void)
+{
+	long		nentries = pendingSyncHash ?
+	hash_get_num_entries(pendingSyncHash) : 0;
+
+	return mul_size(nentries + 1, sizeof(RelFileNode));
+}
+
+/*
+ * SerializePendingSyncs
+ *		Serialize syncs for parallel workers: write the tracked relfilenodes,
+ *		minus those deleted at commit, zero-terminated, at startAddress.
+ */
+void
+SerializePendingSyncs(Size maxSize, char *startAddress)
+{
+	RelFileNode *out = (RelFileNode *) startAddress;
+
+	if (pendingSyncHash != NULL)
+	{
+		HASHCTL		hashinfo;
+		HTAB	   *live;
+		HASH_SEQ_STATUS seq;
+		PendingRelSync *entry;
+		PendingRelDelete *del;
+		RelFileNode *node;
+
+		/* scratch hash for the relfilenodes that remain of interest */
+		hashinfo.keysize = sizeof(RelFileNode);
+		hashinfo.entrysize = sizeof(RelFileNode);
+		hashinfo.hcxt = CurrentMemoryContext;
+		live = hash_create("tmp relfilenodes",
+						   hash_get_num_entries(pendingSyncHash), &hashinfo,
+						   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+		/* start from every pending sync ... */
+		hash_seq_init(&seq, pendingSyncHash);
+		while ((entry = (PendingRelSync *) hash_seq_search(&seq)) != NULL)
+			(void) hash_search(live, &entry->rnode, HASH_ENTER, NULL);
+
+		/* ... then drop whatever will be deleted at commit */
+		for (del = pendingDeletes; del != NULL; del = del->next)
+			if (del->atCommit)
+				(void) hash_search(live, (void *) &del->relnode,
+								   HASH_REMOVE, NULL);
+
+		hash_seq_init(&seq, live);
+		while ((node = (RelFileNode *) hash_seq_search(&seq)) != NULL)
+			*out++ = *node;
+		hash_destroy(live);
+	}
+
+	MemSet(out, 0, sizeof(RelFileNode));	/* all-zero terminator */
+}
+
+/*
+ * RestorePendingSyncs
+ *		Rebuild the pending-sync hash inside a parallel worker.
+ *
+ * Workers need RelationNeedsWAL() and RelFileNodeSkippingWAL() to answer
+ * correctly.  is_truncated is not restored, since only smgrDoPendingSyncs()
+ * reads it, at end of transaction.
+ */
+void
+RestorePendingSyncs(char *startAddress)
+{
+	RelFileNode *cur = (RelFileNode *) startAddress;
+
+	Assert(pendingSyncHash == NULL);
+	while (cur->relNode != 0)
+		AddPendingSync(cur++);
+}
+
+/*
  *	smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
  *
  * This also runs when aborting a subxact; we want to clean up a failed
@@ -493,6 +658,144 @@ smgrDoPendingDeletes(bool isCommit)
 }
 
 /*
+ *	smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
+ *
+ * On abort or in a parallel worker, pending syncs are just thrown away.  On
+ * commit, each tracked relfilenode not about to be deleted is either synced
+ * or, if small and never truncated, has all of its blocks WAL-logged.
+ */
+void
+smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
+{
+	PendingRelDelete *pending;
+	int			nrels = 0,
+				maxrels = 0;
+	SMgrRelation *srels = NULL;
+	HASH_SEQ_STATUS scan;
+	PendingRelSync *pendingsync;
+
+	Assert(GetCurrentTransactionNestLevel() == 1);
+
+	if (!pendingSyncHash)
+		return;					/* no relation needs sync */
+
+	/* Abort -- just throw away all pending syncs */
+	if (!isCommit)
+	{
+		pendingSyncHash = NULL;
+		return;
+	}
+
+	AssertPendingSyncs_RelationCache();
+
+	/* Parallel worker -- just throw away all pending syncs */
+	if (isParallelWorker)
+	{
+		pendingSyncHash = NULL;
+		return;
+	}
+
+	/* Skip syncing nodes that smgrDoPendingDeletes() will delete. */
+	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
+		if (pending->atCommit)
+			(void) hash_search(pendingSyncHash, (void *) &pending->relnode,
+							   HASH_REMOVE, NULL);
+
+	hash_seq_init(&scan, pendingSyncHash);
+	while ((pendingsync = (PendingRelSync *) hash_seq_search(&scan)))
+	{
+		ForkNumber	fork;
+		BlockNumber nblocks[MAX_FORKNUM + 1];
+		uint64		total_blocks = 0;	/* 64-bit so kB math can't wrap */
+		SMgrRelation srel;
+
+		srel = smgropen(pendingsync->rnode, InvalidBackendId);
+
+		/*
+		 * Files smaller than the GUC wal_skip_threshold get newpage WAL
+		 * records instead of an fsync, expecting faster commit: small WAL
+		 * records have a chance to be emitted along with other backends'
+		 * WAL records.  Measure every fork to find the total size.
+		 */
+		if (!pendingsync->is_truncated)
+		{
+			for (fork = 0; fork <= MAX_FORKNUM; fork++)
+			{
+				if (smgrexists(srel, fork))
+				{
+					BlockNumber n = smgrnblocks(srel, fork);
+
+					/* we shouldn't come here for unlogged relations */
+					Assert(fork != INIT_FORKNUM);
+					nblocks[fork] = n;
+					total_blocks += n;
+				}
+				else
+					nblocks[fork] = InvalidBlockNumber;
+			}
+		}
+
+		/*
+		 * Sync file or emit WAL records for its contents.
+		 *
+		 * A file that has experienced a truncation is synced regardless of
+		 * its size.  If a past, longer version of the file had been flushed
+		 * out and we emitted WAL instead of syncing, the file would be
+		 * followed by trailing garbage blocks after crash recovery.  Choosing
+		 * WAL whenever the main fork is longer than ever is not safe either,
+		 * because the FSM fork may have become shorter at the same time.
+		 */
+		if (pendingsync->is_truncated ||
+			total_blocks * BLCKSZ / 1024 >= wal_skip_threshold)
+		{
+			/* allocate the initial array, or extend it, if needed */
+			if (maxrels == 0)
+			{
+				maxrels = 8;
+				srels = palloc(sizeof(SMgrRelation) * maxrels);
+			}
+			else if (maxrels <= nrels)
+			{
+				maxrels *= 2;
+				srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
+			}
+
+			srels[nrels++] = srel;
+		}
+		else
+		{
+			/* Emit WAL records for all blocks.  The file is small enough. */
+			for (fork = 0; fork <= MAX_FORKNUM; fork++)
+			{
+				BlockNumber n = nblocks[fork];
+				Relation	rel;
+
+				if (!BlockNumberIsValid(n))
+					continue;
+
+				/*
+				 * Emit WAL for the whole file.  Unfortunately we don't know
+				 * what kind of a page this is, so we have to log the full
+				 * page including any unused space.  ReadBufferExtended()
+				 * counts some pgstat events; unfortunately, we discard them.
+				 */
+				rel = CreateFakeRelcacheEntry(srel->smgr_rnode.node);
+				log_newpage_range(rel, fork, 0, n, false);
+				FreeFakeRelcacheEntry(rel);
+			}
+		}
+	}
+
+	pendingSyncHash = NULL;
+
+	if (nrels > 0)
+	{
+		smgrdosyncall(srels, nrels);
+		pfree(srels);
+	}
+}
+
+/*
  * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
  *
  * The return value is the number of relations scheduled for termination.