Diffstat (limited to 'src/backend/catalog/storage.c')
 src/backend/catalog/storage.c | 333 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 318 insertions(+), 15 deletions(-)
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index fddfbf1d8c6..d713d5cade9 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -19,6 +19,7 @@
#include "postgres.h"
+#include "access/parallel.h"
#include "access/visibilitymap.h"
#include "access/xact.h"
#include "access/xlog.h"
@@ -29,9 +30,13 @@
#include "miscadmin.h"
#include "storage/freespace.h"
#include "storage/smgr.h"
+#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "utils/rel.h"
+/* GUC variables */
+int wal_skip_threshold = 2048; /* in kilobytes */
+
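For orientation (illustration only, not part of the patch): the threshold compares a relation's total size, converted to kilobytes, against this GUC, so with the stock 8192-byte BLCKSZ the 2048kB default corresponds to 256 blocks. A minimal standalone sketch of the arithmetic, assuming those default values:

#include <stdio.h>

int
main(void)
{
	const int	blcksz = 8192;				/* assumed BLCKSZ */
	const int	wal_skip_threshold = 2048;	/* kilobytes; the default above */

	/* relations at or above this many blocks are fsync'd at commit */
	printf("sync instead of WAL at >= %d blocks\n",
		   wal_skip_threshold * 1024 / blcksz);
	return 0;
}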
/*
* We keep a list of all relations (represented as RelFileNode values)
* that have been created or deleted in the current transaction. When
@@ -61,7 +66,42 @@ typedef struct PendingRelDelete
struct PendingRelDelete *next; /* linked-list link */
} PendingRelDelete;
+typedef struct PendingRelSync
+{
+ RelFileNode rnode;
+ bool is_truncated; /* Has the file experienced truncation? */
+} PendingRelSync;
+
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
+HTAB *pendingSyncHash = NULL;	/* relfilenodes needing an at-commit sync */
+
+
+/*
+ * AddPendingSync
+ * Queue an at-commit fsync.
+ */
+static void
+AddPendingSync(const RelFileNode *rnode)
+{
+ PendingRelSync *pending;
+ bool found;
+
+ /* create the hash if it doesn't exist yet */
+ if (!pendingSyncHash)
+ {
+ HASHCTL ctl;
+
+ ctl.keysize = sizeof(RelFileNode);
+ ctl.entrysize = sizeof(PendingRelSync);
+ ctl.hcxt = TopTransactionContext;
+ pendingSyncHash = hash_create("pending sync hash", 16, &ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ }
+
+ pending = hash_search(pendingSyncHash, rnode, HASH_ENTER, &found);
+ Assert(!found);
+ pending->is_truncated = false;
+}
/*
* RelationCreateStorage
@@ -82,6 +122,8 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
BackendId backend;
bool needs_wal;
+ Assert(!IsInParallelMode()); /* couldn't update pendingSyncHash */
+
switch (relpersistence)
{
case RELPERSISTENCE_TEMP:
@@ -117,6 +159,12 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
pending->next = pendingDeletes;
pendingDeletes = pending;
+ if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
+ {
+ Assert(backend == InvalidBackendId);
+ AddPendingSync(&rnode);
+ }
+
return srel;
}
@@ -275,6 +323,8 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
}
}
+ RelationPreTruncate(rel);
+
/*
* We WAL-log the truncation before actually truncating, which means
* trouble if the truncation fails. If we then crash, the WAL replay
@@ -326,6 +376,28 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
}
/*
+ * RelationPreTruncate
+ * Perform AM-independent work before a physical truncation.
+ *
+ * If an access method's relation_nontransactional_truncate does not call
+ * RelationTruncate(), it must call this before decreasing the table size.
+ */
+void
+RelationPreTruncate(Relation rel)
+{
+ PendingRelSync *pending;
+
+ if (!pendingSyncHash)
+ return;
+ RelationOpenSmgr(rel);
+
+ pending = hash_search(pendingSyncHash, &(rel->rd_smgr->smgr_rnode.node),
+ HASH_FIND, NULL);
+ if (pending)
+ pending->is_truncated = true;
+}
+
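To illustrate the contract just stated, here is a sketch of a hypothetical access method's callback (the myam_ name is invented; the real heap AM goes through RelationTruncate() instead):

/*
 * Hypothetical AM callback that shortens the physical file itself.
 * It must tell storage.c first, so that a pending sync of this
 * relfilenode is not later optimized into WAL-logging a file that
 * once was longer.
 */
static void
myam_relation_nontransactional_truncate(Relation rel)
{
	RelationPreTruncate(rel);

	/* ... AM-specific code that physically truncates the forks ... */
}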
+/*
* Copy a fork's data, block by block.
*
* Note that this requires that there is no dirty data in shared buffers. If
@@ -355,7 +427,9 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
/*
* We need to log the copied data in WAL iff WAL archiving/streaming is
- * enabled AND it's a permanent relation.
+ * enabled AND it's a permanent relation. This gives the same answer as
+ * "RelationNeedsWAL(rel) || copying_initfork", because we know the
+ * current operation created a new relfilenode.
*/
use_wal = XLogIsNeeded() &&
(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
@@ -397,25 +471,116 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
}
/*
- * If the rel is WAL-logged, must fsync before commit. We use heap_sync
- * to ensure that the toast table gets fsync'd too. (For a temp or
- * unlogged rel we don't care since the data will be gone after a crash
- * anyway.)
- *
- * It's obvious that we must do this when not WAL-logging the copy. It's
- * less obvious that we have to do it even if we did WAL-log the copied
- * pages. The reason is that since we're copying outside shared buffers, a
- * CHECKPOINT occurring during the copy has no way to flush the previously
- * written data to disk (indeed it won't know the new rel even exists). A
- * crash later on would replay WAL from the checkpoint, therefore it
- * wouldn't replay our earlier WAL entries. If we do not fsync those pages
- * here, they might still not be on disk when the crash occurs.
+ * When we WAL-logged rel pages, we must nonetheless fsync them. The
+ * reason is that since we're copying outside shared buffers, a CHECKPOINT
+ * occurring during the copy has no way to flush the previously written
+ * data to disk (indeed it won't know the new rel even exists). A crash
+ * later on would replay WAL from the checkpoint, therefore it wouldn't
+ * replay our earlier WAL entries. If we do not fsync those pages here,
+ * they might still not be on disk when the crash occurs.
*/
- if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
+ if (use_wal || copying_initfork)
smgrimmedsync(dst, forkNum);
}
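To see the use_wal rule concretely, this standalone sketch (illustration only; plain flags stand in for XLogIsNeeded() and the relpersistence codes) enumerates the same expression:

#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
	const char *persistence[] = {"permanent", "unlogged", "temp"};

	for (int xlog_needed = 0; xlog_needed <= 1; xlog_needed++)
		for (int p = 0; p < 3; p++)
			for (int initfork = 0; initfork <= 1; initfork++)
			{
				/* mirrors: XLogIsNeeded() && (PERMANENT || copying_initfork) */
				bool		use_wal = xlog_needed && (p == 0 || initfork);

				printf("XLogIsNeeded=%d %-9s copying_initfork=%d -> use_wal=%d\n",
					   xlog_needed, persistence[p], initfork, use_wal);
			}
	return 0;
}

Note that a permanent relation under wal_level=minimal now gets neither WAL nor an immediate sync here; crash safety comes from the pending sync performed at commit.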
/*
+ * RelFileNodeSkippingWAL
+ * Check if a BM_PERMANENT relfilenode is using WAL.
+ *
+ * Changes of certain relfilenodes must not write WAL; see "Skipping WAL for
+ * New RelFileNode" in src/backend/access/transam/README. Although this can
+ * be determined efficiently from a Relation, this function serves code
+ * paths that don't have access to one.
+ */
+bool
+RelFileNodeSkippingWAL(RelFileNode rnode)
+{
+ if (!pendingSyncHash ||
+ hash_search(pendingSyncHash, &rnode, HASH_FIND, NULL) == NULL)
+ return false;
+
+ return true;
+}
+
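A hedged sketch of the sort of caller this is for: a buffer-level path that has only a buffer tag's RelFileNode, not a Relation (the variable names are illustrative, not a quote of a real call site):

/* deciding whether a hint-bit change may emit a WAL record */
if (RelFileNodeSkippingWAL(bufHdr->tag.rnode))
	return;					/* this relfilenode is skipping WAL */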
+/*
+ * EstimatePendingSyncsSpace
+ * Estimate space needed to pass syncs to parallel workers.
+ */
+Size
+EstimatePendingSyncsSpace(void)
+{
+ long entries;
+
+ entries = pendingSyncHash ? hash_get_num_entries(pendingSyncHash) : 0;
+ return mul_size(1 + entries, sizeof(RelFileNode));
+}
+
+/*
+ * SerializePendingSyncs
+ * Serialize syncs for parallel workers.
+ */
+void
+SerializePendingSyncs(Size maxSize, char *startAddress)
+{
+ HTAB *tmphash;
+ HASHCTL ctl;
+ HASH_SEQ_STATUS scan;
+ PendingRelSync *sync;
+ PendingRelDelete *delete;
+ RelFileNode *src;
+ RelFileNode *dest = (RelFileNode *) startAddress;
+
+ if (!pendingSyncHash)
+ goto terminate;
+
+ /* Create temporary hash to collect active relfilenodes */
+ ctl.keysize = sizeof(RelFileNode);
+ ctl.entrysize = sizeof(RelFileNode);
+ ctl.hcxt = CurrentMemoryContext;
+ tmphash = hash_create("tmp relfilenodes",
+ hash_get_num_entries(pendingSyncHash), &ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+ /* collect all rnodes from pending syncs */
+ hash_seq_init(&scan, pendingSyncHash);
+ while ((sync = (PendingRelSync *) hash_seq_search(&scan)))
+ (void) hash_search(tmphash, &sync->rnode, HASH_ENTER, NULL);
+
+ /* remove deleted rnodes */
+ for (delete = pendingDeletes; delete != NULL; delete = delete->next)
+ if (delete->atCommit)
+ (void) hash_search(tmphash, (void *) &delete->relnode,
+ HASH_REMOVE, NULL);
+
+ hash_seq_init(&scan, tmphash);
+ while ((src = (RelFileNode *) hash_seq_search(&scan)))
+ *dest++ = *src;
+
+ hash_destroy(tmphash);
+
+terminate:
+ MemSet(dest, 0, sizeof(RelFileNode));
+}
+
+/*
+ * RestorePendingSyncs
+ * Restore syncs within a parallel worker.
+ *
+ * RelationNeedsWAL() and RelFileNodeSkippingWAL() must give the correct
+ * answer in parallel workers. Only smgrDoPendingSyncs() reads the
+ * is_truncated field, and only at end of transaction, so don't restore it.
+ */
+void
+RestorePendingSyncs(char *startAddress)
+{
+ RelFileNode *rnode;
+
+ Assert(pendingSyncHash == NULL);
+ for (rnode = (RelFileNode *) startAddress; rnode->relNode != 0; rnode++)
+ AddPendingSync(rnode);
+}
+
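The three functions above share a wire format: a bare array of RelFileNode entries terminated by an all-zeroes entry, which is why EstimatePendingSyncsSpace() reserves (1 + entries) * sizeof(RelFileNode) bytes. A standalone model of the round trip, using an illustrative stand-in struct rather than the real RelFileNode:

#include <stdio.h>
#include <string.h>

typedef struct
{
	unsigned	spcNode;		/* tablespace */
	unsigned	dbNode;			/* database */
	unsigned	relNode;		/* relation; 0 marks the terminator */
} FileNode;

int
main(void)
{
	FileNode	buf[3];

	/* "serialize": two live entries plus the zeroed terminator */
	buf[0] = (FileNode) {1663, 5, 16384};
	buf[1] = (FileNode) {1663, 5, 16390};
	memset(&buf[2], 0, sizeof(FileNode));

	/* "restore": walk until relNode == 0, as RestorePendingSyncs() does */
	for (FileNode *n = buf; n->relNode != 0; n++)
		printf("re-queue sync for relfilenode %u\n", n->relNode);
	return 0;
}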
+/*
* smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
*
* This also runs when aborting a subxact; we want to clean up a failed
@@ -493,6 +658,144 @@ smgrDoPendingDeletes(bool isCommit)
}
/*
+ * smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
+ */
+void
+smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
+{
+ PendingRelDelete *pending;
+ int nrels = 0,
+ maxrels = 0;
+ SMgrRelation *srels = NULL;
+ HASH_SEQ_STATUS scan;
+ PendingRelSync *pendingsync;
+
+ Assert(GetCurrentTransactionNestLevel() == 1);
+
+ if (!pendingSyncHash)
+ return; /* no relation needs sync */
+
+ /* Abort -- just throw away all pending syncs */
+ if (!isCommit)
+ {
+ pendingSyncHash = NULL;
+ return;
+ }
+
+ AssertPendingSyncs_RelationCache();
+
+ /* Parallel worker -- just throw away all pending syncs */
+ if (isParallelWorker)
+ {
+ pendingSyncHash = NULL;
+ return;
+ }
+
+ /* Skip syncing nodes that smgrDoPendingDeletes() will delete. */
+ for (pending = pendingDeletes; pending != NULL; pending = pending->next)
+ if (pending->atCommit)
+ (void) hash_search(pendingSyncHash, (void *) &pending->relnode,
+ HASH_REMOVE, NULL);
+
+ hash_seq_init(&scan, pendingSyncHash);
+ while ((pendingsync = (PendingRelSync *) hash_seq_search(&scan)))
+ {
+ ForkNumber fork;
+ BlockNumber nblocks[MAX_FORKNUM + 1];
+ BlockNumber total_blocks = 0;
+ SMgrRelation srel;
+
+ srel = smgropen(pendingsync->rnode, InvalidBackendId);
+
+ /*
+ * We emit newpage WAL records for smaller relations.
+ *
+ * For files smaller than a certain threshold we emit WAL records
+ * instead of syncing, expecting a faster commit: small WAL records
+ * have a chance to be flushed together with other backends' WAL
+ * records. The threshold is defined by the GUC wal_skip_threshold.
+ */
+ if (!pendingsync->is_truncated)
+ {
+ for (fork = 0; fork <= MAX_FORKNUM; fork++)
+ {
+ if (smgrexists(srel, fork))
+ {
+ BlockNumber n = smgrnblocks(srel, fork);
+
+ /* we shouldn't come here for unlogged relations */
+ Assert(fork != INIT_FORKNUM);
+ nblocks[fork] = n;
+ total_blocks += n;
+ }
+ else
+ nblocks[fork] = InvalidBlockNumber;
+ }
+ }
+
+ /*
+ * Sync the file, or emit WAL records for its contents.
+ *
+ * Although we emit WAL records when the file is small enough, sync
+ * the file regardless of its size if it has experienced a
+ * truncation. Otherwise, if a longer version of the file had
+ * already been flushed to disk and we emitted WAL instead of
+ * syncing, crash recovery could leave trailing garbage blocks
+ * beyond the replayed length. You might think we could choose WAL
+ * whenever the main fork is longer than it has ever been, but the
+ * main fork can be at its longest while the FSM fork gets shorter.
+ */
+ if (pendingsync->is_truncated ||
+ total_blocks * BLCKSZ / 1024 >= wal_skip_threshold)
+ {
+ /* allocate the initial array, or extend it, if needed */
+ if (maxrels == 0)
+ {
+ maxrels = 8;
+ srels = palloc(sizeof(SMgrRelation) * maxrels);
+ }
+ else if (maxrels <= nrels)
+ {
+ maxrels *= 2;
+ srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
+ }
+
+ srels[nrels++] = srel;
+ }
+ else
+ {
+ /* The file is small enough, so emit WAL records for all its blocks. */
+ for (fork = 0; fork <= MAX_FORKNUM; fork++)
+ {
+ int n = nblocks[fork];
+ Relation rel;
+
+ if (!BlockNumberIsValid(n))
+ continue;
+
+ /*
+ * Emit WAL for the whole file. Unfortunately we don't know
+ * what kind of page this is, so we have to log the full page
+ * including any unused space. ReadBufferExtended() counts
+ * some pgstat events, which we simply discard here.
+ */
+ rel = CreateFakeRelcacheEntry(srel->smgr_rnode.node);
+ log_newpage_range(rel, fork, 0, n, false);
+ FreeFakeRelcacheEntry(rel);
+ }
+ }
+ }
+
+ pendingSyncHash = NULL;
+
+ if (nrels > 0)
+ {
+ smgrdosyncall(srels, nrels);
+ pfree(srels);
+ }
+}
+
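For context, a plausible shape of the end-of-transaction wiring (a hedged sketch; the exact call sites live in the transaction machinery and may differ in detail). The point of the ordering is that pending syncs must be durable before the commit record is written, while unlinks happen only after commit:

/* at commit (sketch) */
smgrDoPendingSyncs(true, is_parallel_worker);	/* WAL-log or fsync files */
/* ... the commit WAL record is written and flushed after this ... */
smgrDoPendingDeletes(true);						/* then unlink dropped files */

/* at abort (sketch): just discard the queued syncs */
smgrDoPendingSyncs(false, is_parallel_worker);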
+/*
* smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
*
* The return value is the number of relations scheduled for termination.