aboutsummaryrefslogtreecommitdiff
path: root/src/backend/catalog/storage.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/catalog/storage.c')
-rw-r--r-- src/backend/catalog/storage.c 246
1 file changed, 15 insertions, 231 deletions
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 826a0e82af5..3cc886f7fe2 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -30,13 +30,9 @@
#include "catalog/storage_xlog.h"
#include "storage/freespace.h"
#include "storage/smgr.h"
-#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "utils/rel.h"
-/* GUC variables */
-int wal_skip_threshold = 2048; /* in kilobytes */
-
/*
* We keep a list of all relations (represented as RelFileNode values)
* that have been created or deleted in the current transaction. When
@@ -66,14 +62,7 @@ typedef struct PendingRelDelete
struct PendingRelDelete *next; /* linked-list link */
} PendingRelDelete;
-typedef struct pendingSync
-{
- RelFileNode rnode;
- bool is_truncated; /* Has the file experienced truncation? */
-} pendingSync;
-
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
-HTAB *pendingSyncHash = NULL;
/*
* RelationCreateStorage
@@ -129,32 +118,6 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
pending->next = pendingDeletes;
pendingDeletes = pending;
- /* Queue an at-commit sync. */
- if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
- {
- pendingSync *pending;
- bool found;
-
- /* we sync only permanent relations */
- Assert(backend == InvalidBackendId);
-
- if (!pendingSyncHash)
- {
- HASHCTL ctl;
-
- ctl.keysize = sizeof(RelFileNode);
- ctl.entrysize = sizeof(pendingSync);
- ctl.hcxt = TopTransactionContext;
- pendingSyncHash =
- hash_create("pending sync hash",
- 16, &ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
- }
-
- pending = hash_search(pendingSyncHash, &rnode, HASH_ENTER, &found);
- Assert(!found);
- pending->is_truncated = false;
- }
-
return srel;
}
@@ -289,8 +252,6 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
if (vm)
visibilitymap_truncate(rel, nblocks);
- RelationPreTruncate(rel);
-
/*
* We WAL-log the truncation before actually truncating, which means
* trouble if the truncation fails. If we then crash, the WAL replay
@@ -334,28 +295,6 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
}
/*
- * RelationPreTruncate
- * Perform AM-independent work before a physical truncation.
- *
- * If an access method's relation_nontransactional_truncate does not call
- * RelationTruncate(), it must call this before decreasing the table size.
- */
-void
-RelationPreTruncate(Relation rel)
-{
- pendingSync *pending;
-
- if (!pendingSyncHash)
- return;
- RelationOpenSmgr(rel);
-
- pending = hash_search(pendingSyncHash, &(rel->rd_smgr->smgr_rnode.node),
- HASH_FIND, NULL);
- if (pending)
- pending->is_truncated = true;
-}
-
-/*
* Copy a fork's data, block by block.
*
* Note that this requires that there is no dirty data in shared buffers. If
@@ -385,9 +324,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
/*
* We need to log the copied data in WAL iff WAL archiving/streaming is
- * enabled AND it's a permanent relation. This gives the same answer as
- * "RelationNeedsWAL(rel) || copying_initfork", because we know the
- * current operation created a new relfilenode.
+ * enabled AND it's a permanent relation.
*/
use_wal = XLogIsNeeded() &&
(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
@@ -429,40 +366,25 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
}
/*
- * When we WAL-logged rel pages, we must nonetheless fsync them. The
- * reason is that since we're copying outside shared buffers, a CHECKPOINT
- * occurring during the copy has no way to flush the previously written
- * data to disk (indeed it won't know the new rel even exists). A crash
- * later on would replay WAL from the checkpoint, therefore it wouldn't
- * replay our earlier WAL entries. If we do not fsync those pages here,
- * they might still not be on disk when the crash occurs.
+ * If the rel is WAL-logged, must fsync before commit. We use heap_sync
+ * to ensure that the toast table gets fsync'd too. (For a temp or
+ * unlogged rel we don't care since the data will be gone after a crash
+ * anyway.)
+ *
+ * It's obvious that we must do this when not WAL-logging the copy. It's
+ * less obvious that we have to do it even if we did WAL-log the copied
+ * pages. The reason is that since we're copying outside shared buffers, a
+ * CHECKPOINT occurring during the copy has no way to flush the previously
+ * written data to disk (indeed it won't know the new rel even exists). A
+ * crash later on would replay WAL from the checkpoint, therefore it
+ * wouldn't replay our earlier WAL entries. If we do not fsync those pages
+ * here, they might still not be on disk when the crash occurs.
*/
- if (use_wal || copying_initfork)
+ if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
smgrimmedsync(dst, forkNum);
}
/*
- * RelFileNodeSkippingWAL - check if a BM_PERMANENT relfilenode is using WAL
- *
- * Changes of certain relfilenodes must not write WAL; see "Skipping WAL for
- * New RelFileNode" in src/backend/access/transam/README. Though it is
- * known from Relation efficiently, this function is intended for the code
- * paths not having access to Relation.
- */
-bool
-RelFileNodeSkippingWAL(RelFileNode rnode)
-{
- if (XLogIsNeeded())
- return false; /* no permanent relfilenode skips WAL */
-
- if (!pendingSyncHash ||
- hash_search(pendingSyncHash, &rnode, HASH_FIND, NULL) == NULL)
- return false;
-
- return true;
-}
-
-/*
* smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
*
* This also runs when aborting a subxact; we want to clean up a failed
@@ -540,144 +462,6 @@ smgrDoPendingDeletes(bool isCommit)
}
/*
- * smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
- */
-void
-smgrDoPendingSyncs(bool isCommit)
-{
- PendingRelDelete *pending;
- int nrels = 0,
- maxrels = 0;
- SMgrRelation *srels = NULL;
- HASH_SEQ_STATUS scan;
- pendingSync *pendingsync;
-
- if (XLogIsNeeded())
- return; /* no relation can use this */
-
- Assert(GetCurrentTransactionNestLevel() == 1);
-
- if (!pendingSyncHash)
- return; /* no relation needs sync */
-
- /* Just throw away all pending syncs if any at rollback */
- if (!isCommit)
- {
- pendingSyncHash = NULL;
- return;
- }
-
- AssertPendingSyncs_RelationCache();
-
- /* Skip syncing nodes that smgrDoPendingDeletes() will delete. */
- for (pending = pendingDeletes; pending != NULL; pending = pending->next)
- {
- if (!pending->atCommit)
- continue;
-
- (void) hash_search(pendingSyncHash, (void *) &pending->relnode,
- HASH_REMOVE, NULL);
- }
-
- hash_seq_init(&scan, pendingSyncHash);
- while ((pendingsync = (pendingSync *) hash_seq_search(&scan)))
- {
- ForkNumber fork;
- BlockNumber nblocks[MAX_FORKNUM + 1];
- BlockNumber total_blocks = 0;
- SMgrRelation srel;
-
- srel = smgropen(pendingsync->rnode, InvalidBackendId);
-
- /*
- * We emit newpage WAL records for smaller relations.
- *
- * Small WAL records have a chance to be emitted along with other
- * backends' WAL records. We emit WAL records instead of syncing for
- * files that are smaller than a certain threshold, expecting faster
- * commit. The threshold is defined by the GUC wal_skip_threshold.
- */
- if (!pendingsync->is_truncated)
- {
- for (fork = 0; fork <= MAX_FORKNUM; fork++)
- {
- if (smgrexists(srel, fork))
- {
- BlockNumber n = smgrnblocks(srel, fork);
-
- /* we shouldn't come here for unlogged relations */
- Assert(fork != INIT_FORKNUM);
- nblocks[fork] = n;
- total_blocks += n;
- }
- else
- nblocks[fork] = InvalidBlockNumber;
- }
- }
-
- /*
- * Sync file or emit WAL records for its contents.
- *
- * Although we emit WAL record if the file is small enough, do file
- * sync regardless of the size if the file has experienced a
- * truncation. It is because the file would be followed by trailing
- * garbage blocks after a crash recovery if, while a past longer file
- * had been flushed out, we omitted syncing-out of the file and
- * emitted WAL instead. You might think that we could choose WAL if
- * the current main fork is longer than ever, but there's a case where
- * main fork is longer than ever but FSM fork gets shorter.
- */
- if (pendingsync->is_truncated ||
- total_blocks * BLCKSZ / 1024 >= wal_skip_threshold)
- {
- /* allocate the initial array, or extend it, if needed */
- if (maxrels == 0)
- {
- maxrels = 8;
- srels = palloc(sizeof(SMgrRelation) * maxrels);
- }
- else if (maxrels <= nrels)
- {
- maxrels *= 2;
- srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
- }
-
- srels[nrels++] = srel;
- }
- else
- {
- /* Emit WAL records for all blocks. The file is small enough. */
- for (fork = 0; fork <= MAX_FORKNUM; fork++)
- {
- int n = nblocks[fork];
- Relation rel;
-
- if (!BlockNumberIsValid(n))
- continue;
-
- /*
- * Emit WAL for the whole file. Unfortunately we don't know
- * what kind of a page this is, so we have to log the full
- * page including any unused space. ReadBufferExtended()
- * counts some pgstat events; unfortunately, we discard them.
- */
- rel = CreateFakeRelcacheEntry(srel->smgr_rnode.node);
- log_newpage_range(rel, fork, 0, n, false);
- FreeFakeRelcacheEntry(rel);
- }
- }
- }
-
- pendingSyncHash = NULL;
-
- if (nrels > 0)
- {
- smgrdosyncall(srels, nrels);
- pfree(srels);
- }
-}
-
-/*
* smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
*
* The return value is the number of relations scheduled for termination.