about summary refs log tree commit diff
path: root/src/backend/access
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access')
-rw-r--r-- src/backend/access/gist/gistbuild.c 2
-rw-r--r-- src/backend/access/gist/gistutil.c 31
-rw-r--r-- src/backend/access/gist/gistxlog.c 21
-rw-r--r-- src/backend/access/heap/heapam.c 30
-rw-r--r-- src/backend/access/heap/rewriteheap.c 21
-rw-r--r-- src/backend/access/nbtree/nbtsort.c 41
-rw-r--r-- src/backend/access/rmgrdesc/gistdesc.c 6
-rw-r--r-- src/backend/access/transam/README 45
-rw-r--r-- src/backend/access/transam/xact.c 15
-rw-r--r-- src/backend/access/transam/xlogutils.c 18
10 files changed, 89 insertions, 141 deletions
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 46d150f9d82..b9c4e27e1a5 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -191,7 +191,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
PageSetLSN(page, recptr);
}
else
- PageSetLSN(page, gistGetFakeLSN(index));
+ PageSetLSN(page, gistGetFakeLSN(heap));
UnlockReleaseBuffer(buffer);
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index d17965aa4b8..55cccd247a0 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -972,44 +972,23 @@ gistproperty(Oid index_oid, int attno,
}
/*
- * Some indexes are not WAL-logged, but we need LSNs to detect concurrent page
- * splits anyway. This function provides a fake sequence of LSNs for that
- * purpose.
+ * Temporary and unlogged GiST indexes are not WAL-logged, but we need LSNs
+ * to detect concurrent page splits anyway. This function provides a fake
+ * sequence of LSNs for that purpose.
*/
XLogRecPtr
gistGetFakeLSN(Relation rel)
{
+ static XLogRecPtr counter = 1;
+
if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
{
/*
* Temporary relations are only accessible in our session, so a simple
* backend-local counter will do.
*/
- static XLogRecPtr counter = 1;
-
return counter++;
}
- else if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT)
- {
- /*
- * WAL-logging on this relation will start after commit, so its LSNs
- * must be distinct numbers smaller than the LSN at the next commit.
- * Emit a dummy WAL record if insert-LSN hasn't advanced after the
- * last call.
- */
- static XLogRecPtr lastlsn = InvalidXLogRecPtr;
- XLogRecPtr currlsn = GetXLogInsertRecPtr();
-
- /* Shouldn't be called for WAL-logging relations */
- Assert(!RelationNeedsWAL(rel));
-
- /* No need for an actual record if we already have a distinct LSN */
- if (!XLogRecPtrIsInvalid(lastlsn) && lastlsn == currlsn)
- currlsn = gistXLogAssignLSN();
-
- lastlsn = currlsn;
- return currlsn;
- }
else
{
/*
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 93e64682131..17e213967b9 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -505,9 +505,6 @@ gist_redo(XLogReaderState *record)
case XLOG_GIST_CREATE_INDEX:
gistRedoCreateIndex(record);
break;
- case XLOG_GIST_ASSIGN_LSN:
- /* nop. See gistGetFakeLSN(). */
- break;
default:
elog(PANIC, "gist_redo: unknown op code %u", info);
}
@@ -627,24 +624,6 @@ gistXLogSplit(bool page_is_leaf,
}
/*
- * Write an empty XLOG record to assign a distinct LSN.
- */
-XLogRecPtr
-gistXLogAssignLSN(void)
-{
- int dummy = 0;
-
- /*
- * Records other than SWITCH_WAL must have content. We use an integer 0 to
- * follow the restriction.
- */
- XLogBeginInsert();
- XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
- XLogRegisterData((char *) &dummy, sizeof(dummy));
- return XLogInsert(RM_GIST_ID, XLOG_GIST_ASSIGN_LSN);
-}
-
-/*
* Write XLOG record describing a page update. The update can include any
* number of deletions and/or insertions of tuples on a single index page.
*
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index af16d2d2809..8ebf86f6878 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -27,6 +27,7 @@
* heap_multi_insert - insert multiple tuples into a relation
* heap_delete - delete a tuple from a relation
* heap_update - replace a tuple in a relation with another tuple
+ * heap_sync - sync heap, for when no WAL has been written
*
* NOTES
* This file contains the heap_ routines which implement
@@ -2395,6 +2396,12 @@ ReleaseBulkInsertStatePin(BulkInsertState bistate)
* The new tuple is stamped with current transaction ID and the specified
* command ID.
*
+ * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not
+ * logged in WAL, even for a non-temp relation. Safe usage of this behavior
+ * requires that we arrange that all new tuples go into new pages not
+ * containing any tuples from other transactions, and that the relation gets
+ * fsync'd before commit. (See also heap_sync() comments)
+ *
* The HEAP_INSERT_SKIP_FSM option is passed directly to
* RelationGetBufferForTuple, which see for more info.
*
@@ -2503,7 +2510,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
MarkBufferDirty(buffer);
/* XLOG stuff */
- if (RelationNeedsWAL(relation))
+ if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
{
xl_heap_insert xlrec;
xl_heap_header xlhdr;
@@ -2713,7 +2720,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
/* currently not needed (thus unsupported) for heap_multi_insert() */
AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
- needwal = RelationNeedsWAL(relation);
+ needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
HEAP_DEFAULT_FILLFACTOR);
@@ -9413,13 +9420,18 @@ heap2_redo(XLogReaderState *record)
}
/*
- * heap_sync - for binary compatibility
- *
- * A newer PostgreSQL version removes this function. It exists here just in
- * case an extension calls it. See "Skipping WAL for New RelFileNode" in
- * src/backend/access/transam/README for the system that superseded it,
- * allowing removal of most calls. Cases like copy_relation_data() should
- * call smgrimmedsync() directly.
+ * heap_sync - sync a heap, for use when no WAL has been written
+ *
+ * This forces the heap contents (including TOAST heap if any) down to disk.
+ * If we skipped using WAL, and WAL is otherwise needed, we must force the
+ * relation down to disk before it's safe to commit the transaction. This
+ * requires writing out any dirty buffers and then doing a forced fsync.
+ *
+ * Indexes are not touched. (Currently, index operations associated with
+ * the commands that use this are WAL-logged and so do not need fsync.
+ * That behavior might change someday, but in any case it's likely that
+ * any fsync decisions required would be per-index and hence not appropriate
+ * to be done here.)
*/
void
heap_sync(Relation rel)
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index dcb4d6877f5..9f0b586b5b6 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -145,6 +145,7 @@ typedef struct RewriteStateData
Page rs_buffer; /* page currently being built */
BlockNumber rs_blockno; /* block where page will go */
bool rs_buffer_valid; /* T if any tuples in buffer */
+ bool rs_use_wal; /* must we WAL-log inserts? */
bool rs_logical_rewrite; /* do we need to do logical rewriting */
TransactionId rs_oldest_xmin; /* oldest xmin used by caller to determine
* tuple visibility */
@@ -238,13 +239,15 @@ static void logical_end_heap_rewrite(RewriteState state);
* oldest_xmin xid used by the caller to determine which tuples are dead
* freeze_xid xid before which tuples will be frozen
* min_multi multixact before which multis will be removed
+ * use_wal should the inserts to the new heap be WAL-logged?
*
* Returns an opaque RewriteState, allocated in current memory context,
* to be used in subsequent calls to the other functions.
*/
RewriteState
begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin,
- TransactionId freeze_xid, MultiXactId cutoff_multi)
+ TransactionId freeze_xid, MultiXactId cutoff_multi,
+ bool use_wal)
{
RewriteState state;
MemoryContext rw_cxt;
@@ -269,6 +272,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
/* new_heap needn't be empty, just locked */
state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
state->rs_buffer_valid = false;
+ state->rs_use_wal = use_wal;
state->rs_oldest_xmin = oldest_xmin;
state->rs_freeze_xid = freeze_xid;
state->rs_cutoff_multi = cutoff_multi;
@@ -327,7 +331,7 @@ end_heap_rewrite(RewriteState state)
/* Write the last page, if any */
if (state->rs_buffer_valid)
{
- if (RelationNeedsWAL(state->rs_new_rel))
+ if (state->rs_use_wal)
log_newpage(&state->rs_new_rel->rd_node,
MAIN_FORKNUM,
state->rs_blockno,
@@ -342,14 +346,18 @@ end_heap_rewrite(RewriteState state)
}
/*
- * When we WAL-logged rel pages, we must nonetheless fsync them. The
+ * If the rel is WAL-logged, must fsync before commit. We use heap_sync
+ * to ensure that the toast table gets fsync'd too.
+ *
+ * It's obvious that we must do this when not WAL-logging. It's less
+ * obvious that we have to do it even if we did WAL-log the pages. The
* reason is the same as in tablecmds.c's copy_relation_data(): we're
* writing data that's not in shared buffers, and so a CHECKPOINT
* occurring during the rewriteheap operation won't have fsync'd data we
* wrote before the checkpoint.
*/
if (RelationNeedsWAL(state->rs_new_rel))
- smgrimmedsync(state->rs_new_rel->rd_smgr, MAIN_FORKNUM);
+ heap_sync(state->rs_new_rel);
logical_end_heap_rewrite(state);
@@ -647,6 +655,9 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
{
int options = HEAP_INSERT_SKIP_FSM;
+ if (!state->rs_use_wal)
+ options |= HEAP_INSERT_SKIP_WAL;
+
/*
* While rewriting the heap for VACUUM FULL / CLUSTER, make sure data
* for the TOAST table are not logically decoded. The main heap is
@@ -685,7 +696,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
/* Doesn't fit, so write out the existing page */
/* XLOG stuff */
- if (RelationNeedsWAL(state->rs_new_rel))
+ if (state->rs_use_wal)
log_newpage(&state->rs_new_rel->rd_node,
MAIN_FORKNUM,
state->rs_blockno,
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 58c702d7a08..dab41ea298a 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -31,6 +31,18 @@
* them. They will need to be re-read into shared buffers on first use after
* the build finishes.
*
+ * Since the index will never be used unless it is completely built,
+ * from a crash-recovery point of view there is no need to WAL-log the
+ * steps of the build. After completing the index build, we can just sync
+ * the whole file to disk using smgrimmedsync() before exiting this module.
+ * This can be seen to be sufficient for crash recovery by considering that
+ * it's effectively equivalent to what would happen if a CHECKPOINT occurred
+ * just after the index build. However, it is clearly not sufficient if the
+ * DBA is using the WAL log for PITR or replication purposes, since another
+ * machine would not be able to reconstruct the index from WAL. Therefore,
+ * we log the completed index pages to WAL if and only if WAL archiving is
+ * active.
+ *
* This code isn't concerned about the FSM at all. The caller is responsible
* for initializing that.
*
@@ -518,7 +530,12 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
wstate.heap = btspool->heap;
wstate.index = btspool->index;
- wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
+
+ /*
+ * We need to log index creation in WAL iff WAL archiving/streaming is
+ * enabled UNLESS the index isn't WAL-logged anyway.
+ */
+ wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);
/* reserve the metapage */
wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
@@ -1173,15 +1190,21 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
_bt_uppershutdown(wstate, state);
/*
- * When we WAL-logged index pages, we must nonetheless fsync index files.
- * Since we're building outside shared buffers, a CHECKPOINT occurring
- * during the build has no way to flush the previously written data to
- * disk (indeed it won't know the index even exists). A crash later on
- * would replay WAL from the checkpoint, therefore it wouldn't replay our
- * earlier WAL entries. If we do not fsync those pages here, they might
- * still not be on disk when the crash occurs.
+ * If the index is WAL-logged, we must fsync it down to disk before it's
+ * safe to commit the transaction. (For a non-WAL-logged index we don't
+ * care since the index will be uninteresting after a crash anyway.)
+ *
+ * It's obvious that we must do this when not WAL-logging the build. It's
+ * less obvious that we have to do it even if we did WAL-log the index
+ * pages. The reason is that since we're building outside shared buffers,
+ * a CHECKPOINT occurring during the build has no way to flush the
+ * previously written data to disk (indeed it won't know the index even
+ * exists). A crash later on would replay WAL from the checkpoint,
+ * therefore it wouldn't replay our earlier WAL entries. If we do not
+ * fsync those pages here, they might still not be on disk when the crash
+ * occurs.
*/
- if (wstate->btws_use_wal)
+ if (RelationNeedsWAL(wstate->index))
{
RelationOpenSmgr(wstate->index);
smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c
index 8c44925442b..e5e925e0c5a 100644
--- a/src/backend/access/rmgrdesc/gistdesc.c
+++ b/src/backend/access/rmgrdesc/gistdesc.c
@@ -46,9 +46,6 @@ gist_desc(StringInfo buf, XLogReaderState *record)
break;
case XLOG_GIST_CREATE_INDEX:
break;
- case XLOG_GIST_ASSIGN_LSN:
- /* No details to write out */
- break;
}
}
@@ -68,9 +65,6 @@ gist_identify(uint8 info)
case XLOG_GIST_CREATE_INDEX:
id = "CREATE_INDEX";
break;
- case XLOG_GIST_ASSIGN_LSN:
- id = "ASSIGN_LSN";
- break;
}
return id;
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index 28045f30876..ad4083eb6b5 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -717,38 +717,6 @@ then restart recovery. This is part of the reason for not writing a WAL
entry until we've successfully done the original action.
-Skipping WAL for New RelFileNode
---------------------------------
-
-Under wal_level=minimal, if a change modifies a relfilenode that ROLLBACK
-would unlink, in-tree access methods write no WAL for that change. Code that
-writes WAL without calling RelationNeedsWAL() must check for this case. This
-skipping is mandatory. If a WAL-writing change preceded a WAL-skipping change
-for the same block, REDO could overwrite the WAL-skipping change. If a
-WAL-writing change followed a WAL-skipping change for the same block, a
-related problem would arise. When a WAL record contains no full-page image,
-REDO expects the page to match its contents from just before record insertion.
-A WAL-skipping change may not reach disk at all, violating REDO's expectation
-under full_page_writes=off. For any access method, CommitTransaction() writes
-and fsyncs affected blocks before recording the commit.
-
-Prefer to do the same in future access methods. However, two other approaches
-can work. First, an access method can irreversibly transition a given fork
-from WAL-skipping to WAL-writing by calling FlushRelationBuffers() and
-smgrimmedsync(). Second, an access method can opt to write WAL
-unconditionally for permanent relations. Under these approaches, the access
-method callbacks must not call functions that react to RelationNeedsWAL().
-
-This applies only to WAL records whose replay would modify bytes stored in the
-new relfilenode. It does not apply to other records about the relfilenode,
-such as XLOG_SMGR_CREATE. Because it operates at the level of individual
-relfilenodes, RelationNeedsWAL() can differ for tightly-coupled relations.
-Consider "CREATE TABLE t (); BEGIN; ALTER TABLE t ADD c text; ..." in which
-ALTER TABLE adds a TOAST relation. The TOAST relation will skip WAL, while
-the table owning it will not. ALTER TABLE SET TABLESPACE will cause a table
-to skip WAL, but that won't affect its indexes.
-
-
Asynchronous Commit
-------------------
@@ -852,12 +820,13 @@ Changes to a temp table are not WAL-logged, hence could reach disk in
advance of T1's commit, but we don't care since temp table contents don't
survive crashes anyway.
-Database writes that skip WAL for new relfilenodes are also safe. In these
-cases it's entirely possible for the data to reach disk before T1's commit,
-because T1 will fsync it down to disk without any sort of interlock. However,
-all these paths are designed to write data that no other transaction can see
-until after T1 commits. The situation is thus not different from ordinary
-WAL-logged updates.
+Database writes made via any of the paths we have introduced to avoid WAL
+overhead for bulk updates are also safe. In these cases it's entirely
+possible for the data to reach disk before T1's commit, because T1 will
+fsync it down to disk without any sort of interlock, as soon as it finishes
+the bulk update. However, all these paths are designed to write data that
+no other transaction can see until after T1 commits. The situation is thus
+not different from ordinary WAL-logged updates.
Transaction Emulation during Recovery
-------------------------------------
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 134c95c9bee..7c1771eae76 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -2035,13 +2035,6 @@ CommitTransaction(void)
*/
PreCommit_on_commit_actions();
- /*
- * Synchronize files that are created and not WAL-logged during this
- * transaction. This must happen before AtEOXact_RelationMap(), so that we
- * don't see committed-but-broken files after a crash.
- */
- smgrDoPendingSyncs(true);
-
/* close large objects before lower-level cleanup */
AtEOXact_LargeObject(true);
@@ -2271,13 +2264,6 @@ PrepareTransaction(void)
*/
PreCommit_on_commit_actions();
- /*
- * Synchronize files that are created and not WAL-logged during this
- * transaction. This must happen before EndPrepare(), so that we don't see
- * committed-but-broken files after a crash and COMMIT PREPARED.
- */
- smgrDoPendingSyncs(true);
-
/* close large objects before lower-level cleanup */
AtEOXact_LargeObject(true);
@@ -2602,7 +2588,6 @@ AbortTransaction(void)
*/
AfterTriggerEndXact(false); /* 'false' means it's abort */
AtAbort_Portals();
- smgrDoPendingSyncs(false);
AtEOXact_LargeObject(false);
AtAbort_Notify();
AtEOXact_RelationMap(false);
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index e18abdfe525..4ecdc9220f0 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -544,8 +544,6 @@ typedef FakeRelCacheEntryData *FakeRelCacheEntry;
* fields related to physical storage, like rd_rel, are initialized, so the
* fake entry is only usable in low-level operations like ReadBuffer().
*
- * This is also used for syncing WAL-skipped files.
- *
* Caller must free the returned entry with FreeFakeRelcacheEntry().
*/
Relation
@@ -554,20 +552,18 @@ CreateFakeRelcacheEntry(RelFileNode rnode)
FakeRelCacheEntry fakeentry;
Relation rel;
+ Assert(InRecovery);
+
/* Allocate the Relation struct and all related space in one block. */
fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
rel = (Relation) fakeentry;
rel->rd_rel = &fakeentry->pgc;
rel->rd_node = rnode;
-
- /*
- * We will never be working with temp rels during recovery or while
- * syncing WAL-skipped files.
- */
+ /* We will never be working with temp rels during recovery */
rel->rd_backend = InvalidBackendId;
- /* It must be a permanent table here */
+ /* It must be a permanent table if we're in recovery. */
rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT;
/* We don't know the name of the relation; use relfilenode instead */
@@ -576,9 +572,9 @@ CreateFakeRelcacheEntry(RelFileNode rnode)
/*
* We set up the lockRelId in case anything tries to lock the dummy
* relation. Note that this is fairly bogus since relNode may be
- * different from the relation's OID. It shouldn't really matter though.
- * In recovery, we are running by ourselves and can't have any lock
- * conflicts. While syncing, we already hold AccessExclusiveLock.
+ * different from the relation's OID. It shouldn't really matter though,
+ * since we are presumably running by ourselves and can't have any lock
+ * conflicts ...
*/
rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode;
rel->rd_lockInfo.lockRelId.relId = rnode.relNode;