diff options
Diffstat (limited to 'src/backend/access')
-rw-r--r-- | src/backend/access/gist/gistbuild.c | 2 |
-rw-r--r-- | src/backend/access/gist/gistutil.c | 31 |
-rw-r--r-- | src/backend/access/gist/gistxlog.c | 20 |
-rw-r--r-- | src/backend/access/heap/heapam.c | 30 |
-rw-r--r-- | src/backend/access/heap/rewriteheap.c | 21 |
-rw-r--r-- | src/backend/access/nbtree/nbtsort.c | 41 |
-rw-r--r-- | src/backend/access/rmgrdesc/gistdesc.c | 6 |
-rw-r--r-- | src/backend/access/transam/README | 45 |
-rw-r--r-- | src/backend/access/transam/xact.c | 15 |
-rw-r--r-- | src/backend/access/transam/xlogutils.c | 18 |
10 files changed, 140 insertions, 89 deletions
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index ff888e2e01d..0444e3a1072 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -191,7 +191,7 @@ gistbuild(PG_FUNCTION_ARGS) PageSetLSN(page, recptr); } else - PageSetLSN(page, gistGetFakeLSN(heap)); + PageSetLSN(page, gistGetFakeLSN(index)); UnlockReleaseBuffer(buffer); diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 7d596a3e2e6..47cb7fde518 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -840,23 +840,44 @@ gistoptions(PG_FUNCTION_ARGS) } /* - * Temporary and unlogged GiST indexes are not WAL-logged, but we need LSNs - * to detect concurrent page splits anyway. This function provides a fake - * sequence of LSNs for that purpose. + * Some indexes are not WAL-logged, but we need LSNs to detect concurrent page + * splits anyway. This function provides a fake sequence of LSNs for that + * purpose. */ XLogRecPtr gistGetFakeLSN(Relation rel) { - static XLogRecPtr counter = 1; - if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP) { /* * Temporary relations are only accessible in our session, so a simple * backend-local counter will do. */ + static XLogRecPtr counter = 1; + return counter++; } + else if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT) + { + /* + * WAL-logging on this relation will start after commit, so its LSNs + * must be distinct numbers smaller than the LSN at the next commit. + * Emit a dummy WAL record if insert-LSN hasn't advanced after the + * last call. 
+ */ + static XLogRecPtr lastlsn = InvalidXLogRecPtr; + XLogRecPtr currlsn = GetXLogInsertRecPtr(); + + /* Shouldn't be called for WAL-logging relations */ + Assert(!RelationNeedsWAL(rel)); + + /* No need for an actual record if we already have a distinct LSN */ + if (!XLogRecPtrIsInvalid(lastlsn) && lastlsn == currlsn) + currlsn = gistXLogAssignLSN(); + + lastlsn = currlsn; + return currlsn; + } else { /* diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index fbdbb3c51f2..e002ad3d829 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -301,6 +301,9 @@ gist_redo(XLogReaderState *record) case XLOG_GIST_CREATE_INDEX: gistRedoCreateIndex(record); break; + case XLOG_GIST_ASSIGN_LSN: + /* nop. See gistGetFakeLSN(). */ + break; default: elog(PANIC, "gist_redo: unknown op code %u", info); } @@ -378,6 +381,23 @@ gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf, } /* + * Write an empty XLOG record to assign a distinct LSN. + */ +XLogRecPtr +gistXLogAssignLSN(void) +{ + int dummy = 0; + + /* + * Records other than SWITCH_WAL must have content. We use an integer 0 to + * follow the restriction. + */ + XLogBeginInsert(); + XLogRegisterData((char *) &dummy, sizeof(dummy)); + return XLogInsert(RM_GIST_ID, XLOG_GIST_ASSIGN_LSN); +} + +/* * Write XLOG record describing a page update. The update can include any * number of deletions and/or insertions of tuples on a single index page. 
* diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 9554704456c..f605c1abaee 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -27,7 +27,6 @@ * heap_multi_insert - insert multiple tuples into a relation * heap_delete - delete a tuple from a relation * heap_update - replace a tuple in a relation with another tuple - * heap_sync - sync heap, for when no WAL has been written * * NOTES * This file contains the heap_ routines which implement @@ -2103,12 +2102,6 @@ FreeBulkInsertState(BulkInsertState bistate) * The new tuple is stamped with current transaction ID and the specified * command ID. * - * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not - * logged in WAL, even for a non-temp relation. Safe usage of this behavior - * requires that we arrange that all new tuples go into new pages not - * containing any tuples from other transactions, and that the relation gets - * fsync'd before commit. (See also heap_sync() comments) - * * The HEAP_INSERT_SKIP_FSM option is passed directly to * RelationGetBufferForTuple, which see for more info. 
* @@ -2217,7 +2210,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, MarkBufferDirty(buffer); /* XLOG stuff */ - if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation)) + if (RelationNeedsWAL(relation)) { xl_heap_insert xlrec; xl_heap_header xlhdr; @@ -2425,7 +2418,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, /* currently not needed (thus unsupported) for heap_multi_insert() */ AssertArg(!(options & HEAP_INSERT_NO_LOGICAL)); - needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation); + needwal = RelationNeedsWAL(relation); saveFreeSpace = RelationGetTargetPageFreeSpace(relation, HEAP_DEFAULT_FILLFACTOR); @@ -8753,18 +8746,13 @@ heap2_redo(XLogReaderState *record) } /* - * heap_sync - sync a heap, for use when no WAL has been written - * - * This forces the heap contents (including TOAST heap if any) down to disk. - * If we skipped using WAL, and WAL is otherwise needed, we must force the - * relation down to disk before it's safe to commit the transaction. This - * requires writing out any dirty buffers and then doing a forced fsync. - * - * Indexes are not touched. (Currently, index operations associated with - * the commands that use this are WAL-logged and so do not need fsync. - * That behavior might change someday, but in any case it's likely that - * any fsync decisions required would be per-index and hence not appropriate - * to be done here.) + * heap_sync - for binary compatibility + * + * A newer PostgreSQL version removes this function. It exists here just in + * case an extension calls it. See "Skipping WAL for New RelFileNode" in + * src/backend/access/transam/README for the system that superseded it, + * allowing removal of most calls. Cases like copy_relation_data() should + * call smgrimmedsync() directly. 
*/ void heap_sync(Relation rel) diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 7f1b798f72b..e7a24732cc4 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -143,7 +143,6 @@ typedef struct RewriteStateData Page rs_buffer; /* page currently being built */ BlockNumber rs_blockno; /* block where page will go */ bool rs_buffer_valid; /* T if any tuples in buffer */ - bool rs_use_wal; /* must we WAL-log inserts? */ bool rs_logical_rewrite; /* do we need to do logical rewriting */ TransactionId rs_oldest_xmin; /* oldest xmin used by caller to * determine tuple visibility */ @@ -237,15 +236,13 @@ static void logical_end_heap_rewrite(RewriteState state); * oldest_xmin xid used by the caller to determine which tuples are dead * freeze_xid xid before which tuples will be frozen * min_multi multixact before which multis will be removed - * use_wal should the inserts to the new heap be WAL-logged? * * Returns an opaque RewriteState, allocated in current memory context, * to be used in subsequent calls to the other functions. 
*/ RewriteState begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin, - TransactionId freeze_xid, MultiXactId cutoff_multi, - bool use_wal) + TransactionId freeze_xid, MultiXactId cutoff_multi) { RewriteState state; MemoryContext rw_cxt; @@ -272,7 +269,6 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm /* new_heap needn't be empty, just locked */ state->rs_blockno = RelationGetNumberOfBlocks(new_heap); state->rs_buffer_valid = false; - state->rs_use_wal = use_wal; state->rs_oldest_xmin = oldest_xmin; state->rs_freeze_xid = freeze_xid; state->rs_cutoff_multi = cutoff_multi; @@ -331,7 +327,7 @@ end_heap_rewrite(RewriteState state) /* Write the last page, if any */ if (state->rs_buffer_valid) { - if (state->rs_use_wal) + if (RelationNeedsWAL(state->rs_new_rel)) log_newpage(&state->rs_new_rel->rd_node, MAIN_FORKNUM, state->rs_blockno, @@ -346,18 +342,14 @@ end_heap_rewrite(RewriteState state) } /* - * If the rel is WAL-logged, must fsync before commit. We use heap_sync - * to ensure that the toast table gets fsync'd too. - * - * It's obvious that we must do this when not WAL-logging. It's less - * obvious that we have to do it even if we did WAL-log the pages. The + * When we WAL-logged rel pages, we must nonetheless fsync them. The * reason is the same as in tablecmds.c's copy_relation_data(): we're * writing data that's not in shared buffers, and so a CHECKPOINT * occurring during the rewriteheap operation won't have fsync'd data we * wrote before the checkpoint. 
*/ if (RelationNeedsWAL(state->rs_new_rel)) - heap_sync(state->rs_new_rel); + smgrimmedsync(state->rs_new_rel->rd_smgr, MAIN_FORKNUM); logical_end_heap_rewrite(state); @@ -654,9 +646,6 @@ raw_heap_insert(RewriteState state, HeapTuple tup) { int options = HEAP_INSERT_SKIP_FSM; - if (!state->rs_use_wal) - options |= HEAP_INSERT_SKIP_WAL; - /* * While rewriting the heap for VACUUM FULL / CLUSTER, make sure data * for the TOAST table are not logically decoded. The main heap is @@ -695,7 +684,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) /* Doesn't fit, so write out the existing page */ /* XLOG stuff */ - if (state->rs_use_wal) + if (RelationNeedsWAL(state->rs_new_rel)) log_newpage(&state->rs_new_rel->rd_node, MAIN_FORKNUM, state->rs_blockno, diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index f95f67ad4b5..7b88e977196 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -40,18 +40,6 @@ * them. They will need to be re-read into shared buffers on first use after * the build finishes. * - * Since the index will never be used unless it is completely built, - * from a crash-recovery point of view there is no need to WAL-log the - * steps of the build. After completing the index build, we can just sync - * the whole file to disk using smgrimmedsync() before exiting this module. - * This can be seen to be sufficient for crash recovery by considering that - * it's effectively equivalent to what would happen if a CHECKPOINT occurred - * just after the index build. However, it is clearly not sufficient if the - * DBA is using the WAL log for PITR or replication purposes, since another - * machine would not be able to reconstruct the index from WAL. Therefore, - * we log the completed index pages to WAL if and only if WAL archiving is - * active. - * * This code isn't concerned about the FSM at all. The caller is responsible * for initializing that. 
* @@ -216,12 +204,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) wstate.heap = btspool->heap; wstate.index = btspool->index; - - /* - * We need to log index creation in WAL iff WAL archiving/streaming is - * enabled UNLESS the index isn't WAL-logged anyway. - */ - wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index); + wstate.btws_use_wal = RelationNeedsWAL(wstate.index); /* reserve the metapage */ wstate.btws_pages_alloced = BTREE_METAPAGE + 1; @@ -811,21 +794,15 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) _bt_uppershutdown(wstate, state); /* - * If the index is WAL-logged, we must fsync it down to disk before it's - * safe to commit the transaction. (For a non-WAL-logged index we don't - * care since the index will be uninteresting after a crash anyway.) - * - * It's obvious that we must do this when not WAL-logging the build. It's - * less obvious that we have to do it even if we did WAL-log the index - * pages. The reason is that since we're building outside shared buffers, - * a CHECKPOINT occurring during the build has no way to flush the - * previously written data to disk (indeed it won't know the index even - * exists). A crash later on would replay WAL from the checkpoint, - * therefore it wouldn't replay our earlier WAL entries. If we do not - * fsync those pages here, they might still not be on disk when the crash - * occurs. + * When we WAL-logged index pages, we must nonetheless fsync index files. + * Since we're building outside shared buffers, a CHECKPOINT occurring + * during the build has no way to flush the previously written data to + * disk (indeed it won't know the index even exists). A crash later on + * would replay WAL from the checkpoint, therefore it wouldn't replay our + * earlier WAL entries. If we do not fsync those pages here, they might + * still not be on disk when the crash occurs. 
*/ - if (RelationNeedsWAL(wstate->index)) + if (wstate->btws_use_wal) { RelationOpenSmgr(wstate->index); smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c index b199c6fa207..d53cdc6984f 100644 --- a/src/backend/access/rmgrdesc/gistdesc.c +++ b/src/backend/access/rmgrdesc/gistdesc.c @@ -46,6 +46,9 @@ gist_desc(StringInfo buf, XLogReaderState *record) break; case XLOG_GIST_CREATE_INDEX: break; + case XLOG_GIST_ASSIGN_LSN: + /* No details to write out */ + break; } } @@ -65,6 +68,9 @@ gist_identify(uint8 info) case XLOG_GIST_CREATE_INDEX: id = "CREATE_INDEX"; break; + case XLOG_GIST_ASSIGN_LSN: + id = "ASSIGN_LSN"; + break; } return id; diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 81b27a119a0..27322713a0c 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -714,6 +714,38 @@ then restart recovery. This is part of the reason for not writing a WAL entry until we've successfully done the original action. +Skipping WAL for New RelFileNode +-------------------------------- + +Under wal_level=minimal, if a change modifies a relfilenode that ROLLBACK +would unlink, in-tree access methods write no WAL for that change. Code that +writes WAL without calling RelationNeedsWAL() must check for this case. This +skipping is mandatory. If a WAL-writing change preceded a WAL-skipping change +for the same block, REDO could overwrite the WAL-skipping change. If a +WAL-writing change followed a WAL-skipping change for the same block, a +related problem would arise. When a WAL record contains no full-page image, +REDO expects the page to match its contents from just before record insertion. +A WAL-skipping change may not reach disk at all, violating REDO's expectation +under full_page_writes=off. For any access method, CommitTransaction() writes +and fsyncs affected blocks before recording the commit. 
+ +Prefer to do the same in future access methods. However, two other approaches +can work. First, an access method can irreversibly transition a given fork +from WAL-skipping to WAL-writing by calling FlushRelationBuffers() and +smgrimmedsync(). Second, an access method can opt to write WAL +unconditionally for permanent relations. Under these approaches, the access +method callbacks must not call functions that react to RelationNeedsWAL(). + +This applies only to WAL records whose replay would modify bytes stored in the +new relfilenode. It does not apply to other records about the relfilenode, +such as XLOG_SMGR_CREATE. Because it operates at the level of individual +relfilenodes, RelationNeedsWAL() can differ for tightly-coupled relations. +Consider "CREATE TABLE t (); BEGIN; ALTER TABLE t ADD c text; ..." in which +ALTER TABLE adds a TOAST relation. The TOAST relation will skip WAL, while +the table owning it will not. ALTER TABLE SET TABLESPACE will cause a table +to skip WAL, but that won't affect its indexes. + + Asynchronous Commit ------------------- @@ -813,13 +845,12 @@ Changes to a temp table are not WAL-logged, hence could reach disk in advance of T1's commit, but we don't care since temp table contents don't survive crashes anyway. -Database writes made via any of the paths we have introduced to avoid WAL -overhead for bulk updates are also safe. In these cases it's entirely -possible for the data to reach disk before T1's commit, because T1 will -fsync it down to disk without any sort of interlock, as soon as it finishes -the bulk update. However, all these paths are designed to write data that -no other transaction can see until after T1 commits. The situation is thus -not different from ordinary WAL-logged updates. +Database writes that skip WAL for new relfilenodes are also safe. In these +cases it's entirely possible for the data to reach disk before T1's commit, +because T1 will fsync it down to disk without any sort of interlock. 
However, +all these paths are designed to write data that no other transaction can see +until after T1 commits. The situation is thus not different from ordinary +WAL-logged updates. Transaction Emulation during Recovery ------------------------------------- diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 03cadb018f4..eeec2b669a3 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2018,6 +2018,13 @@ CommitTransaction(void) */ PreCommit_on_commit_actions(); + /* + * Synchronize files that are created and not WAL-logged during this + * transaction. This must happen before AtEOXact_RelationMap(), so that we + * don't see committed-but-broken files after a crash. + */ + smgrDoPendingSyncs(true); + /* close large objects before lower-level cleanup */ AtEOXact_LargeObject(true); @@ -2246,6 +2253,13 @@ PrepareTransaction(void) */ PreCommit_on_commit_actions(); + /* + * Synchronize files that are created and not WAL-logged during this + * transaction. This must happen before EndPrepare(), so that we don't see + * committed-but-broken files after a crash and COMMIT PREPARED. + */ + smgrDoPendingSyncs(true); + /* close large objects before lower-level cleanup */ AtEOXact_LargeObject(true); @@ -2542,6 +2556,7 @@ AbortTransaction(void) */ AfterTriggerEndXact(false); /* 'false' means it's abort */ AtAbort_Portals(); + smgrDoPendingSyncs(false); AtEOXact_LargeObject(false); AtAbort_Notify(); AtEOXact_RelationMap(false); diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index c0386d96889..45b1cbd8dcc 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -538,6 +538,8 @@ typedef FakeRelCacheEntryData *FakeRelCacheEntry; * fields related to physical storage, like rd_rel, are initialized, so the * fake entry is only usable in low-level operations like ReadBuffer(). 
* + * This is also used for syncing WAL-skipped files. + * * Caller must free the returned entry with FreeFakeRelcacheEntry(). */ Relation @@ -546,18 +548,20 @@ CreateFakeRelcacheEntry(RelFileNode rnode) FakeRelCacheEntry fakeentry; Relation rel; - Assert(InRecovery); - /* Allocate the Relation struct and all related space in one block. */ fakeentry = palloc0(sizeof(FakeRelCacheEntryData)); rel = (Relation) fakeentry; rel->rd_rel = &fakeentry->pgc; rel->rd_node = rnode; - /* We will never be working with temp rels during recovery */ + + /* + * We will never be working with temp rels during recovery or while + * syncing WAL-skipped files. + */ rel->rd_backend = InvalidBackendId; - /* It must be a permanent table if we're in recovery. */ + /* It must be a permanent table here */ rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT; /* We don't know the name of the relation; use relfilenode instead */ @@ -566,9 +570,9 @@ CreateFakeRelcacheEntry(RelFileNode rnode) /* * We set up the lockRelId in case anything tries to lock the dummy * relation. Note that this is fairly bogus since relNode may be - * different from the relation's OID. It shouldn't really matter though, - * since we are presumably running by ourselves and can't have any lock - * conflicts ... + * different from the relation's OID. It shouldn't really matter though. + * In recovery, we are running by ourselves and can't have any lock + * conflicts. While syncing, we already hold AccessExclusiveLock. */ rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode; rel->rd_lockInfo.lockRelId.relId = rnode.relNode; |