diff options
Diffstat (limited to 'src/backend/access')
-rw-r--r-- | src/backend/access/gist/gistbuild.c | 2
-rw-r--r-- | src/backend/access/gist/gistutil.c | 31
-rw-r--r-- | src/backend/access/gist/gistxlog.c | 21
-rw-r--r-- | src/backend/access/heap/heapam.c | 30
-rw-r--r-- | src/backend/access/heap/rewriteheap.c | 21
-rw-r--r-- | src/backend/access/nbtree/nbtsort.c | 41
-rw-r--r-- | src/backend/access/rmgrdesc/gistdesc.c | 6
-rw-r--r-- | src/backend/access/transam/README | 45
-rw-r--r-- | src/backend/access/transam/xact.c | 15
-rw-r--r-- | src/backend/access/transam/xlogutils.c | 18
10 files changed, 89 insertions, 141 deletions
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 46d150f9d82..b9c4e27e1a5 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -191,7 +191,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) PageSetLSN(page, recptr); } else - PageSetLSN(page, gistGetFakeLSN(index)); + PageSetLSN(page, gistGetFakeLSN(heap)); UnlockReleaseBuffer(buffer); diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index d17965aa4b8..55cccd247a0 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -972,44 +972,23 @@ gistproperty(Oid index_oid, int attno, } /* - * Some indexes are not WAL-logged, but we need LSNs to detect concurrent page - * splits anyway. This function provides a fake sequence of LSNs for that - * purpose. + * Temporary and unlogged GiST indexes are not WAL-logged, but we need LSNs + * to detect concurrent page splits anyway. This function provides a fake + * sequence of LSNs for that purpose. */ XLogRecPtr gistGetFakeLSN(Relation rel) { + static XLogRecPtr counter = 1; + if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP) { /* * Temporary relations are only accessible in our session, so a simple * backend-local counter will do. */ - static XLogRecPtr counter = 1; - return counter++; } - else if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT) - { - /* - * WAL-logging on this relation will start after commit, so its LSNs - * must be distinct numbers smaller than the LSN at the next commit. - * Emit a dummy WAL record if insert-LSN hasn't advanced after the - * last call. 
- */ - static XLogRecPtr lastlsn = InvalidXLogRecPtr; - XLogRecPtr currlsn = GetXLogInsertRecPtr(); - - /* Shouldn't be called for WAL-logging relations */ - Assert(!RelationNeedsWAL(rel)); - - /* No need for an actual record if we already have a distinct LSN */ - if (!XLogRecPtrIsInvalid(lastlsn) && lastlsn == currlsn) - currlsn = gistXLogAssignLSN(); - - lastlsn = currlsn; - return currlsn; - } else { /* diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 93e64682131..17e213967b9 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -505,9 +505,6 @@ gist_redo(XLogReaderState *record) case XLOG_GIST_CREATE_INDEX: gistRedoCreateIndex(record); break; - case XLOG_GIST_ASSIGN_LSN: - /* nop. See gistGetFakeLSN(). */ - break; default: elog(PANIC, "gist_redo: unknown op code %u", info); } @@ -627,24 +624,6 @@ gistXLogSplit(bool page_is_leaf, } /* - * Write an empty XLOG record to assign a distinct LSN. - */ -XLogRecPtr -gistXLogAssignLSN(void) -{ - int dummy = 0; - - /* - * Records other than SWITCH_WAL must have content. We use an integer 0 to - * follow the restriction. - */ - XLogBeginInsert(); - XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); - XLogRegisterData((char *) &dummy, sizeof(dummy)); - return XLogInsert(RM_GIST_ID, XLOG_GIST_ASSIGN_LSN); -} - -/* * Write XLOG record describing a page update. The update can include any * number of deletions and/or insertions of tuples on a single index page. 
* diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index af16d2d2809..8ebf86f6878 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -27,6 +27,7 @@ * heap_multi_insert - insert multiple tuples into a relation * heap_delete - delete a tuple from a relation * heap_update - replace a tuple in a relation with another tuple + * heap_sync - sync heap, for when no WAL has been written * * NOTES * This file contains the heap_ routines which implement @@ -2395,6 +2396,12 @@ ReleaseBulkInsertStatePin(BulkInsertState bistate) * The new tuple is stamped with current transaction ID and the specified * command ID. * + * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not + * logged in WAL, even for a non-temp relation. Safe usage of this behavior + * requires that we arrange that all new tuples go into new pages not + * containing any tuples from other transactions, and that the relation gets + * fsync'd before commit. (See also heap_sync() comments) + * * The HEAP_INSERT_SKIP_FSM option is passed directly to * RelationGetBufferForTuple, which see for more info. 
* @@ -2503,7 +2510,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, MarkBufferDirty(buffer); /* XLOG stuff */ - if (RelationNeedsWAL(relation)) + if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation)) { xl_heap_insert xlrec; xl_heap_header xlhdr; @@ -2713,7 +2720,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, /* currently not needed (thus unsupported) for heap_multi_insert() */ AssertArg(!(options & HEAP_INSERT_NO_LOGICAL)); - needwal = RelationNeedsWAL(relation); + needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation); saveFreeSpace = RelationGetTargetPageFreeSpace(relation, HEAP_DEFAULT_FILLFACTOR); @@ -9413,13 +9420,18 @@ heap2_redo(XLogReaderState *record) } /* - * heap_sync - for binary compatibility - * - * A newer PostgreSQL version removes this function. It exists here just in - * case an extension calls it. See "Skipping WAL for New RelFileNode" in - * src/backend/access/transam/README for the system that superseded it, - * allowing removal of most calls. Cases like copy_relation_data() should - * call smgrimmedsync() directly. + * heap_sync - sync a heap, for use when no WAL has been written + * + * This forces the heap contents (including TOAST heap if any) down to disk. + * If we skipped using WAL, and WAL is otherwise needed, we must force the + * relation down to disk before it's safe to commit the transaction. This + * requires writing out any dirty buffers and then doing a forced fsync. + * + * Indexes are not touched. (Currently, index operations associated with + * the commands that use this are WAL-logged and so do not need fsync. + * That behavior might change someday, but in any case it's likely that + * any fsync decisions required would be per-index and hence not appropriate + * to be done here.) 
*/ void heap_sync(Relation rel) diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index dcb4d6877f5..9f0b586b5b6 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -145,6 +145,7 @@ typedef struct RewriteStateData Page rs_buffer; /* page currently being built */ BlockNumber rs_blockno; /* block where page will go */ bool rs_buffer_valid; /* T if any tuples in buffer */ + bool rs_use_wal; /* must we WAL-log inserts? */ bool rs_logical_rewrite; /* do we need to do logical rewriting */ TransactionId rs_oldest_xmin; /* oldest xmin used by caller to determine * tuple visibility */ @@ -238,13 +239,15 @@ static void logical_end_heap_rewrite(RewriteState state); * oldest_xmin xid used by the caller to determine which tuples are dead * freeze_xid xid before which tuples will be frozen * min_multi multixact before which multis will be removed + * use_wal should the inserts to the new heap be WAL-logged? * * Returns an opaque RewriteState, allocated in current memory context, * to be used in subsequent calls to the other functions. 
*/ RewriteState begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xmin, - TransactionId freeze_xid, MultiXactId cutoff_multi) + TransactionId freeze_xid, MultiXactId cutoff_multi, + bool use_wal) { RewriteState state; MemoryContext rw_cxt; @@ -269,6 +272,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm /* new_heap needn't be empty, just locked */ state->rs_blockno = RelationGetNumberOfBlocks(new_heap); state->rs_buffer_valid = false; + state->rs_use_wal = use_wal; state->rs_oldest_xmin = oldest_xmin; state->rs_freeze_xid = freeze_xid; state->rs_cutoff_multi = cutoff_multi; @@ -327,7 +331,7 @@ end_heap_rewrite(RewriteState state) /* Write the last page, if any */ if (state->rs_buffer_valid) { - if (RelationNeedsWAL(state->rs_new_rel)) + if (state->rs_use_wal) log_newpage(&state->rs_new_rel->rd_node, MAIN_FORKNUM, state->rs_blockno, @@ -342,14 +346,18 @@ end_heap_rewrite(RewriteState state) } /* - * When we WAL-logged rel pages, we must nonetheless fsync them. The + * If the rel is WAL-logged, must fsync before commit. We use heap_sync + * to ensure that the toast table gets fsync'd too. + * + * It's obvious that we must do this when not WAL-logging. It's less + * obvious that we have to do it even if we did WAL-log the pages. The * reason is the same as in tablecmds.c's copy_relation_data(): we're * writing data that's not in shared buffers, and so a CHECKPOINT * occurring during the rewriteheap operation won't have fsync'd data we * wrote before the checkpoint. 
*/ if (RelationNeedsWAL(state->rs_new_rel)) - smgrimmedsync(state->rs_new_rel->rd_smgr, MAIN_FORKNUM); + heap_sync(state->rs_new_rel); logical_end_heap_rewrite(state); @@ -647,6 +655,9 @@ raw_heap_insert(RewriteState state, HeapTuple tup) { int options = HEAP_INSERT_SKIP_FSM; + if (!state->rs_use_wal) + options |= HEAP_INSERT_SKIP_WAL; + /* * While rewriting the heap for VACUUM FULL / CLUSTER, make sure data * for the TOAST table are not logically decoded. The main heap is @@ -685,7 +696,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) /* Doesn't fit, so write out the existing page */ /* XLOG stuff */ - if (RelationNeedsWAL(state->rs_new_rel)) + if (state->rs_use_wal) log_newpage(&state->rs_new_rel->rd_node, MAIN_FORKNUM, state->rs_blockno, diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 58c702d7a08..dab41ea298a 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -31,6 +31,18 @@ * them. They will need to be re-read into shared buffers on first use after * the build finishes. * + * Since the index will never be used unless it is completely built, + * from a crash-recovery point of view there is no need to WAL-log the + * steps of the build. After completing the index build, we can just sync + * the whole file to disk using smgrimmedsync() before exiting this module. + * This can be seen to be sufficient for crash recovery by considering that + * it's effectively equivalent to what would happen if a CHECKPOINT occurred + * just after the index build. However, it is clearly not sufficient if the + * DBA is using the WAL log for PITR or replication purposes, since another + * machine would not be able to reconstruct the index from WAL. Therefore, + * we log the completed index pages to WAL if and only if WAL archiving is + * active. + * * This code isn't concerned about the FSM at all. The caller is responsible * for initializing that. 
* @@ -518,7 +530,12 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) wstate.heap = btspool->heap; wstate.index = btspool->index; - wstate.btws_use_wal = RelationNeedsWAL(wstate.index); + + /* + * We need to log index creation in WAL iff WAL archiving/streaming is + * enabled UNLESS the index isn't WAL-logged anyway. + */ + wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index); /* reserve the metapage */ wstate.btws_pages_alloced = BTREE_METAPAGE + 1; @@ -1173,15 +1190,21 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) _bt_uppershutdown(wstate, state); /* - * When we WAL-logged index pages, we must nonetheless fsync index files. - * Since we're building outside shared buffers, a CHECKPOINT occurring - * during the build has no way to flush the previously written data to - * disk (indeed it won't know the index even exists). A crash later on - * would replay WAL from the checkpoint, therefore it wouldn't replay our - * earlier WAL entries. If we do not fsync those pages here, they might - * still not be on disk when the crash occurs. + * If the index is WAL-logged, we must fsync it down to disk before it's + * safe to commit the transaction. (For a non-WAL-logged index we don't + * care since the index will be uninteresting after a crash anyway.) + * + * It's obvious that we must do this when not WAL-logging the build. It's + * less obvious that we have to do it even if we did WAL-log the index + * pages. The reason is that since we're building outside shared buffers, + * a CHECKPOINT occurring during the build has no way to flush the + * previously written data to disk (indeed it won't know the index even + * exists). A crash later on would replay WAL from the checkpoint, + * therefore it wouldn't replay our earlier WAL entries. If we do not + * fsync those pages here, they might still not be on disk when the crash + * occurs. 
*/ - if (wstate->btws_use_wal) + if (RelationNeedsWAL(wstate->index)) { RelationOpenSmgr(wstate->index); smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c index 8c44925442b..e5e925e0c5a 100644 --- a/src/backend/access/rmgrdesc/gistdesc.c +++ b/src/backend/access/rmgrdesc/gistdesc.c @@ -46,9 +46,6 @@ gist_desc(StringInfo buf, XLogReaderState *record) break; case XLOG_GIST_CREATE_INDEX: break; - case XLOG_GIST_ASSIGN_LSN: - /* No details to write out */ - break; } } @@ -68,9 +65,6 @@ gist_identify(uint8 info) case XLOG_GIST_CREATE_INDEX: id = "CREATE_INDEX"; break; - case XLOG_GIST_ASSIGN_LSN: - id = "ASSIGN_LSN"; - break; } return id; diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 28045f30876..ad4083eb6b5 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -717,38 +717,6 @@ then restart recovery. This is part of the reason for not writing a WAL entry until we've successfully done the original action. -Skipping WAL for New RelFileNode --------------------------------- - -Under wal_level=minimal, if a change modifies a relfilenode that ROLLBACK -would unlink, in-tree access methods write no WAL for that change. Code that -writes WAL without calling RelationNeedsWAL() must check for this case. This -skipping is mandatory. If a WAL-writing change preceded a WAL-skipping change -for the same block, REDO could overwrite the WAL-skipping change. If a -WAL-writing change followed a WAL-skipping change for the same block, a -related problem would arise. When a WAL record contains no full-page image, -REDO expects the page to match its contents from just before record insertion. -A WAL-skipping change may not reach disk at all, violating REDO's expectation -under full_page_writes=off. For any access method, CommitTransaction() writes -and fsyncs affected blocks before recording the commit. 
- -Prefer to do the same in future access methods. However, two other approaches -can work. First, an access method can irreversibly transition a given fork -from WAL-skipping to WAL-writing by calling FlushRelationBuffers() and -smgrimmedsync(). Second, an access method can opt to write WAL -unconditionally for permanent relations. Under these approaches, the access -method callbacks must not call functions that react to RelationNeedsWAL(). - -This applies only to WAL records whose replay would modify bytes stored in the -new relfilenode. It does not apply to other records about the relfilenode, -such as XLOG_SMGR_CREATE. Because it operates at the level of individual -relfilenodes, RelationNeedsWAL() can differ for tightly-coupled relations. -Consider "CREATE TABLE t (); BEGIN; ALTER TABLE t ADD c text; ..." in which -ALTER TABLE adds a TOAST relation. The TOAST relation will skip WAL, while -the table owning it will not. ALTER TABLE SET TABLESPACE will cause a table -to skip WAL, but that won't affect its indexes. - - Asynchronous Commit ------------------- @@ -852,12 +820,13 @@ Changes to a temp table are not WAL-logged, hence could reach disk in advance of T1's commit, but we don't care since temp table contents don't survive crashes anyway. -Database writes that skip WAL for new relfilenodes are also safe. In these -cases it's entirely possible for the data to reach disk before T1's commit, -because T1 will fsync it down to disk without any sort of interlock. However, -all these paths are designed to write data that no other transaction can see -until after T1 commits. The situation is thus not different from ordinary -WAL-logged updates. +Database writes made via any of the paths we have introduced to avoid WAL +overhead for bulk updates are also safe. In these cases it's entirely +possible for the data to reach disk before T1's commit, because T1 will +fsync it down to disk without any sort of interlock, as soon as it finishes +the bulk update. 
However, all these paths are designed to write data that +no other transaction can see until after T1 commits. The situation is thus +not different from ordinary WAL-logged updates. Transaction Emulation during Recovery ------------------------------------- diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 134c95c9bee..7c1771eae76 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2035,13 +2035,6 @@ CommitTransaction(void) */ PreCommit_on_commit_actions(); - /* - * Synchronize files that are created and not WAL-logged during this - * transaction. This must happen before AtEOXact_RelationMap(), so that we - * don't see committed-but-broken files after a crash. - */ - smgrDoPendingSyncs(true); - /* close large objects before lower-level cleanup */ AtEOXact_LargeObject(true); @@ -2271,13 +2264,6 @@ PrepareTransaction(void) */ PreCommit_on_commit_actions(); - /* - * Synchronize files that are created and not WAL-logged during this - * transaction. This must happen before EndPrepare(), so that we don't see - * committed-but-broken files after a crash and COMMIT PREPARED. - */ - smgrDoPendingSyncs(true); - /* close large objects before lower-level cleanup */ AtEOXact_LargeObject(true); @@ -2602,7 +2588,6 @@ AbortTransaction(void) */ AfterTriggerEndXact(false); /* 'false' means it's abort */ AtAbort_Portals(); - smgrDoPendingSyncs(false); AtEOXact_LargeObject(false); AtAbort_Notify(); AtEOXact_RelationMap(false); diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index e18abdfe525..4ecdc9220f0 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -544,8 +544,6 @@ typedef FakeRelCacheEntryData *FakeRelCacheEntry; * fields related to physical storage, like rd_rel, are initialized, so the * fake entry is only usable in low-level operations like ReadBuffer(). 
* - * This is also used for syncing WAL-skipped files. - * * Caller must free the returned entry with FreeFakeRelcacheEntry(). */ Relation @@ -554,20 +552,18 @@ CreateFakeRelcacheEntry(RelFileNode rnode) FakeRelCacheEntry fakeentry; Relation rel; + Assert(InRecovery); + /* Allocate the Relation struct and all related space in one block. */ fakeentry = palloc0(sizeof(FakeRelCacheEntryData)); rel = (Relation) fakeentry; rel->rd_rel = &fakeentry->pgc; rel->rd_node = rnode; - - /* - * We will never be working with temp rels during recovery or while - * syncing WAL-skipped files. - */ + /* We will never be working with temp rels during recovery */ rel->rd_backend = InvalidBackendId; - /* It must be a permanent table here */ + /* It must be a permanent table if we're in recovery. */ rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT; /* We don't know the name of the relation; use relfilenode instead */ @@ -576,9 +572,9 @@ CreateFakeRelcacheEntry(RelFileNode rnode) /* * We set up the lockRelId in case anything tries to lock the dummy * relation. Note that this is fairly bogus since relNode may be - * different from the relation's OID. It shouldn't really matter though. - * In recovery, we are running by ourselves and can't have any lock - * conflicts. While syncing, we already hold AccessExclusiveLock. + * different from the relation's OID. It shouldn't really matter though, + * since we are presumably running by ourselves and can't have any lock + * conflicts ... */ rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode; rel->rd_lockInfo.lockRelId.relId = rnode.relNode; |