diff options
author | Noah Misch <noah@leadboat.com> | 2020-03-22 09:24:09 -0700 |
---|---|---|
committer | Noah Misch <noah@leadboat.com> | 2020-03-22 09:24:13 -0700 |
commit | 63631ee64f84bae7feeb06b8417b628e82319ce2 (patch) | |
tree | 35d8fe6f43ea3c9d7489062474c26ba599973c6b /src/backend/commands | |
parent | e4b0a02ef8c8c69f2a49f4ca86de12ef34e97ac3 (diff) | |
download | postgresql-63631ee64f84bae7feeb06b8417b628e82319ce2.tar.gz postgresql-63631ee64f84bae7feeb06b8417b628e82319ce2.zip |
Revert "Skip WAL for new relfilenodes, under wal_level=minimal."
This reverts commit cb2fd7eac285b1b0a24eeb2b8ed4456b66c5a09f. Per
numerous buildfarm members, it was incompatible with parallel query, and
a test case assumed LP64. Back-patch to 9.5 (all supported versions).
Discussion: https://postgr.es/m/20200321224920.GB1763544@rfd.leadboat.com
Diffstat (limited to 'src/backend/commands')
-rw-r--r-- | src/backend/commands/cluster.c | 19 | ||||
-rw-r--r-- | src/backend/commands/copy.c | 58 | ||||
-rw-r--r-- | src/backend/commands/createas.c | 11 | ||||
-rw-r--r-- | src/backend/commands/indexcmds.c | 2 | ||||
-rw-r--r-- | src/backend/commands/matview.c | 12 | ||||
-rw-r--r-- | src/backend/commands/tablecmds.c | 26 |
6 files changed, 78 insertions, 50 deletions
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index b53f89ee71b..c7fda21634d 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -1112,25 +1112,6 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, } /* - * Recognize that rel1's relfilenode (swapped from rel2) is new in this - * subtransaction. The rel2 storage (swapped from rel1) may or may not be - * new. - */ - { - Relation rel1, - rel2; - - rel1 = relation_open(r1, NoLock); - rel2 = relation_open(r2, NoLock); - rel2->rd_createSubid = rel1->rd_createSubid; - rel2->rd_newRelfilenodeSubid = rel1->rd_newRelfilenodeSubid; - rel2->rd_firstRelfilenodeSubid = rel1->rd_firstRelfilenodeSubid; - RelationAssumeNewRelfilenode(rel1); - relation_close(rel1, NoLock); - relation_close(rel2, NoLock); - } - - /* * In the case of a shared catalog, these next few steps will only affect * our own database's pg_class row; but that's okay, because they are all * noncritical updates. That's also an important fact for the case of a diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 57c35bd6aa0..4f04d122c30 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2715,15 +2715,63 @@ CopyFrom(CopyState cstate) RelationGetRelationName(cstate->rel)))); } - /* - * If the target file is new-in-transaction, we assume that checking FSM - * for free space is a waste of time. This could possibly be wrong, but - * it's unlikely. + /*---------- + * Check to see if we can avoid writing WAL + * + * If archive logging/streaming is not enabled *and* either + * - table was created in same transaction as this COPY + * - data is being written to relfilenode created in this transaction + * then we can skip writing WAL. It's safe because if the transaction + * doesn't commit, we'll discard the table (or the new relfilenode file). + * If it does commit, we'll have done the table_finish_bulk_insert() at + * the bottom of this routine first. + * + * As mentioned in comments in utils/rel.h, the in-same-transaction test + * is not always set correctly, since in rare cases rd_newRelfilenodeSubid + * can be cleared before the end of the transaction. The exact case is + * when a relation sets a new relfilenode twice in same transaction, yet + * the second one fails in an aborted subtransaction, e.g. + * + * BEGIN; + * TRUNCATE t; + * SAVEPOINT save; + * TRUNCATE t; + * ROLLBACK TO save; + * COPY ... + * + * Also, if the target file is new-in-transaction, we assume that checking + * FSM for free space is a waste of time, even if we must use WAL because + * of archiving. This could possibly be wrong, but it's unlikely. + * + * The comments for table_tuple_insert and RelationGetBufferForTuple + * specify that skipping WAL logging is only safe if we ensure that our + * tuples do not go into pages containing tuples from any other + * transactions --- but this must be the case if we have a new table or + * new relfilenode, so we need no additional work to enforce that. + * + * We currently don't support this optimization if the COPY target is a + * partitioned table as we currently only lazily initialize partition + * information when routing the first tuple to the partition. We cannot + * know at this stage if we can perform this optimization. It should be + * possible to improve on this, but it does mean maintaining heap insert + * option flags per partition and setting them when we first open the + * partition. + * + * This optimization is not supported for relation types which do not + * have any physical storage, with foreign tables and views using + * INSTEAD OF triggers entering in this category. Partitioned tables + * are not supported as per the description above. + *---------- */ + /* createSubid is creation check, newRelfilenodeSubid is truncation check */ if (RELKIND_HAS_STORAGE(cstate->rel->rd_rel->relkind) && (cstate->rel->rd_createSubid != InvalidSubTransactionId || - cstate->rel->rd_firstRelfilenodeSubid != InvalidSubTransactionId)) + cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId)) + { ti_options |= TABLE_INSERT_SKIP_FSM; + if (!XLogIsNeeded()) + ti_options |= TABLE_INSERT_SKIP_WAL; + } /* * Optimize if new relfilenode was created in this subxact or one of its diff --git a/src/backend/commands/createas.c b/src/backend/commands/createas.c index 568cfba08c0..4c1d909d380 100644 --- a/src/backend/commands/createas.c +++ b/src/backend/commands/createas.c @@ -553,13 +553,16 @@ intorel_startup(DestReceiver *self, int operation, TupleDesc typeinfo) myState->rel = intoRelationDesc; myState->reladdr = intoRelationAddr; myState->output_cid = GetCurrentCommandId(true); - myState->ti_options = TABLE_INSERT_SKIP_FSM; - myState->bistate = GetBulkInsertState(); /* - * Valid smgr_targblock implies something already wrote to the relation. - * This may be harmless, but this function hasn't planned for it. + * We can skip WAL-logging the insertions, unless PITR or streaming + * replication is in use. We can skip the FSM in any case. */ + myState->ti_options = TABLE_INSERT_SKIP_FSM | + (XLogIsNeeded() ? 0 : TABLE_INSERT_SKIP_WAL); + myState->bistate = GetBulkInsertState(); + + /* Not using WAL requires smgr_targblock be initially invalid */ Assert(RelationGetTargetBlock(intoRelationDesc) == InvalidBlockNumber); } diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 41c67be15bd..ea5e4f67e9e 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1195,8 +1195,6 @@ DefineIndex(Oid relationId, childStmt->relation = NULL; childStmt->indexOid = InvalidOid; childStmt->oldNode = InvalidOid; - childStmt->oldCreateSubid = InvalidSubTransactionId; - childStmt->oldFirstRelfilenodeSubid = InvalidSubTransactionId; /* * Adjust any Vars (both in expressions and in the index's diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index ae809c98012..537d0e8ceff 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -457,13 +457,17 @@ transientrel_startup(DestReceiver *self, int operation, TupleDesc typeinfo) */ myState->transientrel = transientrel; myState->output_cid = GetCurrentCommandId(true); - myState->ti_options = TABLE_INSERT_SKIP_FSM | TABLE_INSERT_FROZEN; - myState->bistate = GetBulkInsertState(); /* - * Valid smgr_targblock implies something already wrote to the relation. - * This may be harmless, but this function hasn't planned for it. + * We can skip WAL-logging the insertions, unless PITR or streaming + * replication is in use. We can skip the FSM in any case. */ + myState->ti_options = TABLE_INSERT_SKIP_FSM | TABLE_INSERT_FROZEN; + if (!XLogIsNeeded()) + myState->ti_options |= TABLE_INSERT_SKIP_WAL; + myState->bistate = GetBulkInsertState(); + + /* Not using WAL requires smgr_targblock be initially invalid */ Assert(RelationGetTargetBlock(transientrel) == InvalidBlockNumber); } diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index b125d51e2fc..74eb7ac8d76 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -4785,14 +4785,19 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) newrel = NULL; /* - * Prepare a BulkInsertState and options for table_tuple_insert. The FSM - * is empty, so don't bother using it. + * Prepare a BulkInsertState and options for table_tuple_insert. Because + * we're building a new heap, we can skip WAL-logging and fsync it to disk + * at the end instead (unless WAL-logging is required for archiving or + * streaming replication). The FSM is empty too, so don't bother using it. */ if (newrel) { mycid = GetCurrentCommandId(true); bistate = GetBulkInsertState(); + ti_options = TABLE_INSERT_SKIP_FSM; + if (!XLogIsNeeded()) + ti_options |= TABLE_INSERT_SKIP_WAL; } else { @@ -7300,19 +7305,14 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, /* * If TryReuseIndex() stashed a relfilenode for us, we used it for the new - * index instead of building from scratch. Restore associated fields. - * This may store InvalidSubTransactionId in both fields, in which case - * relcache.c will assume it can rebuild the relcache entry. Hence, do - * this after the CCI that made catalog rows visible to any rebuild. The - * DROP of the old edition of this index will have scheduled the storage - * for deletion at commit, so cancel that pending deletion. + * index instead of building from scratch. The DROP of the old edition of + * this index will have scheduled the storage for deletion at commit, so + * cancel that pending deletion. */ if (OidIsValid(stmt->oldNode)) { Relation irel = index_open(address.objectId, NoLock); - irel->rd_createSubid = stmt->oldCreateSubid; - irel->rd_firstRelfilenodeSubid = stmt->oldFirstRelfilenodeSubid; RelationPreserveStorage(irel->rd_node, true); index_close(irel, NoLock); } @@ -11619,11 +11619,7 @@ TryReuseIndex(Oid oldId, IndexStmt *stmt) /* If it's a partitioned index, there is no storage to share. */ if (irel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) - { stmt->oldNode = irel->rd_node.relNode; - stmt->oldCreateSubid = irel->rd_createSubid; - stmt->oldFirstRelfilenodeSubid = irel->rd_firstRelfilenodeSubid; - } index_close(irel, NoLock); } } @@ -12557,8 +12553,6 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) table_close(pg_class, RowExclusiveLock); - RelationAssumeNewRelfilenode(rel); - relation_close(rel, NoLock); /* Make sure the reltablespace change is visible */ |