aboutsummaryrefslogtreecommitdiff
path: root/src/backend
diff options
context:
space:
mode:
authorAndres Freund <andres@anarazel.de>2019-04-29 19:28:05 -0700
committerAndres Freund <andres@anarazel.de>2019-04-29 19:28:05 -0700
commit5c1560606dc4c73993fb07f0176b5ec6c515a1b1 (patch)
treec0f60b1632e1a0fb2e76b24e9546e2920f336bd8 /src/backend
parent9ee7414ed0435d8946d040eb523824f2d71e2418 (diff)
downloadpostgresql-5c1560606dc4c73993fb07f0176b5ec6c515a1b1.tar.gz
postgresql-5c1560606dc4c73993fb07f0176b5ec6c515a1b1.zip
Fix several recently introduced issues around handling new relation forks.
Most of these stem from d25f519107 "tableam: relation creation, VACUUM FULL/CLUSTER, SET TABLESPACE.". 1) To pass data to relation_set_new_filenode(), RelationSetNewRelfilenode() was made to update RelationData.rd_rel directly. That's not OK however, as it makes the relcache entries temporarily inconsistent. Which among other scenarios is a problem if a REINDEX targets an index on pg_class - the CatalogTupleUpdate() in RelationSetNewRelfilenode(). Presumably that was introduced because other places in the code do so - while those aren't "good practice" they don't appear to be actively buggy (e.g. because system tables may not be targeted). I (Andres) should have caught this while reviewing and significantly evolving the code in that commit, mea culpa. Fix that by instead passing in the new RelFileNode as a separate argument to relation_set_new_filenode() and relying on the relcache to update the catalog entry. Also revert that the RelationMapUpdateMap() call was changed to immediate, and undo some other unnecessary changes. 2) Document that relation_set_new_filenode() cannot rely on the whole relcache entry to be valid. It might be worthwhile to refactor the code to never have to rely on that, but given the way heap_create() is currently coded, that'd be a large change. 3) ATExecSetTableSpace() shouldn't do FlushRelationBuffers() itself. A table AM might not use shared buffers at all. Move to index_copy_data() and heapam_relation_copy_data(). 4) heapam_relation_set_new_filenode() previously sometimes accessed rel->rd_rel->relpersistence rather than the `persistence` argument. Code movement mistake. 5) Previously heapam_relation_set_new_filenode() re-opened the smgr relation to create the init fork, if necessary. Instead have RelationCreateStorage() return the SMgrRelation and use it to create the init fork.
6) Add a note about the danger of modifying the relcache directly to ATExecSetTableSpace() - it's currently not a bug because there's a check ERRORing for catalog tables. Regression tests and assertion improvements that together trigger the bug described in 1) will be added in a later commit, as there is a related bug on all branches. Reported-By: Michael Paquier Diagnosed-By: Tom Lane and Andres Freund Author: Andres Freund Reviewed-By: Tom Lane Discussion: https://postgr.es/m/20190418011430.GA19133@paquier.xyz
Diffstat (limited to 'src/backend')
-rw-r--r--src/backend/access/heap/heapam_handler.c35
-rw-r--r--src/backend/catalog/heap.c5
-rw-r--r--src/backend/catalog/storage.c12
-rw-r--r--src/backend/commands/tablecmds.c26
-rw-r--r--src/backend/utils/cache/relcache.c55
5 files changed, 81 insertions, 52 deletions
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 6584a9cb8da..4d179881f27 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -566,10 +566,14 @@ heapam_finish_bulk_insert(Relation relation, int options)
*/
static void
-heapam_relation_set_new_filenode(Relation rel, char persistence,
+heapam_relation_set_new_filenode(Relation rel,
+ const RelFileNode *newrnode,
+ char persistence,
TransactionId *freezeXid,
MultiXactId *minmulti)
{
+ SMgrRelation srel;
+
/*
* Initialize to the minimum XID that could put tuples in the table. We
* know that no xacts older than RecentXmin are still running, so that
@@ -587,7 +591,7 @@ heapam_relation_set_new_filenode(Relation rel, char persistence,
*/
*minmulti = GetOldestMultiXactId();
- RelationCreateStorage(rel->rd_node, persistence);
+ srel = RelationCreateStorage(*newrnode, persistence);
/*
* If required, set up an init fork for an unlogged table so that it can
@@ -598,16 +602,17 @@ heapam_relation_set_new_filenode(Relation rel, char persistence,
* while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE
* record. Therefore, logging is necessary even if wal_level=minimal.
*/
- if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
+ if (persistence == RELPERSISTENCE_UNLOGGED)
{
Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
rel->rd_rel->relkind == RELKIND_MATVIEW ||
rel->rd_rel->relkind == RELKIND_TOASTVALUE);
- RelationOpenSmgr(rel);
- smgrcreate(rel->rd_smgr, INIT_FORKNUM, false);
- log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM);
- smgrimmedsync(rel->rd_smgr, INIT_FORKNUM);
+ smgrcreate(srel, INIT_FORKNUM, false);
+ log_smgrcreate(newrnode, INIT_FORKNUM);
+ smgrimmedsync(srel, INIT_FORKNUM);
}
+
+ smgrclose(srel);
}
static void
@@ -617,21 +622,29 @@ heapam_relation_nontransactional_truncate(Relation rel)
}
static void
-heapam_relation_copy_data(Relation rel, RelFileNode newrnode)
+heapam_relation_copy_data(Relation rel, const RelFileNode *newrnode)
{
SMgrRelation dstrel;
- dstrel = smgropen(newrnode, rel->rd_backend);
+ dstrel = smgropen(*newrnode, rel->rd_backend);
RelationOpenSmgr(rel);
/*
+ * Since we copy the file directly without looking at the shared buffers,
+ * we'd better first flush out any pages of the source relation that are
+ * in shared buffers. We assume no new changes will be made while we are
+ * holding exclusive lock on the rel.
+ */
+ FlushRelationBuffers(rel);
+
+ /*
* Create and copy all forks of the relation, and schedule unlinking of
* old physical files.
*
* NOTE: any conflict in relfilenode value will be caught in
* RelationCreateStorage().
*/
- RelationCreateStorage(newrnode, rel->rd_rel->relpersistence);
+ RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence);
/* copy main fork */
RelationCopyStorage(rel->rd_smgr, dstrel, MAIN_FORKNUM,
@@ -652,7 +665,7 @@ heapam_relation_copy_data(Relation rel, RelFileNode newrnode)
if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT ||
(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
forkNum == INIT_FORKNUM))
- log_smgrcreate(&newrnode, forkNum);
+ log_smgrcreate(newrnode, forkNum);
RelationCopyStorage(rel->rd_smgr, dstrel, forkNum,
rel->rd_rel->relpersistence);
}
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 6b77eff0af1..ee6b72e550a 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -435,8 +435,9 @@ heap_create(const char *relname,
case RELKIND_RELATION:
case RELKIND_TOASTVALUE:
case RELKIND_MATVIEW:
- table_relation_set_new_filenode(rel, relpersistence,
- relfrozenxid, relminmxid);
+ table_relation_set_new_filenode(rel, &rel->rd_node,
+ relpersistence,
+ relfrozenxid, relminmxid);
break;
}
}
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 72242b24761..fb41f223ada 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -75,7 +75,7 @@ static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
* This function is transactional. The creation is WAL-logged, and if the
* transaction aborts later on, the storage will be destroyed.
*/
-void
+SMgrRelation
RelationCreateStorage(RelFileNode rnode, char relpersistence)
{
PendingRelDelete *pending;
@@ -99,7 +99,7 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
break;
default:
elog(ERROR, "invalid relpersistence: %c", relpersistence);
- return; /* placate compiler */
+ return NULL; /* placate compiler */
}
srel = smgropen(rnode, backend);
@@ -117,13 +117,15 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
pending->nestLevel = GetCurrentTransactionNestLevel();
pending->next = pendingDeletes;
pendingDeletes = pending;
+
+ return srel;
}
/*
* Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
*/
void
-log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum)
+log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum)
{
xl_smgr_create xlrec;
@@ -294,6 +296,10 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
/*
* Copy a fork's data, block by block.
+ *
+ * Note that this requires that there is no dirty data in shared buffers. If
+ * it's possible that there are, callers need to flush those using
+ * e.g. FlushRelationBuffers(rel).
*/
void
RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 14fcad9034b..2d0ef92badf 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -12237,14 +12237,6 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
rd_rel = (Form_pg_class) GETSTRUCT(tuple);
/*
- * Since we copy the file directly without looking at the shared buffers,
- * we'd better first flush out any pages of the source relation that are
- * in shared buffers. We assume no new changes will be made while we are
- * holding exclusive lock on the rel.
- */
- FlushRelationBuffers(rel);
-
- /*
* Relfilenodes are not unique in databases across tablespaces, so we need
* to allocate a new one in the new tablespace.
*/
@@ -12266,10 +12258,16 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
rel->rd_rel->relkind == RELKIND_MATVIEW ||
rel->rd_rel->relkind == RELKIND_TOASTVALUE);
- table_relation_copy_data(rel, newrnode);
+ table_relation_copy_data(rel, &newrnode);
}
- /* update the pg_class row */
+ /*
+ * Update the pg_class row.
+ *
+ * NB: This wouldn't work if ATExecSetTableSpace() were allowed to be
+ * executed on pg_class or its indexes (the above copy wouldn't contain
+ * the updated pg_class entry), but that's forbidden above.
+ */
rd_rel->reltablespace = (newTableSpace == MyDatabaseTableSpace) ? InvalidOid : newTableSpace;
rd_rel->relfilenode = newrelfilenode;
CatalogTupleUpdate(pg_class, &tuple->t_self, tuple);
@@ -12538,6 +12536,14 @@ index_copy_data(Relation rel, RelFileNode newrnode)
RelationOpenSmgr(rel);
/*
+ * Since we copy the file directly without looking at the shared buffers,
+ * we'd better first flush out any pages of the source relation that are
+ * in shared buffers. We assume no new changes will be made while we are
+ * holding exclusive lock on the rel.
+ */
+ FlushRelationBuffers(rel);
+
+ /*
* Create and copy all forks of the relation, and schedule unlinking of
* old physical files.
*
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index bab59f16e68..90ff8ccf54f 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -3440,6 +3440,7 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
Form_pg_class classform;
MultiXactId minmulti = InvalidMultiXactId;
TransactionId freezeXid = InvalidTransactionId;
+ RelFileNode newrnode;
/* Allocate a new relfilenode */
newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, NULL,
@@ -3462,39 +3463,23 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
*/
RelationDropStorage(relation);
- /*
- * Now update the pg_class row. However, if we're dealing with a mapped
- * index, pg_class.relfilenode doesn't change; instead we have to send the
- * update to the relation mapper.
- */
- if (RelationIsMapped(relation))
- RelationMapUpdateMap(RelationGetRelid(relation),
- newrelfilenode,
- relation->rd_rel->relisshared,
- true);
- else
- {
- relation->rd_rel->relfilenode = newrelfilenode;
- classform->relfilenode = newrelfilenode;
- }
-
- RelationInitPhysicalAddr(relation);
+ /* initialize new relfilenode from old relfilenode */
+ newrnode = relation->rd_node;
/*
* Create storage for the main fork of the new relfilenode. If it's
* table-like object, call into table AM to do so, which'll also create
* the table's init fork.
*
- * NOTE: any conflict in relfilenode value will be caught here, if
- * GetNewRelFileNode messes up for any reason.
+ * NOTE: If relevant for the AM, any conflict in relfilenode value will be
+ * caught here, if GetNewRelFileNode messes up for any reason.
*/
+ newrnode = relation->rd_node;
+ newrnode.relNode = newrelfilenode;
- /*
- * Create storage for relation.
- */
switch (relation->rd_rel->relkind)
{
- /* shouldn't be called for these */
+ /* shouldn't be called for these */
case RELKIND_VIEW:
case RELKIND_COMPOSITE_TYPE:
case RELKIND_FOREIGN_TABLE:
@@ -3505,18 +3490,36 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
case RELKIND_INDEX:
case RELKIND_SEQUENCE:
- RelationCreateStorage(relation->rd_node, persistence);
- RelationOpenSmgr(relation);
+ {
+ SMgrRelation srel;
+
+ srel = RelationCreateStorage(newrnode, persistence);
+ smgrclose(srel);
+ }
break;
case RELKIND_RELATION:
case RELKIND_TOASTVALUE:
case RELKIND_MATVIEW:
- table_relation_set_new_filenode(relation, persistence,
+ table_relation_set_new_filenode(relation, &newrnode,
+ persistence,
&freezeXid, &minmulti);
break;
}
+ /*
+ * However, if we're dealing with a mapped index, pg_class.relfilenode
+ * doesn't change; instead we have to send the update to the relation
+ * mapper.
+ */
+ if (RelationIsMapped(relation))
+ RelationMapUpdateMap(RelationGetRelid(relation),
+ newrelfilenode,
+ relation->rd_rel->relisshared,
+ false);
+ else
+ classform->relfilenode = newrelfilenode;
+
/* These changes are safe even for a mapped relation */
if (relation->rd_rel->relkind != RELKIND_SEQUENCE)
{