aboutsummaryrefslogtreecommitdiff
path: root/src/backend/commands/dbcommands.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2013-01-18 18:06:20 -0500
committerTom Lane <tgl@sss.pgh.pa.us>2013-01-18 18:06:20 -0500
commitc2a14bc7c994065edcc48183913a1fae2af27990 (patch)
tree861568bdf9f37ca8f2cc011ec3b8ea522f696124 /src/backend/commands/dbcommands.c
parent530bbfac57c8c5df9d38754759d95f1588c427f7 (diff)
downloadpostgresql-c2a14bc7c994065edcc48183913a1fae2af27990.tar.gz
postgresql-c2a14bc7c994065edcc48183913a1fae2af27990.zip
Protect against SnapshotNow race conditions in pg_tablespace scans.
Use of SnapshotNow is known to expose us to race conditions if the tuple(s) being sought could be updated by concurrently-committing transactions. CREATE DATABASE and DROP DATABASE are particularly exposed because they do heavyweight filesystem operations during their scans of pg_tablespace, so that the scans run for a very long time compared to most. Furthermore, the potential consequences of a missed or twice-visited row are nastier than average: * createdb() could fail with a bogus "file already exists" error, or silently fail to copy one or more tablespace's worth of files into the new database. * remove_dbtablespaces() could miss one or more tablespaces, thus failing to free filesystem space for the dropped database. * check_db_file_conflict() could likewise miss a tablespace, leading to an OID conflict that could result in data loss either immediately or in future operations. (This seems of very low probability, though, since a duplicate database OID would be unlikely to start with.) Hence, it seems worth fixing these three places to use MVCC snapshots, even though this will someday be superseded by a generic solution to SnapshotNow race conditions. Back-patch to all active branches. Stephen Frost and Tom Lane
Diffstat (limited to 'src/backend/commands/dbcommands.c')
-rw-r--r--src/backend/commands/dbcommands.c57
1 files changed, 54 insertions, 3 deletions
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 1f6e02d3c9e..4ad4b997585 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -131,6 +131,7 @@ createdb(const CreatedbStmt *stmt)
int notherbackends;
int npreparedxacts;
createdb_failure_params fparms;
+ Snapshot snapshot;
/* Extract options from the statement node tree */
foreach(option, stmt->options)
@@ -535,6 +536,29 @@ createdb(const CreatedbStmt *stmt)
RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
/*
+ * Take an MVCC snapshot to use while scanning through pg_tablespace. For
+ * safety, register the snapshot (this prevents it from changing if
+ * something else were to request a snapshot during the loop).
+ *
+ * Traversing pg_tablespace with an MVCC snapshot is necessary to provide
+ * us with a consistent view of the tablespaces that exist. Using
+ * SnapshotNow here would risk seeing the same tablespace multiple times,
+ * or worse not seeing a tablespace at all, if its tuple is moved around
+ * by a concurrent update (eg an ACL change).
+ *
+ * Inconsistency of this sort is inherent to all SnapshotNow scans, unless
+ * some lock is held to prevent concurrent updates of the rows being
+ * sought. There should be a generic fix for that, but in the meantime
+ * it's worth fixing this case in particular because we are doing very
+ * heavyweight operations within the scan, so that the elapsed time for
+ * the scan is vastly longer than for most other catalog scans. That
+ * means there's a much wider window for concurrent updates to cause
+ * trouble here than anywhere else. XXX this code should be changed
+ * whenever a generic fix is implemented.
+ */
+ snapshot = RegisterSnapshot(GetLatestSnapshot());
+
+ /*
* Once we start copying subdirectories, we need to be able to clean 'em
* up if we fail. Use an ENSURE block to make sure this happens. (This
* is not a 100% solution, because of the possibility of failure during
@@ -551,7 +575,7 @@ createdb(const CreatedbStmt *stmt)
* each one to the new database.
*/
rel = heap_open(TableSpaceRelationId, AccessShareLock);
- scan = heap_beginscan(rel, SnapshotNow, 0, NULL);
+ scan = heap_beginscan(rel, snapshot, 0, NULL);
while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
{
Oid srctablespace = HeapTupleGetOid(tuple);
@@ -656,6 +680,9 @@ createdb(const CreatedbStmt *stmt)
PG_END_ENSURE_ERROR_CLEANUP(createdb_failure_callback,
PointerGetDatum(&fparms));
+ /* Free our snapshot */
+ UnregisterSnapshot(snapshot);
+
return dboid;
}
@@ -1715,9 +1742,20 @@ remove_dbtablespaces(Oid db_id)
Relation rel;
HeapScanDesc scan;
HeapTuple tuple;
+ Snapshot snapshot;
+
+ /*
+ * As in createdb(), we'd better use an MVCC snapshot here, since this
+ * scan can run for a long time. Duplicate visits to tablespaces would be
+ * harmless, but missing a tablespace could result in permanently leaked
+ * files.
+ *
+ * XXX change this when a generic fix for SnapshotNow races is implemented
+ */
+ snapshot = RegisterSnapshot(GetLatestSnapshot());
rel = heap_open(TableSpaceRelationId, AccessShareLock);
- scan = heap_beginscan(rel, SnapshotNow, 0, NULL);
+ scan = heap_beginscan(rel, snapshot, 0, NULL);
while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
{
Oid dsttablespace = HeapTupleGetOid(tuple);
@@ -1763,6 +1801,7 @@ remove_dbtablespaces(Oid db_id)
heap_endscan(scan);
heap_close(rel, AccessShareLock);
+ UnregisterSnapshot(snapshot);
}
/*
@@ -1784,9 +1823,19 @@ check_db_file_conflict(Oid db_id)
Relation rel;
HeapScanDesc scan;
HeapTuple tuple;
+ Snapshot snapshot;
+
+ /*
+ * As in createdb(), we'd better use an MVCC snapshot here; missing a
+ * tablespace could result in falsely reporting the OID is unique, with
+ * disastrous future consequences per the comment above.
+ *
+ * XXX change this when a generic fix for SnapshotNow races is implemented
+ */
+ snapshot = RegisterSnapshot(GetLatestSnapshot());
rel = heap_open(TableSpaceRelationId, AccessShareLock);
- scan = heap_beginscan(rel, SnapshotNow, 0, NULL);
+ scan = heap_beginscan(rel, snapshot, 0, NULL);
while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
{
Oid dsttablespace = HeapTupleGetOid(tuple);
@@ -1812,6 +1861,8 @@ check_db_file_conflict(Oid db_id)
heap_endscan(scan);
heap_close(rel, AccessShareLock);
+ UnregisterSnapshot(snapshot);
+
return result;
}