aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRobert Haas <rhaas@postgresql.org>2016-12-08 14:09:09 -0500
committerRobert Haas <rhaas@postgresql.org>2016-12-08 14:12:08 -0500
commitfa0f466d5329e10b16f3b38c8eaf5306f7e234e8 (patch)
tree2a4b356448f6cf103949cd0cd403fe21e019354c
parent0b78106cd4651ad7867036a463ec743f6d3d2e86 (diff)
downloadpostgresql-fa0f466d5329e10b16f3b38c8eaf5306f7e234e8.tar.gz
postgresql-fa0f466d5329e10b16f3b38c8eaf5306f7e234e8.zip
Log the creation of an init fork unconditionally.
Previously, it was thought that this only needed to be done for the benefit of possible standbys, so wal_level = minimal skipped it. But that's not safe, because during crash recovery we might replay XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record which recursively removes the directory that contains the new init fork. So log it always. The user-visible effect of this bug is that if you create a database or tablespace, then create an unlogged table, then crash without checkpointing, then restart, accessing the table will fail, because the it won't have been properly reset. This commit fixes that. Michael Paquier, per a report from Konstantin Knizhnik. Wording of the comments per a suggestion from me.
-rw-r--r--contrib/bloom/blinsert.c13
-rw-r--r--src/backend/access/nbtree/nbtree.c13
-rw-r--r--src/backend/access/spgist/spginsert.c23
-rw-r--r--src/backend/catalog/heap.c13
4 files changed, 38 insertions, 24 deletions
diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c
index 0946aa29ecc..a31149f044a 100644
--- a/contrib/bloom/blinsert.c
+++ b/contrib/bloom/blinsert.c
@@ -164,13 +164,18 @@ blbuildempty(Relation index)
metapage = (Page) palloc(BLCKSZ);
BloomFillMetapage(index, metapage);
- /* Write the page. If archiving/streaming, XLOG it. */
+ /*
+ * Write the page and log it. It might seem that an immediate sync
+ * would be sufficient to guarantee that the file exists on disk, but
+ * recovery itself might remove it while replaying, for example, an
+ * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we
+ * need this even when wal_level=minimal.
+ */
PageSetChecksumInplace(metapage, BLOOM_METAPAGE_BLKNO);
smgrwrite(index->rd_smgr, INIT_FORKNUM, BLOOM_METAPAGE_BLKNO,
(char *) metapage, true);
- if (XLogIsNeeded())
- log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
- BLOOM_METAPAGE_BLKNO, metapage, false);
+ log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
+ BLOOM_METAPAGE_BLKNO, metapage, false);
/*
* An immediate sync is required even if we xlog'd the page, because the
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 128744c5b76..a264b92c13d 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -242,13 +242,18 @@ btbuildempty(Relation index)
metapage = (Page) palloc(BLCKSZ);
_bt_initmetapage(metapage, P_NONE, 0);
- /* Write the page. If archiving/streaming, XLOG it. */
+ /*
+ * Write the page and log it. It might seem that an immediate sync
+ * would be sufficient to guarantee that the file exists on disk, but
+ * recovery itself might remove it while replaying, for example, an
+ * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore, we
+ * need this even when wal_level=minimal.
+ */
PageSetChecksumInplace(metapage, BTREE_METAPAGE);
smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
(char *) metapage, true);
- if (XLogIsNeeded())
- log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
- BTREE_METAPAGE, metapage, false);
+ log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
+ BTREE_METAPAGE, metapage, false);
/*
* An immediate sync is required even if we xlog'd the page, because the
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index 01c8d213f5c..3eaa649eff3 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -161,13 +161,18 @@ spgbuildempty(Relation index)
page = (Page) palloc(BLCKSZ);
SpGistInitMetapage(page);
- /* Write the page. If archiving/streaming, XLOG it. */
+ /*
+ * Write the page and log it unconditionally. This is important
+ * particularly for indexes created on tablespaces and databases
+ * whose creation happened after the last redo pointer as recovery
+ * removes any of their existing content when the corresponding
+ * create records are replayed.
+ */
PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO);
smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO,
(char *) page, true);
- if (XLogIsNeeded())
- log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
- SPGIST_METAPAGE_BLKNO, page, false);
+ log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
+ SPGIST_METAPAGE_BLKNO, page, false);
/* Likewise for the root page. */
SpGistInitPage(page, SPGIST_LEAF);
@@ -175,9 +180,8 @@ spgbuildempty(Relation index)
PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO);
smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_ROOT_BLKNO,
(char *) page, true);
- if (XLogIsNeeded())
- log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
- SPGIST_ROOT_BLKNO, page, true);
+ log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
+ SPGIST_ROOT_BLKNO, page, true);
/* Likewise for the null-tuples root page. */
SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS);
@@ -185,9 +189,8 @@ spgbuildempty(Relation index)
PageSetChecksumInplace(page, SPGIST_NULL_BLKNO);
smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_NULL_BLKNO,
(char *) page, true);
- if (XLogIsNeeded())
- log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
- SPGIST_NULL_BLKNO, page, true);
+ log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
+ SPGIST_NULL_BLKNO, page, true);
/*
* An immediate sync is required even if we xlog'd the pages, because the
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 7f5bad0b5da..5ffea748554 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -1380,18 +1380,19 @@ heap_create_with_catalog(const char *relname,
/*
* Set up an init fork for an unlogged table so that it can be correctly
- * reinitialized on restart. Since we're going to do an immediate sync, we
- * only need to xlog this if archiving or streaming is enabled. And the
- * immediate sync is required, because otherwise there's no guarantee that
- * this will hit the disk before the next checkpoint moves the redo pointer.
+ * reinitialized on restart. An immediate sync is required even if the
+ * page has been logged, because the write did not go through
+ * shared_buffers and therefore a concurrent checkpoint may have moved
+ * the redo pointer past our xlog record. Recovery may as well remove it
+ * while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE
+ * record. Therefore, logging is necessary even if wal_level=minimal.
*/
void
heap_create_init_fork(Relation rel)
{
RelationOpenSmgr(rel);
smgrcreate(rel->rd_smgr, INIT_FORKNUM, false);
- if (XLogIsNeeded())
- log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM);
+ log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM);
smgrimmedsync(rel->rd_smgr, INIT_FORKNUM);
}