aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/bin/pg_rewind/parsexlog.c11
-rw-r--r--src/bin/pg_rewind/pg_rewind.c58
-rw-r--r--src/bin/pg_rewind/t/008_min_recovery_point.pl156
3 files changed, 201 insertions, 24 deletions
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index bc6f9769941..1a67725ec04 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -54,6 +54,9 @@ static int SimpleXLogPageRead(XLogReaderState *xlogreader,
* Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline
* index 'tliIndex' in target timeline history, until 'endpoint'. Make note of
* the data blocks touched by the WAL records, and return them in a page map.
+ *
+ * 'endpoint' is the end of the last record to read. The record starting at
+ * 'endpoint' is the first one that is not read.
*/
void
extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
@@ -92,7 +95,13 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
extractPageInfo(xlogreader);
- } while (xlogreader->ReadRecPtr != endpoint);
+ } while (xlogreader->EndRecPtr < endpoint);
+
+ /*
+ * If 'endpoint' didn't point exactly at a record boundary, the caller
+ * messed up.
+ */
+ Assert(xlogreader->EndRecPtr == endpoint);
XLogReaderFree(xlogreader);
if (xlogreadfd != -1)
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 23fc749e445..789f2ffe2c2 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -123,6 +123,7 @@ main(int argc, char **argv)
XLogRecPtr chkptrec;
TimeLineID chkpttli;
XLogRecPtr chkptredo;
+ XLogRecPtr target_wal_endrec;
size_t size;
char *buffer;
bool no_ensure_shutdown = false;
@@ -315,43 +316,56 @@ main(int argc, char **argv)
{
pg_log_info("source and target cluster are on the same timeline");
rewind_needed = false;
+ target_wal_endrec = 0;
}
else
{
+ XLogRecPtr chkptendrec;
+
findCommonAncestorTimeline(&divergerec, &lastcommontliIndex);
pg_log_info("servers diverged at WAL location %X/%X on timeline %u",
(uint32) (divergerec >> 32), (uint32) divergerec,
targetHistory[lastcommontliIndex].tli);
/*
+ * Determine the end-of-WAL on the target.
+ *
+ * The WAL ends at the last shutdown checkpoint, or at
+ * minRecoveryPoint if it was a standby. (If we supported rewinding a
+ * server that was not shut down cleanly, we would need to replay
+ * until we reach the first invalid record, like crash recovery does.)
+ */
+
+ /* read the checkpoint record on the target to see where it ends. */
+ chkptendrec = readOneRecord(datadir_target,
+ ControlFile_target.checkPoint,
+ targetNentries - 1,
+ restore_command);
+
+ if (ControlFile_target.minRecoveryPoint > chkptendrec)
+ {
+ target_wal_endrec = ControlFile_target.minRecoveryPoint;
+ }
+ else
+ {
+ target_wal_endrec = chkptendrec;
+ }
+
+ /*
* Check for the possibility that the target is in fact a direct
* ancestor of the source. In that case, there is no divergent history
* in the target that needs rewinding.
*/
- if (ControlFile_target.checkPoint >= divergerec)
+ if (target_wal_endrec > divergerec)
{
rewind_needed = true;
}
else
{
- XLogRecPtr chkptendrec;
-
- /* Read the checkpoint record on the target to see where it ends. */
- chkptendrec = readOneRecord(datadir_target,
- ControlFile_target.checkPoint,
- targetNentries - 1,
- restore_command);
-
- /*
- * If the histories diverged exactly at the end of the shutdown
- * checkpoint record on the target, there are no WAL records in
- * the target that don't belong in the source's history, and no
- * rewind is needed.
- */
- if (chkptendrec == divergerec)
- rewind_needed = false;
- else
- rewind_needed = true;
+ /* the last common checkpoint record must be part of target WAL */
+ Assert(target_wal_endrec == divergerec);
+
+ rewind_needed = false;
}
}
@@ -384,14 +398,12 @@ main(int argc, char **argv)
/*
* Read the target WAL from last checkpoint before the point of fork, to
* extract all the pages that were modified on the target cluster after
- * the fork. We can stop reading after reaching the final shutdown record.
- * XXX: If we supported rewinding a server that was not shut down cleanly,
- * we would need to replay until the end of WAL here.
+ * the fork.
*/
if (showprogress)
pg_log_info("reading WAL in target");
extractPageMap(datadir_target, chkptrec, lastcommontliIndex,
- ControlFile_target.checkPoint, restore_command);
+ target_wal_endrec, restore_command);
filemap_finalize();
if (showprogress)
diff --git a/src/bin/pg_rewind/t/008_min_recovery_point.pl b/src/bin/pg_rewind/t/008_min_recovery_point.pl
new file mode 100644
index 00000000000..8f42016c435
--- /dev/null
+++ b/src/bin/pg_rewind/t/008_min_recovery_point.pl
@@ -0,0 +1,156 @@
+#
+# Test situation where a target data directory contains
+# WAL records beyond both the last checkpoint and the divergence
+# point:
+#
+# Target WAL (TLI 2):
+#
+# backup ... Checkpoint A ... INSERT 'rewind this'
+# (TLI 1 -> 2)
+#
+# ^ last common ^ minRecoveryPoint
+# checkpoint
+#
+# Source WAL (TLI 3):
+#
+# backup ... Checkpoint A ... Checkpoint B ... INSERT 'keep this'
+# (TLI 1 -> 2) (TLI 2 -> 3)
+#
+#
+# The last common checkpoint is Checkpoint A. But there is WAL on TLI 2
+# after the last common checkpoint that needs to be rewound. We used to
+# have a bug where minRecoveryPoint was ignored, and pg_rewind concluded
+# that the target doesn't need rewinding in this scenario, because the
+# last checkpoint on the target TLI was an ancestor of the source TLI.
+#
+#
+# This test does not make use of RewindTest as it requires three
+# nodes.
+
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More tests => 3;
+
+use File::Copy;
+
+my $tmp_folder = TestLib::tempdir;
+
+my $node_1 = get_new_node('node_1');
+$node_1->init(allows_streaming => 1);
+$node_1->append_conf('postgresql.conf', qq(
+wal_keep_size='100 MB'
+));
+
+$node_1->start;
+
+# Create a couple of test tables
+$node_1->safe_psql('postgres', 'CREATE TABLE public.foo (t TEXT)');
+$node_1->safe_psql('postgres', 'CREATE TABLE public.bar (t TEXT)');
+$node_1->safe_psql('postgres', "INSERT INTO public.bar VALUES ('in both')");
+
+
+# Take backup
+my $backup_name = 'my_backup';
+$node_1->backup($backup_name);
+
+# Create streaming standby from backup
+my $node_2 = get_new_node('node_2');
+$node_2->init_from_backup($node_1, $backup_name,
+ has_streaming => 1);
+$node_2->start;
+
+# Create streaming standby from backup
+my $node_3 = get_new_node('node_3');
+$node_3->init_from_backup($node_1, $backup_name,
+ has_streaming => 1);
+$node_3->start;
+
+# Stop node_1
+
+$node_1->stop('fast');
+
+# Promote node_3
+$node_3->promote;
+
+# node_1 rejoins node_3
+
+my $node_3_connstr = $node_3->connstr;
+
+$node_1->append_conf('postgresql.conf', qq(
+primary_conninfo='$node_3_connstr'
+));
+$node_1->set_standby_mode();
+$node_1->start();
+
+# node_2 follows node_3
+
+$node_2->append_conf('postgresql.conf', qq(
+primary_conninfo='$node_3_connstr'
+));
+$node_2->restart();
+
+# Promote node_1
+
+$node_1->promote;
+
+# We now have a split-brain with two primaries. Insert a row on both to
+# demonstratively create a split brain. After the rewind, we should only
+# see the insert on 1, as the insert on node 3 is rewound away.
+$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('keep this')");
+
+# Insert more rows in node 1, to bump up the XID counter. Otherwise, if
+# rewind doesn't correctly rewind the changes made on the other node,
+# we might fail to notice if the inserts are invisible because the XIDs
+# are not marked as committed.
+$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this')");
+$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this too')");
+
+# Also insert a row in 'bar' on node 3. It is unmodified in node 1, so it won't get
+# overwritten by replaying the WAL from node 1.
+$node_3->safe_psql('postgres', "INSERT INTO public.bar (t) VALUES ('rewind this')");
+
+# Wait for node 2 to catch up
+$node_2->poll_query_until('postgres',
+ q|SELECT COUNT(*) > 1 FROM public.bar|, 't');
+
+# At this point node_2 will shut down without a shutdown checkpoint,
+# but with WAL entries beyond the preceding shutdown checkpoint.
+$node_2->stop('fast');
+$node_3->stop('fast');
+
+my $node_2_pgdata = $node_2->data_dir;
+my $node_1_connstr = $node_1->connstr;
+
+# Keep a temporary postgresql.conf or it would be overwritten during the rewind.
+copy(
+ "$node_2_pgdata/postgresql.conf",
+ "$tmp_folder/node_2-postgresql.conf.tmp");
+
+command_ok(
+ [
+ 'pg_rewind',
+ "--source-server=$node_1_connstr",
+ "--target-pgdata=$node_2_pgdata"
+ ],
+ 'pg_rewind detects rewind needed');
+
+# Now move back postgresql.conf with old settings
+move(
+ "$tmp_folder/node_2-postgresql.conf.tmp",
+ "$node_2_pgdata/postgresql.conf");
+
+$node_2->start;
+
+# Check contents of the test tables after rewind. The rows inserted in node 3
+# before rewind should've been overwritten with the data from node 1.
+my $result;
+$result = $node_2->safe_psql('postgres', 'checkpoint');
+$result = $node_2->safe_psql('postgres', 'SELECT * FROM public.foo');
+is($result, qq(keep this
+and this
+and this too), 'table foo after rewind');
+
+$result = $node_2->safe_psql('postgres', 'SELECT * FROM public.bar');
+is($result, qq(in both), 'table bar after rewind');