3 files changed, 201 insertions, 24 deletions
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index bc6f9769941..1a67725ec04 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -54,6 +54,9 @@ static int	SimpleXLogPageRead(XLogReaderState *xlogreader,
  * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline
  * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of
  * the data blocks touched by the WAL records, and return them in a page map.
+ *
+ * 'endpoint' is the end of the last record to read. The record starting at
+ * 'endpoint' is the first one that is not read.
  */
 void
 extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
@@ -92,7 +95,13 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
 
 		extractPageInfo(xlogreader);
 
-	} while (xlogreader->ReadRecPtr != endpoint);
+	} while (xlogreader->EndRecPtr < endpoint);
+
+	/*
+	 * If 'endpoint' didn't point exactly at a record boundary, the caller
+	 * messed up.
+	 */
+	Assert(xlogreader->EndRecPtr == endpoint);
 
 	XLogReaderFree(xlogreader);
 	if (xlogreadfd != -1)
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 23fc749e445..789f2ffe2c2 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -123,6 +123,7 @@ main(int argc, char **argv)
 	XLogRecPtr	chkptrec;
 	TimeLineID	chkpttli;
 	XLogRecPtr	chkptredo;
+	XLogRecPtr	target_wal_endrec;
 	size_t		size;
 	char	   *buffer;
 	bool		no_ensure_shutdown = false;
@@ -315,43 +316,56 @@ main(int argc, char **argv)
 	{
 		pg_log_info("source and target cluster are on the same timeline");
 		rewind_needed = false;
+		target_wal_endrec = 0;
 	}
 	else
 	{
+		XLogRecPtr	chkptendrec;
+
 		findCommonAncestorTimeline(&divergerec, &lastcommontliIndex);
 		pg_log_info("servers diverged at WAL location %X/%X on timeline %u",
 					(uint32) (divergerec >> 32), (uint32) divergerec,
 					targetHistory[lastcommontliIndex].tli);
 
 		/*
+		 * Determine the end-of-WAL on the target.
+		 *
+		 * The WAL ends at the last shutdown checkpoint, or at
+		 * minRecoveryPoint if it was a standby. (If we supported rewinding a
+		 * server that was not shut down cleanly, we would need to replay
+		 * until we reach the first invalid record, like crash recovery does.)
+		 */
+
+		/* read the checkpoint record on the target to see where it ends. */
+		chkptendrec = readOneRecord(datadir_target,
+									ControlFile_target.checkPoint,
+									targetNentries - 1,
+									restore_command);
+
+		if (ControlFile_target.minRecoveryPoint > chkptendrec)
+		{
+			target_wal_endrec = ControlFile_target.minRecoveryPoint;
+		}
+		else
+		{
+			target_wal_endrec = chkptendrec;
+		}
+
+		/*
 		 * Check for the possibility that the target is in fact a direct
 		 * ancestor of the source. In that case, there is no divergent history
 		 * in the target that needs rewinding.
 		 */
-		if (ControlFile_target.checkPoint >= divergerec)
+		if (target_wal_endrec > divergerec)
 		{
 			rewind_needed = true;
 		}
 		else
 		{
-			XLogRecPtr	chkptendrec;
-
-			/* Read the checkpoint record on the target to see where it ends. */
-			chkptendrec = readOneRecord(datadir_target,
-										ControlFile_target.checkPoint,
-										targetNentries - 1,
-										restore_command);
-
-			/*
-			 * If the histories diverged exactly at the end of the shutdown
-			 * checkpoint record on the target, there are no WAL records in
-			 * the target that don't belong in the source's history, and no
-			 * rewind is needed.
-			 */
-			if (chkptendrec == divergerec)
-				rewind_needed = false;
-			else
-				rewind_needed = true;
+			/* the last common checkpoint record must be part of target WAL */
+			Assert(target_wal_endrec == divergerec);
+
+			rewind_needed = false;
 		}
 	}
 
@@ -384,14 +398,12 @@ main(int argc, char **argv)
 	/*
 	 * Read the target WAL from last checkpoint before the point of fork, to
 	 * extract all the pages that were modified on the target cluster after
-	 * the fork. We can stop reading after reaching the final shutdown record.
-	 * XXX: If we supported rewinding a server that was not shut down cleanly,
-	 * we would need to replay until the end of WAL here.
+	 * the fork.
 	 */
 	if (showprogress)
 		pg_log_info("reading WAL in target");
 	extractPageMap(datadir_target, chkptrec, lastcommontliIndex,
-				   ControlFile_target.checkPoint, restore_command);
+				   target_wal_endrec, restore_command);
 	filemap_finalize();
 
 	if (showprogress)
diff --git a/src/bin/pg_rewind/t/008_min_recovery_point.pl b/src/bin/pg_rewind/t/008_min_recovery_point.pl
new file mode 100644
index 00000000000..8f42016c435
--- /dev/null
+++ b/src/bin/pg_rewind/t/008_min_recovery_point.pl
@@ -0,0 +1,156 @@
+#
+# Test situation where a target data directory contains
+# WAL records beyond both the last checkpoint and the divergence
+# point:
+#
+# Target WAL (TLI 2):
+#
+# backup ... Checkpoint A ... INSERT 'rewind this'
+#            (TLI 1 -> 2)
+#
+#            ^ last common                        ^ minRecoveryPoint
+#              checkpoint
+#
+# Source WAL (TLI 3):
+#
+# backup ... Checkpoint A ... Checkpoint B ... INSERT 'keep this'
+#            (TLI 1 -> 2)     (TLI 2 -> 3)
+#
+#
+# The last common checkpoint is Checkpoint A. But there is WAL on TLI 2
+# after the last common checkpoint that needs to be rewound. We used to
+# have a bug where minRecoveryPoint was ignored, and pg_rewind concluded
+# that the target doesn't need rewinding in this scenario, because the
+# last checkpoint on the target TLI was an ancestor of the source TLI.
+#
+#
+# This test does not make use of RewindTest as it requires three
+# nodes.
+
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More tests => 3;
+
+use File::Copy;
+
+my $tmp_folder = TestLib::tempdir;
+
+my $node_1 = get_new_node('node_1');
+$node_1->init(allows_streaming => 1);
+$node_1->append_conf('postgresql.conf', qq(
+wal_keep_size='100 MB'
+));
+
+$node_1->start;
+
+# Create a couple of test tables
+$node_1->safe_psql('postgres', 'CREATE TABLE public.foo (t TEXT)');
+$node_1->safe_psql('postgres', 'CREATE TABLE public.bar (t TEXT)');
+$node_1->safe_psql('postgres', "INSERT INTO public.bar VALUES ('in both')");
+
+
+# Take backup
+my $backup_name = 'my_backup';
+$node_1->backup($backup_name);
+
+# Create streaming standby from backup
+my $node_2 = get_new_node('node_2');
+$node_2->init_from_backup($node_1, $backup_name,
+	has_streaming => 1);
+$node_2->start;
+
+# Create streaming standby from backup
+my $node_3 = get_new_node('node_3');
+$node_3->init_from_backup($node_1, $backup_name,
+	has_streaming => 1);
+$node_3->start;
+
+# Stop node_1
+
+$node_1->stop('fast');
+
+# Promote node_3
+$node_3->promote;
+
+# node_1 rejoins node_3
+
+my $node_3_connstr = $node_3->connstr;
+
+$node_1->append_conf('postgresql.conf', qq(
+primary_conninfo='$node_3_connstr'
+));
+$node_1->set_standby_mode();
+$node_1->start();
+
+# node_2 follows node_3
+
+$node_2->append_conf('postgresql.conf', qq(
+primary_conninfo='$node_3_connstr'
+));
+$node_2->restart();
+
+# Promote node_1
+
+$node_1->promote;
+
+# We now have a split-brain with two primaries. Insert a row on both to
+# demonstratively create a split brain. After the rewind, we should only
+# see the insert on 1, as the insert on node 3 is rewound away.
+$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('keep this')");
+
+# Insert more rows in node 1, to bump up the XID counter. Otherwise, if
+# rewind doesn't correctly rewind the changes made on the other node,
+# we might fail to notice if the inserts are invisible because the XIDs
+# are not marked as committed.
+$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this')");
+$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this too')");
+
+# Also insert a row in 'bar' on node 3. It is unmodified in node 1, so it won't get
+# overwritten by replaying the WAL from node 1.
+$node_3->safe_psql('postgres', "INSERT INTO public.bar (t) VALUES ('rewind this')");
+
+# Wait for node 2 to catch up
+$node_2->poll_query_until('postgres',
+	q|SELECT COUNT(*) > 1 FROM public.bar|, 't');
+
+# At this point node_2 will shut down without a shutdown checkpoint,
+# but with WAL entries beyond the preceding shutdown checkpoint.
+$node_2->stop('fast');
+$node_3->stop('fast');
+
+my $node_2_pgdata = $node_2->data_dir;
+my $node_1_connstr = $node_1->connstr;
+
+# Keep a temporary postgresql.conf or it would be overwritten during the rewind.
+copy(
+	"$node_2_pgdata/postgresql.conf",
+	"$tmp_folder/node_2-postgresql.conf.tmp");
+
+command_ok(
+    [
+        'pg_rewind',
+        "--source-server=$node_1_connstr",
+        "--target-pgdata=$node_2_pgdata"
+    ],
+	'pg_rewind detects rewind needed');
+
+# Now move back postgresql.conf with old settings
+move(
+	"$tmp_folder/node_2-postgresql.conf.tmp",
+	"$node_2_pgdata/postgresql.conf");
+
+$node_2->start;
+
+# Check contents of the test tables after rewind. The rows inserted in node 3
+# before rewind should've been overwritten with the data from node 1.
+my $result;
+$result = $node_2->safe_psql('postgres', 'checkpoint');
+$result = $node_2->safe_psql('postgres', 'SELECT * FROM public.foo');
+is($result, qq(keep this
+and this
+and this too), 'table foo after rewind');
+
+$result = $node_2->safe_psql('postgres', 'SELECT * FROM public.bar');
+is($result, qq(in both), 'table bar after rewind');