aboutsummaryrefslogtreecommitdiff
path: root/src/include
diff options
context:
space:
mode:
author: Alvaro Herrera <alvherre@alvh.no-ip.org> 2021-09-29 11:21:51 -0300
committer: Alvaro Herrera <alvherre@alvh.no-ip.org> 2021-09-29 11:21:51 -0300
commit: ff9f111bce24fd9bbca7a20315586de877d74923 (patch)
tree: 60d376671292cd481f2ae8770fdf5a7812d5e31f /src/include
parent: 2acb7cc6b56c2b80029c202217e19553578456e9 (diff)
download: postgresql-ff9f111bce24fd9bbca7a20315586de877d74923.tar.gz
          postgresql-ff9f111bce24fd9bbca7a20315586de877d74923.zip
Fix WAL replay in presence of an incomplete record

Physical replication always ships WAL segment files to replicas once they are complete. This is a problem if one WAL record is split across a segment boundary and the primary server crashes before writing down the segment with the next portion of the WAL record: WAL writing after crash recovery would happily resume at the point where the broken record started, overwriting that record ... but any standby or backup may have already received a copy of that segment, and they are not rewinding. This causes standbys to stop following the primary after the latter crashes:

    LOG: invalid contrecord length 7262 at A8/D9FFFBC8

because the standby is still trying to read the continuation record (contrecord) for the original long WAL record, but it is not there and it will never be. A workaround is to stop the replica, delete the WAL file, and restart it -- at which point a fresh copy is brought over from the primary. But that's pretty labor intensive, and I bet many users would just give up and re-clone the standby instead.

A fix for this problem was already attempted in commit 515e3d84a0b5, but it only addressed the case for the scenario of WAL archiving, so streaming replication would still be a problem (as well as other things such as taking a filesystem-level backup while the server is down after having crashed), and it had performance scalability problems too; so it had to be reverted.

This commit fixes the problem using an approach suggested by Andres Freund, whereby the initial portion(s) of the split-up WAL record are kept, and a special type of WAL record is written where the contrecord was lost, so that WAL replay in the replica knows to skip the broken parts. With this approach, we can continue to stream/archive segment files as soon as they are complete, and replay of the broken records will proceed across the crash point without a hitch.

Because a new type of WAL record is added, users should be careful to upgrade standbys first, primaries later. Otherwise they risk the standby being unable to start if the primary happens to write such a record.

A new TAP test that exercises this is added, but the portability of it is yet to be seen.

This has been wrong since the introduction of physical replication, so backpatch all the way back. In stable branches, keep the new XLogReaderState members at the end of the struct, to avoid an ABI break.

Author: Álvaro Herrera <alvherre@alvh.no-ip.org>
Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Reviewed-by: Nathan Bossart <bossartn@amazon.com>
Discussion: https://postgr.es/m/202108232252.dh7uxf6oxwcy@alvherre.pgsql

Diffstat (limited to 'src/include')
-rw-r--r--  src/include/access/xlog_internal.h  11
-rw-r--r--  src/include/access/xlogreader.h     10
-rw-r--r--  src/include/catalog/pg_control.h     2
3 files changed, 22 insertions, 1 deletions
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 3b5eceff658..c0da76cab49 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -76,8 +76,10 @@ typedef XLogLongPageHeaderData *XLogLongPageHeader;
#define XLP_LONG_HEADER 0x0002
/* This flag indicates backup blocks starting in this page are optional */
#define XLP_BKP_REMOVABLE 0x0004
+/* Replaces a missing contrecord; see CreateOverwriteContrecordRecord */
+#define XLP_FIRST_IS_OVERWRITE_CONTRECORD 0x0008
/* All defined flag bits in xlp_info (used for validity checking of header) */
-#define XLP_ALL_FLAGS 0x0007
+#define XLP_ALL_FLAGS 0x000F
#define XLogPageHeaderSize(hdr) \
(((hdr)->xlp_info & XLP_LONG_HEADER) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD)
@@ -249,6 +251,13 @@ typedef struct xl_restore_point
char rp_name[MAXFNAMELEN];
} xl_restore_point;
+/* Overwrite of prior contrecord */
+typedef struct xl_overwrite_contrecord
+{
+ XLogRecPtr overwritten_lsn;
+ TimestampTz overwrite_time;
+} xl_overwrite_contrecord;
+
/* End of recovery mark, when we don't do an END_OF_RECOVERY checkpoint */
typedef struct xl_end_of_recovery
{
diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h
index 21d200d3df6..de6fd791fe6 100644
--- a/src/include/access/xlogreader.h
+++ b/src/include/access/xlogreader.h
@@ -175,6 +175,16 @@ struct XLogReaderState
XLogRecPtr ReadRecPtr; /* start of last record read */
XLogRecPtr EndRecPtr; /* end+1 of last record read */
+ /*
+ * Set at the end of recovery: the start point of a partial record at the
+ * end of WAL (InvalidXLogRecPtr if there wasn't one), and the start
+ * location of its first contrecord that went missing.
+ */
+ XLogRecPtr abortedRecPtr;
+ XLogRecPtr missingContrecPtr;
+ /* Set when XLP_FIRST_IS_OVERWRITE_CONTRECORD is found */
+ XLogRecPtr overwrittenRecPtr;
+
/* ----------------------------------------
* Decoded representation of current record
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index e3f48158ce7..749bce0cc6f 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -76,6 +76,8 @@ typedef struct CheckPoint
#define XLOG_END_OF_RECOVERY 0x90
#define XLOG_FPI_FOR_HINT 0xA0
#define XLOG_FPI 0xB0
+/* 0xC0 is used in Postgres 9.5-11 */
+#define XLOG_OVERWRITE_CONTRECORD 0xD0
/*