author     Tom Lane <tgl@sss.pgh.pa.us>  2021-06-11 16:12:36 -0400
committer  Tom Lane <tgl@sss.pgh.pa.us>  2021-06-11 16:12:41 -0400
commit     ab55d742eb7162c22ee60f1e15e07d2a60063c4e (patch)
tree       f208cfb0c4a8a1978531626453cd06ce97d82bdd
parent     4efcf47053eaf8dd88de2b1a89478df43d37d5c0 (diff)
Fix multiple crasher bugs in partitioned-table replication logic.
apply_handle_tuple_routing(), having detected and reported that
the tuple it needed to update didn't exist, tried to update that
tuple anyway, leading to a null-pointer dereference.

logicalrep_partition_open() failed to ensure that the
LogicalRepPartMapEntry it built for a partition was fully
independent of that for the partition root, leading to trouble
if the root entry was later freed or rebuilt.

Meanwhile, on the publisher's side, pgoutput_change() sometimes
attempted to apply execute_attr_map_tuple() to a NULL tuple.

The first of these was reported by Sergey Bernikov in bug #17055;
I found the other two while developing some test cases for this
sadly under-tested code.

Diagnosis and patch for the first issue by Amit Langote; patches
for the others by me; new test cases by me.  Back-patch to v13
where this logic came in.

Discussion: https://postgr.es/m/17055-9ba800ec8522668b@postgresql.org
-rw-r--r--  src/backend/replication/logical/relation.c   | 13
-rw-r--r--  src/backend/replication/logical/worker.c     | 48
-rw-r--r--  src/backend/replication/pgoutput/pgoutput.c  |  7
-rw-r--r--  src/test/subscription/t/001_rep_changes.pl   | 36
-rw-r--r--  src/test/subscription/t/013_partition.pl     | 71

5 files changed, 146 insertions(+), 29 deletions(-)
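
The second fix addresses a shallow-copy hazard: the partition's map entry
aliased attribute-map storage owned by the root's entry, so freeing or
rebuilding the root entry left the partition entry dangling.  A minimal
stand-alone C sketch of that failure shape, using invented stand-in structs
rather than the real LogicalRepPartMapEntry:

#include <stdlib.h>

/* Invented stand-ins for AttrMap and the relation-map entries. */
typedef struct ToyAttrMap
{
	int			maplen;
	int		   *attnums;
} ToyAttrMap;

typedef struct ToyEntry
{
	ToyAttrMap *attrmap;
} ToyEntry;

int
main(void)
{
	ToyEntry	root;
	ToyEntry	part;

	root.attrmap = malloc(sizeof(ToyAttrMap));
	root.attrmap->maplen = 2;
	root.attrmap->attnums = calloc(2, sizeof(int));

	/* Buggy shape: the partition entry aliases the root's map. */
	part.attrmap = root.attrmap;

	/*
	 * When the root entry is freed or rebuilt, part.attrmap is left
	 * pointing at released memory; any later use is undefined behavior.
	 */
	free(root.attrmap->attnums);
	free(root.attrmap);
	root.attrmap = NULL;

	/* part.attrmap is now dangling -- the hazard the patch removes. */
	return 0;
}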
diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c
index e861c0ff802..4930f2ca348 100644
--- a/src/backend/replication/logical/relation.c
+++ b/src/backend/replication/logical/relation.c
@@ -620,7 +620,9 @@ logicalrep_partmap_init(void)
* logicalrep_partition_open
*
* Returned entry reuses most of the values of the root table's entry, save
- * the attribute map, which can be different for the partition.
+ * the attribute map, which can be different for the partition. However,
+ * we must physically copy all the data, in case the root table's entry
+ * gets freed/rebuilt.
*
* Note there's no logicalrep_partition_close, because the caller closes the
* component relation.
@@ -656,7 +658,7 @@ logicalrep_partition_open(LogicalRepRelMapEntry *root,
part_entry->partoid = partOid;
- /* Remote relation is used as-is from the root entry. */
+ /* Remote relation is copied as-is from the root entry. */
entry = &part_entry->relmapentry;
entry->remoterel.remoteid = remoterel->remoteid;
entry->remoterel.nspname = pstrdup(remoterel->nspname);
@@ -699,7 +701,12 @@ logicalrep_partition_open(LogicalRepRelMapEntry *root,
}
}
else
- entry->attrmap = attrmap;
+ {
+ /* Lacking copy_attmap, do this the hard way. */
+ entry->attrmap = make_attrmap(attrmap->maplen);
+ memcpy(entry->attrmap->attnums, attrmap->attnums,
+ attrmap->maplen * sizeof(AttrNumber));
+ }
entry->updatable = root->updatable;
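
The new else branch open-codes a deep copy because no copy helper for
AttrMap exists.  If one did, it would plausibly look like the sketch below;
copy_attrmap() is hypothetical and not part of this patch, while
make_attrmap() and the AttrMap layout come from src/include/access/attmap.h.

#include "postgres.h"

#include "access/attmap.h"

/*
 * Hypothetical deep-copy helper, equivalent to the make_attrmap() +
 * memcpy() sequence the patch adds inline above.
 */
static AttrMap *
copy_attrmap(AttrMap *src)
{
	AttrMap    *dst = make_attrmap(src->maplen);

	memcpy(dst->attnums, src->attnums,
		   src->maplen * sizeof(AttrNumber));
	return dst;
}

With such a helper the branch would shrink to
entry->attrmap = copy_attrmap(attrmap);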
diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c
index 98c26002e83..689a66cc72d 100644
--- a/src/backend/replication/logical/worker.c
+++ b/src/backend/replication/logical/worker.c
@@ -1477,12 +1477,13 @@ apply_handle_update_internal(ApplyExecutionData *edata,
else
{
/*
- * The tuple to be updated could not be found.
+ * The tuple to be updated could not be found. Do nothing except for
+ * emitting a log message.
*
- * TODO what to do here, change the log level to LOG perhaps?
+ * XXX should this be promoted to ereport(LOG) perhaps?
*/
elog(DEBUG1,
- "logical replication did not find row for update "
+ "logical replication did not find row to be updated "
"in replication target relation \"%s\"",
RelationGetRelationName(localrel));
}
@@ -1589,9 +1590,14 @@ apply_handle_delete_internal(ApplyExecutionData *edata,
}
else
{
- /* The tuple to be deleted could not be found. */
+ /*
+ * The tuple to be deleted could not be found. Do nothing except for
+ * emitting a log message.
+ *
+ * XXX should this be promoted to ereport(LOG) perhaps?
+ */
elog(DEBUG1,
- "logical replication did not find row for delete "
+ "logical replication did not find row to be deleted "
"in replication target relation \"%s\"",
RelationGetRelationName(localrel));
}
@@ -1728,31 +1734,31 @@ apply_handle_tuple_routing(ApplyExecutionData *edata,
found = FindReplTupleInLocalRel(estate, partrel,
&part_entry->remoterel,
remoteslot_part, &localslot);
-
- oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
- if (found)
- {
- /* Apply the update. */
- slot_modify_data(remoteslot_part, localslot,
- part_entry,
- newtup);
- MemoryContextSwitchTo(oldctx);
- }
- else
+ if (!found)
{
/*
- * The tuple to be updated could not be found.
+ * The tuple to be updated could not be found. Do nothing
+ * except for emitting a log message.
*
- * TODO what to do here, change the log level to LOG
- * perhaps?
+ * XXX should this be promoted to ereport(LOG) perhaps?
*/
elog(DEBUG1,
- "logical replication did not find row for update "
- "in replication target relation \"%s\"",
+ "logical replication did not find row to be updated "
+ "in replication target relation's partition \"%s\"",
RelationGetRelationName(partrel));
+ return;
}
/*
+ * Apply the update to the local tuple, putting the result in
+ * remoteslot_part.
+ */
+ oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
+ slot_modify_data(remoteslot_part, localslot, part_entry,
+ newtup);
+ MemoryContextSwitchTo(oldctx);
+
+ /*
* Does the updated tuple still satisfy the current
* partition's constraint?
*/
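
Read without diff markers, the restructured update path in
apply_handle_tuple_routing() now checks for a missing tuple first and bails
out early, instead of carrying the update inside an else branch.  Roughly (a
paraphrase of the hunk above, with surrounding declarations omitted):

	found = FindReplTupleInLocalRel(estate, partrel,
									&part_entry->remoterel,
									remoteslot_part, &localslot);
	if (!found)
	{
		/* Report the missing row, and do NOT touch the unset localslot. */
		elog(DEBUG1,
			 "logical replication did not find row to be updated "
			 "in replication target relation's partition \"%s\"",
			 RelationGetRelationName(partrel));
		return;
	}

	/* Only now is it safe to compute the updated tuple from localslot. */
	oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
	slot_modify_data(remoteslot_part, localslot, part_entry, newtup);
	MemoryContextSwitchTo(oldctx);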
diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c
index fe12d08a946..63f108f960f 100644
--- a/src/backend/replication/pgoutput/pgoutput.c
+++ b/src/backend/replication/pgoutput/pgoutput.c
@@ -612,8 +612,11 @@ pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
/* Convert tuples if needed. */
if (relentry->map)
{
- oldtuple = execute_attr_map_tuple(oldtuple, relentry->map);
- newtuple = execute_attr_map_tuple(newtuple, relentry->map);
+ if (oldtuple)
+ oldtuple = execute_attr_map_tuple(oldtuple,
+ relentry->map);
+ newtuple = execute_attr_map_tuple(newtuple,
+ relentry->map);
}
}
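
The publisher-side crash arose because, for an UPDATE, an old tuple is only
present when the WAL recorded one under the table's replica identity;
otherwise oldtuple is NULL and must be skipped, while newtuple is always
set.  A self-contained C sketch of the guard pattern, with toy stand-ins for
HeapTuple and execute_attr_map_tuple():

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for HeapTuple. */
typedef struct ToyTuple
{
	int			val;
} ToyTuple;

/* Dereferences its input, so passing NULL here means a crash. */
static ToyTuple *
toy_convert(ToyTuple *tup)
{
	ToyTuple   *out = malloc(sizeof(ToyTuple));

	out->val = tup->val;		/* would dereference NULL without the guard */
	return out;
}

int
main(void)
{
	ToyTuple	n = {42};
	ToyTuple   *oldtuple = NULL;	/* replica identity logged no old tuple */
	ToyTuple   *newtuple = &n;

	/* The fix: convert the old tuple only if it exists. */
	if (oldtuple)
		oldtuple = toy_convert(oldtuple);
	newtuple = toy_convert(newtuple);

	printf("new tuple: %d\n", newtuple->val);
	return 0;
}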
diff --git a/src/test/subscription/t/001_rep_changes.pl b/src/test/subscription/t/001_rep_changes.pl
index 1f654ee6afc..b2fdb289dbf 100644
--- a/src/test/subscription/t/001_rep_changes.pl
+++ b/src/test/subscription/t/001_rep_changes.pl
@@ -6,7 +6,7 @@ use strict;
use warnings;
use PostgresNode;
use TestLib;
-use Test::More tests => 28;
+use Test::More tests => 31;
# Initialize publisher node
my $node_publisher = get_new_node('publisher');
@@ -118,7 +118,7 @@ $node_publisher->safe_psql('postgres',
"INSERT INTO tab_mixed VALUES (2, 'bar', 2.2)");
$node_publisher->safe_psql('postgres',
- "INSERT INTO tab_full_pk VALUES (1, 'foo')");
+ "INSERT INTO tab_full_pk VALUES (1, 'foo'), (2, 'baz')");
$node_publisher->safe_psql('postgres',
"INSERT INTO tab_nothing VALUES (generate_series(1,20))");
@@ -297,6 +297,38 @@ is( $result, qq(local|1.1|baz|1
local|2.2|bar|2),
'update works with different column order and subscriber local values');
+$result = $node_subscriber->safe_psql('postgres',
+ "SELECT * FROM tab_full_pk ORDER BY a");
+is( $result, qq(1|bar
+2|baz),
+ 'update works with REPLICA IDENTITY FULL and a primary key');
+
+# Check that subscriber handles cases where update/delete target tuple
+# is missing. We have to look for the DEBUG1 log messages about that,
+# so temporarily bump up the log verbosity.
+$node_subscriber->append_conf('postgresql.conf', "log_min_messages = debug1");
+$node_subscriber->reload;
+
+$node_subscriber->safe_psql('postgres', "DELETE FROM tab_full_pk");
+
+$node_publisher->safe_psql('postgres',
+ "UPDATE tab_full_pk SET b = 'quux' WHERE a = 1");
+$node_publisher->safe_psql('postgres', "DELETE FROM tab_full_pk WHERE a = 2");
+
+$node_publisher->wait_for_catchup('tap_sub');
+
+my $logfile = slurp_file($node_subscriber->logfile());
+ok( $logfile =~
+ qr/logical replication did not find row to be updated in replication target relation "tab_full_pk"/,
+ 'update target row is missing');
+ok( $logfile =~
+ qr/logical replication did not find row to be deleted in replication target relation "tab_full_pk"/,
+ 'delete target row is missing');
+
+$node_subscriber->append_conf('postgresql.conf',
+ "log_min_messages = warning");
+$node_subscriber->reload;
+
# check behavior with toasted values
$node_publisher->safe_psql('postgres',
diff --git a/src/test/subscription/t/013_partition.pl b/src/test/subscription/t/013_partition.pl
index 9de01017be5..d0b5a55d186 100644
--- a/src/test/subscription/t/013_partition.pl
+++ b/src/test/subscription/t/013_partition.pl
@@ -6,7 +6,7 @@ use strict;
use warnings;
use PostgresNode;
use TestLib;
-use Test::More tests => 56;
+use Test::More tests => 62;
# setup
@@ -347,6 +347,46 @@ $result =
$node_subscriber2->safe_psql('postgres', "SELECT a FROM tab1 ORDER BY 1");
is($result, qq(), 'truncate of tab1 replicated');
+# Check that subscriber handles cases where update/delete target tuple
+# is missing. We have to look for the DEBUG1 log messages about that,
+# so temporarily bump up the log verbosity.
+$node_subscriber1->append_conf('postgresql.conf',
+ "log_min_messages = debug1");
+$node_subscriber1->reload;
+
+$node_publisher->safe_psql('postgres',
+ "INSERT INTO tab1 VALUES (1, 'foo'), (4, 'bar'), (10, 'baz')");
+
+$node_publisher->wait_for_catchup('sub1');
+$node_publisher->wait_for_catchup('sub2');
+
+$node_subscriber1->safe_psql('postgres', "DELETE FROM tab1");
+
+$node_publisher->safe_psql('postgres',
+ "UPDATE tab1 SET b = 'quux' WHERE a = 4");
+$node_publisher->safe_psql('postgres', "DELETE FROM tab1");
+
+$node_publisher->wait_for_catchup('sub1');
+$node_publisher->wait_for_catchup('sub2');
+
+my $logfile = slurp_file($node_subscriber1->logfile());
+ok( $logfile =~
+ qr/logical replication did not find row to be updated in replication target relation's partition "tab1_2_2"/,
+ 'update target row is missing in tab1_2_2');
+ok( $logfile =~
+ qr/logical replication did not find row to be deleted in replication target relation "tab1_1"/,
+ 'delete target row is missing in tab1_1');
+ok( $logfile =~
+ qr/logical replication did not find row to be deleted in replication target relation "tab1_2_2"/,
+ 'delete target row is missing in tab1_2_2');
+ok( $logfile =~
+ qr/logical replication did not find row to be deleted in replication target relation "tab1_def"/,
+ 'delete target row is missing in tab1_def');
+
+$node_subscriber1->append_conf('postgresql.conf',
+ "log_min_messages = warning");
+$node_subscriber1->reload;
+
# Tests for replication using root table identity and schema
# publisher
@@ -650,3 +690,32 @@ is( $result, qq(pub_tab2|1|xxx
pub_tab2|3|yyy
pub_tab2|5|zzz
xxx_c|6|aaa), 'inserts into tab2 replicated');
+
+# Check that subscriber handles cases where update/delete target tuple
+# is missing. We have to look for the DEBUG1 log messages about that,
+# so temporarily bump up the log verbosity.
+$node_subscriber1->append_conf('postgresql.conf',
+ "log_min_messages = debug1");
+$node_subscriber1->reload;
+
+$node_subscriber1->safe_psql('postgres', "DELETE FROM tab2");
+
+$node_publisher->safe_psql('postgres',
+ "UPDATE tab2 SET b = 'quux' WHERE a = 5");
+$node_publisher->safe_psql('postgres', "DELETE FROM tab2 WHERE a = 1");
+
+$node_publisher->wait_for_catchup('sub_viaroot');
+$node_publisher->wait_for_catchup('sub2');
+
+$logfile = slurp_file($node_subscriber1->logfile());
+ok( $logfile =~
+ qr/logical replication did not find row to be updated in replication target relation's partition "tab2_1"/,
+ 'update target row is missing in tab2_1');
+ok( $logfile =~
+ qr/logical replication did not find row to be deleted in replication target relation "tab2_1"/,
+ 'delete target row is missing in tab2_1');
+
+# No need for this until more tests are added.
+# $node_subscriber1->append_conf('postgresql.conf',
+# "log_min_messages = warning");
+# $node_subscriber1->reload;