aboutsummaryrefslogtreecommitdiff
path: root/src/bin/pg_rewind/t/RewindTest.pm
blob: 734adb72ec543b9351ff942e6b13436df4b819e5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
package RewindTest;

# Test driver for pg_rewind. Each test consists of a cycle where a new cluster
# is first created with initdb, and a streaming replication standby is set up
# to follow the master. Then the master is shut down and the standby is
# promoted, and finally pg_rewind is used to rewind the old master, using the
# standby as the source.
#
# To run a test, the test script (in t/ subdirectory) calls the functions
# in this module. These functions should be called in this sequence:
#
# 1. init_rewind_test - sets up log file etc.
#
# 2. setup_cluster - creates a PostgreSQL cluster that runs as the master
#
# 3. start_master - starts the master server
#
# 4. create_standby - runs pg_basebackup to initialize a standby server, and
#    sets it up to follow the master.
#
# 5. promote_standby - runs "pg_ctl promote" to promote the standby server.
#    The old master keeps running.
#
# 6. run_pg_rewind - stops the old master (if it's still running) and runs
#    pg_rewind to synchronize it with the now-promoted standby server.
#
# 7. clean_rewind_test - stops both servers used in the test, if they're
#    still running.
#
# The test script can use the helper functions master_psql and standby_psql
# to run psql against the master and standby servers, respectively. The
# test script can also use the $connstr_master and $connstr_standby global
# variables, which contain libpq connection strings for connecting to the
# master and standby servers. The paths of the two data directories are
# available in $test_master_datadir and $test_standby_datadir.

use strict;
use warnings;

use TestLib;
use Test::More;

use Config;
use File::Copy;
use File::Path qw(rmtree);
use IPC::Run qw(run start);

use Exporter 'import';
# Symbols exported by default to every script that uses this module: the
# connection strings and data directory paths, the file/psql/query helpers,
# and the functions that drive one rewind test cycle.
our @EXPORT = qw(
  $connstr_master
  $connstr_standby
  $test_master_datadir
  $test_standby_datadir

  append_to_file
  master_psql
  standby_psql
  check_query

  init_rewind_test
  setup_cluster
  start_master
  create_standby
  promote_standby
  run_pg_rewind
  clean_rewind_test
);

# Data directories of the two nodes.  $tmp_check is not defined in this
# file; presumably it is exported by TestLib.
our $test_master_datadir  = "$tmp_check/data_master";
our $test_standby_datadir = "$tmp_check/data_standby";

# Define non-conflicting ports for both nodes: the master listens on PGPORT
# and the standby on the port right after it.
my $port_master  = $ENV{PGPORT};
my $port_standby = $port_master + 1;

# libpq connection strings for the two nodes.  These have to be package
# variables ("our"), not lexicals: both names are listed in @EXPORT, and
# Exporter can only alias package variables, so a "my" variable would show
# up as undef in the importing test script.
our $connstr_master  = "port=$port_master";
our $connstr_standby = "port=$port_standby";

# Default database for the client programs run by the tests.
$ENV{PGDATABASE} = "postgres";

# Run one SQL command on the master through psql (quiet mode, no startup
# file).  The command line is handed to system_or_bail, which is not defined
# in this file -- presumably a TestLib helper that aborts on failure.
sub master_psql
{
	my ($sql) = @_;

	my @psql = ('psql', '-q', '--no-psqlrc', '-d', $connstr_master);
	system_or_bail(@psql, '-c', "$sql");
}

# Run one SQL command on the standby through psql (quiet mode, no startup
# file).  The command line is handed to system_or_bail, which is not defined
# in this file -- presumably a TestLib helper that aborts on failure.
sub standby_psql
{
	my ($sql) = @_;

	my @psql = ('psql', '-q', '--no-psqlrc', '-d', $connstr_standby);
	system_or_bail(@psql, '-c', "$sql");
}

# Run a query against the master and compare psql's output with the expected
# text.  Exactly one test result is reported per call.
#
# Arguments:
#   $query           - SQL to execute
#   $expected_stdout - exact output expected from psql
#   $test_name       - prefix for the reported test name
sub check_query
{
	my ($query, $expected_stdout, $test_name) = @_;

	# -q -A -t: we want just the output, no formatting
	my @psql = (
		'psql', '-q', '-A', '-t', '--no-psqlrc',
		'-d', $connstr_master, '-c', $query);

	my ($out, $err);
	my $succeeded = run(\@psql, '>', \$out, '2>', \$err);

	# The exit code and stderr are not checked with separate ok() calls,
	# so that each call of this function counts as a single test.
	return fail("$test_name: psql exit code") unless $succeeded;

	if ($err ne '')
	{
		diag $err;
		return fail("$test_name: psql no stderr");
	}

	# Drop carriage returns from the output when running under msys.
	$out =~ s/\r//g if $Config{osname} eq 'msys';
	return is($out, $expected_stdout, "$test_name: query result matches");
}

# Run a query once a second, until it returns 't' (i.e. SQL boolean true).
#
# Arguments:
#   $query   - SQL text to run
#   $connstr - connection string passed to psql with -d
#
# Returns 1 as soon as the query output is 't'.  After 90 unsuccessful
# attempts, prints the output of the last attempt with diag and returns 0.
sub poll_query_until
{
	my ($query, $connstr) = @_;

	my $max_attempts = 90;
	my $attempts     = 0;
	my ($stdout, $stderr);

	# Skip the user's startup file, like every other psql call in this
	# module: a ~/.psqlrc that adds output (e.g. by enabling \timing) would
	# otherwise keep the result from ever comparing equal to 't'.
	my $cmd =
	  [ 'psql', '--no-psqlrc', '-At', '-c', "$query", '-d', "$connstr" ];

	while ($attempts < $max_attempts)
	{
		# The exit status is deliberately not examined: only the captured
		# output decides whether we are done or have to retry.
		run($cmd, '>', \$stdout, '2>', \$stderr);

		chomp($stdout);
		$stdout =~ s/\r//g if $Config{osname} eq 'msys';
		if ($stdout eq "t")
		{
			return 1;
		}

		# Wait a second before retrying.
		sleep 1;
		$attempts++;
	}

	# The query result didn't change in 90 seconds. Give up. Print the
	# output from the last attempt, hopefully that's useful for debugging.
	chomp($stderr);
	$stderr =~ s/\r//g if $Config{osname} eq 'msys';
	diag qq(poll_query_until timed out executing this query:
$query
expecting this output:
t
last actual query output:
$stdout
with stderr:
$stderr);
	return 0;
}

# Append $str to the file $filename, creating the file if it does not exist
# yet.
#
# Returns 1 on success.  Dies with a message that includes the operating
# system error if the file cannot be opened, written to, or closed.
sub append_to_file
{
	my ($filename, $str) = @_;

	open my $fh, ">>", $filename
	  or die "could not open file $filename: $!";
	print {$fh} $str
	  or die "could not write to file $filename: $!";

	# Output is buffered, so a failed write (e.g. a full disk) may only be
	# reported when the handle is closed.
	close $fh
	  or die "could not close file $filename: $!";

	return 1;
}

# Create a fresh master cluster and configure it for the rewind tests.
sub setup_cluster
{
	# Start from scratch: remove any data directory left over from an earlier
	# run, then create the cluster with standard_initdb (not defined in this
	# file; presumably a TestLib helper).
	rmtree($test_master_datadir);
	standard_initdb($test_master_datadir);

	# Custom parameters, appended to the master's postgresql.conf
	my $master_conf    = "$test_master_datadir/postgresql.conf";
	my $extra_settings = qq(
wal_level = hot_standby
max_wal_senders = 2
wal_keep_segments = 20
max_wal_size = 200MB
shared_buffers = 1MB
wal_log_hints = on
hot_standby = on
autovacuum = off
max_connections = 10
);
	append_to_file($master_conf, $extra_settings);

	# Let the master accept replication connections
	configure_hba_for_replication($test_master_datadir);
}

# Start the master server with "pg_ctl -w start" on port $port_master,
# logging to master.log under $log_path.
sub start_master
{
	my @pg_ctl = (
		'pg_ctl', '-w',
		'-D', $test_master_datadir,
		'-l', "$log_path/master.log",
		'-o', "-p $port_master",
		'start');
	system_or_bail(@pg_ctl);

	#### After this, the test script runs its test-specific steps that
	# initialize the master before the standby is set up.
}

# Build the standby from a base backup of the master, configure it to
# stream from the master, and start it.
sub create_standby
{
	# Remove any standby data directory left over from an earlier run.
	rmtree($test_standby_datadir);

	# Take the base backup from the master, xlog files included (-x).
	my @basebackup = (
		'pg_basebackup', '-D', $test_standby_datadir,
		'-p', $port_master, '-x');
	system_or_bail(@basebackup);

	# Recovery settings that make the new node follow the master.  The
	# application_name is what promote_standby looks for in
	# pg_stat_replication.
	my $recovery_settings = qq(
primary_conninfo='$connstr_master application_name=rewind_standby'
standby_mode=on
recovery_target_timeline='latest'
);
	append_to_file("$test_standby_datadir/recovery.conf",
		$recovery_settings);

	# Start the standby on its own port.
	my @pg_ctl = (
		'pg_ctl', '-w',
		'-D', $test_standby_datadir,
		'-l', "$log_path/standby.log",
		'-o', "-p $port_standby",
		'start');
	system_or_bail(@pg_ctl);

	# The standby may have WAL to apply before it matches the primary.  That
	# is fine, because no test examines the standby before promotion.
}

# Promote the standby once it has received all WAL from the master, wait
# until it has left recovery, and then checkpoint it.  The test script calls
# this after running its test-specific steps against the started standby.
sub promote_standby
{
	# Wait for the standby to receive and write all WAL.
	my $caught_up_sql =
"SELECT pg_current_xlog_location() = write_location FROM pg_stat_replication WHERE application_name = 'rewind_standby';";
	unless (poll_query_until($caught_up_sql, $connstr_master))
	{
		die "Timed out while waiting for standby to receive and write WAL";
	}

	# Promote the standby; data inserted on the master from now on puts the
	# master out-of-sync with it.  Wait until the standby is out of recovery
	# mode and ready to accept read-write connections.
	my @promote = ('pg_ctl', '-w', '-D', $test_standby_datadir, 'promote');
	system_or_bail(@promote);
	unless (poll_query_until("SELECT NOT pg_is_in_recovery()", $connstr_standby))
	{
		die "Timed out while waiting for promotion of standby";
	}

	# Force a checkpoint after the promotion. pg_rewind looks at the control
	# file to determine what timeline the server is on, and that isn't updated
	# immediately at promotion, but only at the next checkpoint. In remote
	# mode the remaining test steps may finish so quickly that pg_rewind
	# would otherwise run before the standby has checkpointed.
	standby_psql("checkpoint");
}

# Stop the old master, rewind it with pg_rewind using the promoted standby
# as the source, and restart it as a standby of the promoted node.
#
# Argument:
#   $test_mode - "local" to use the standby's data directory as the source
#                (the standby is stopped first), or "remote" to use a
#                connection to the running standby.
#
# Dies if $test_mode is anything else, or if the master's postgresql.conf
# cannot be saved before or restored after the rewind.
sub run_pg_rewind
{
	my $test_mode = shift;

	# Stop the master and be ready to perform the rewind
	system_or_bail('pg_ctl', '-D', $test_master_datadir, '-m', 'fast', 'stop');

	# At this point, the rewind processing is ready to run.
	# We now have a very simple scenario with a few diverged WAL records.
	# The real testing begins now, with a bifurcation of the possible
	# scenarios that pg_rewind supports.

	# Keep a temporary copy of the master's postgresql.conf, or it would be
	# overwritten during the rewind.  Check the result: if the copy silently
	# failed, the master would later be restarted with the wrong settings.
	my $master_conf = "$test_master_datadir/postgresql.conf";
	my $saved_conf  = "$tmp_check/master-postgresql.conf.tmp";
	copy($master_conf, $saved_conf)
	  or die "could not copy $master_conf to $saved_conf: $!";

	# Now run pg_rewind
	if ($test_mode eq "local")
	{
		# Do rewind using a local pgdata as source.
		# Stop the standby first, so that its data directory can be read.
		system_or_bail('pg_ctl', '-D', $test_standby_datadir,
					   '-m', 'fast', 'stop');
		command_ok(['pg_rewind',
					"--debug",
					"--source-pgdata=$test_standby_datadir",
					"--target-pgdata=$test_master_datadir"],
				   'pg_rewind local');
	}
	elsif ($test_mode eq "remote")
	{
		# Do rewind using a remote connection as source
		command_ok(['pg_rewind',
					"--debug",
					"--source-server",
					"port=$port_standby dbname=postgres",
					"--target-pgdata=$test_master_datadir"],
				   'pg_rewind remote');
	}
	else
	{

		# Cannot come here normally
		die("Incorrect test mode specified");
	}

	# Now move back postgresql.conf with old settings
	move($saved_conf, $master_conf)
	  or die "could not move $saved_conf to $master_conf: $!";

	# Plug-in rewound node to the now-promoted standby node
	append_to_file(
		"$test_master_datadir/recovery.conf", qq(
primary_conninfo='port=$port_standby'
standby_mode=on
recovery_target_timeline='latest'
));

	# Restart the master to check that rewind went correctly
	system_or_bail('pg_ctl', '-w', '-D', $test_master_datadir,
				   '-l', "$log_path/master.log",
				   '-o', "-p $port_master", 'start');

	#### Now run the test-specific parts to check the result
}

# Clean up after the test: issue an immediate-mode "pg_ctl stop" for each
# node whose data directory variable is set.  The exit status of pg_ctl is
# not checked, so a server that is not running is not treated as an error.
sub clean_rewind_test
{
	system('pg_ctl', '-D', $test_master_datadir, '-m', 'immediate', 'stop')
	  if $test_master_datadir;

	system('pg_ctl', '-D', $test_standby_datadir, '-m', 'immediate', 'stop')
	  if $test_standby_datadir;
}

# Stop the test servers when the script exits, just in case they're still
# running.
END
{
	# clean_rewind_test() runs pg_ctl through system(), which overwrites $?.
	# Save the script's exit status first and restore it afterwards.
	my $exit_status = $?;
	clean_rewind_test();
	$? = $exit_status;
}