[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#928378: unblock: pg-checksums/0.8-3



Package: release.debian.org
Severity: normal
User: release.debian.org@packages.debian.org
Usertags: unblock

Please unblock package pg-checksums

It fixes #928232 which is a rather serious problem for a program that is
supposed to find data corruption.

unblock pg-checksums/0.8-3

-- System Information:
Debian Release: 8.10
  APT prefers oldstable
  APT policy: (500, 'oldstable')
Architecture: i386 (i686)

Kernel: Linux 3.16.0-4-686-pae (SMP w/1 CPU core)
Locale: LANG=de_DE.UTF-8, LC_CTYPE=de_DE.UTF-8 (charmap=UTF-8)
Shell: /bin/sh linked to /bin/dash
Init: systemd (via /run/systemd/system)
diff -Nru pg-checksums-0.8/debian/changelog pg-checksums-0.8/debian/changelog
--- pg-checksums-0.8/debian/changelog	2019-03-09 16:14:40.000000000 +0100
+++ pg-checksums-0.8/debian/changelog	2019-04-30 14:55:21.000000000 +0200
@@ -1,3 +1,16 @@
+pg-checksums (0.8-3) unstable; urgency=medium
+
+  [ Christoph Berg ]
+  * Remove myself from uploaders.
+
+  [ Michael Banck ]
+  * debian/patches/zero_random_pageheader.patch: New patch, fixes false
+    negatives for all-zero or random pageheaders which were not detected as
+    checksum failures previously. Adapted from upstream commit 043f003f
+    (Closes: #928232).
+
+ -- Michael Banck <michael.banck@credativ.de>  Tue, 30 Apr 2019 14:55:21 +0200
+
 pg-checksums (0.8-2) unstable; urgency=medium
 
   * debian/patches/fsync_pgdata.patch: New patch, backports the fsync_pgdata()
diff -Nru pg-checksums-0.8/debian/control pg-checksums-0.8/debian/control
--- pg-checksums-0.8/debian/control	2019-02-26 21:39:57.000000000 +0100
+++ pg-checksums-0.8/debian/control	2019-04-30 14:44:14.000000000 +0200
@@ -3,7 +3,6 @@
 Priority: optional
 Maintainer: Debian PostgreSQL Maintainers <team+postgresql@tracker.debian.org>
 Uploaders: Michael Banck <michael.banck@credativ.de>,
- Christoph Berg <myon@debian.org>,
 Build-Depends:
  debhelper (>= 9),
  postgresql-server-dev-all (>= 153~),
diff -Nru pg-checksums-0.8/debian/control.in pg-checksums-0.8/debian/control.in
--- pg-checksums-0.8/debian/control.in	2018-10-12 14:04:31.000000000 +0200
+++ pg-checksums-0.8/debian/control.in	2019-04-15 23:18:34.000000000 +0200
@@ -3,7 +3,6 @@
 Priority: optional
 Maintainer: Debian PostgreSQL Maintainers <team+postgresql@tracker.debian.org>
 Uploaders: Michael Banck <michael.banck@credativ.de>,
- Christoph Berg <myon@debian.org>,
 Build-Depends:
  debhelper (>= 9),
  postgresql-server-dev-all (>= 153~),
diff -Nru pg-checksums-0.8/debian/patches/series pg-checksums-0.8/debian/patches/series
--- pg-checksums-0.8/debian/patches/series	2019-02-26 21:19:49.000000000 +0100
+++ pg-checksums-0.8/debian/patches/series	2019-04-23 17:14:59.000000000 +0200
@@ -1,2 +1,3 @@
 postgresnode.patch
 fsync_pgdata.patch
+zero_random_pageheader.patch
diff -Nru pg-checksums-0.8/debian/patches/zero_random_pageheader.patch pg-checksums-0.8/debian/patches/zero_random_pageheader.patch
--- pg-checksums-0.8/debian/patches/zero_random_pageheader.patch	1970-01-01 01:00:00.000000000 +0100
+++ pg-checksums-0.8/debian/patches/zero_random_pageheader.patch	2019-04-30 13:40:05.000000000 +0200
@@ -0,0 +1,355 @@
+commit 043f003fc1fc1cdf8a370971edf6e9323034b15d
+Author: Michael Banck <mbanck@debian.org>
+Date:   Tue Apr 23 16:03:16 2019 +0200
+
+    Fix checksum failure detection for all-zero or random pageheaders.
+    
+    Previously, a zeroed-out pageheader would lead to that page getting skipped as
+    being new, even if other parts of the page have (corrupted) data in it.  Add an
+    additional check that the whole page shall be zero and report a checksum
+    failure if that is not the case.
+    
+    Also, random data in the pageheader would very likely make the page's LSN so
+    large that it would be considered newer than the checkpoint LSN.  As we are not
+    connected to the server, we cannot inquire the current insert record pointer to
+    check against or lock the page, so we (i) advance the checkpoint LSN to the
+    current pg_control value and (ii) demand that the upper 32 bits of the LSN are
+    identical between the checkpoint LSN and the page's LSN in order to skip the
+    page rather than report a checksum failure.
+    
+    Additional TAP tests are added for both cases as well.
+
+Index: pg-checksums/pg_checksums.c
+===================================================================
+--- pg-checksums.orig/pg_checksums.c
++++ pg-checksums/pg_checksums.c
+@@ -77,6 +77,8 @@ static bool deactivate = false;
+ static bool show_progress = false;
+ static bool online = false;
+ 
++static char *DataDir = NULL;
++
+ static const char *progname;
+ 
+ /*
+@@ -181,6 +183,28 @@ isRelFileName(const char *fn)
+ 	return true;
+ }
+ 
++ static void
++update_checkpoint_lsn(void)
++{
++	bool	crc_ok;
++
++#if PG_VERSION_NUM >= 100000
++	ControlFile = get_controlfile(DataDir, progname, &crc_ok);
++	if (!crc_ok)
++	{
++		fprintf(stderr, _("%s: pg_control CRC value is incorrect\n"), progname);
++		exit(1);
++	}
++#elif PG_VERSION_NUM >= 90600
++	ControlFile = get_controlfile(DataDir, progname);
++#else
++	ControlFile = getControlFile(DataDir);
++#endif
++
++	/* Update checkpointLSN with the current value */
++	checkpointLSN = ControlFile->checkPoint;
++}
++
+ static void
+ toggle_progress_report(int signo,
+ 					   siginfo_t * siginfo,
+@@ -252,6 +276,8 @@ scan_file(const char *fn, BlockNumber se
+ 	int			f;
+ 	BlockNumber	blockno;
+ 	bool		block_retry = false;
++	bool		all_zeroes;
++	size_t	   *pagebytes;
+ 
+ 	f = open(fn, O_RDWR | PG_BINARY, 0);
+ 	if (f < 0)
+@@ -273,6 +299,7 @@ scan_file(const char *fn, BlockNumber se
+ 	{
+ 		uint16		csum;
+ 		int			r = read(f, buf.data, BLCKSZ);
++		int			i;
+ 
+ 		if (debug && block_retry)
+ 			fprintf(stderr, _("%s: retrying block %d in file \"%s\"\n"),
+@@ -321,18 +348,38 @@ scan_file(const char *fn, BlockNumber se
+ 			continue;
+ 		}
+ 
++		blocks++;
++
+ 		/* New pages have no checksum yet */
+ 		if (PageIsNew(header))
+ 		{
+-			if (debug && block_retry)
+-				fprintf(stderr, _("%s: block %d in file \"%s\" is new, ignoring\n"),
+-						progname, blockno, fn);
+-			skippedblocks++;
++			/* Check for an all-zeroes page */
++			all_zeroes = true;
++			pagebytes = (size_t *) buf.data;
++			for (i = 0; i < (BLCKSZ / sizeof(size_t)); i++)
++			{
++				if (pagebytes[i] != 0)
++				{
++					all_zeroes = false;
++					break;
++				}
++			}
++			if (!all_zeroes)
++			{
++				fprintf(stderr, _("%s: checksum verification failed in file \"%s\", block %u: pd_upper is zero but block is not all-zero\n"),
++						progname, fn, blockno);
++				badblocks++;
++			}
++			else
++			{
++				if (debug && block_retry)
++					fprintf(stderr, _("%s: block %d in file \"%s\" is new, ignoring\n"),
++							progname, blockno, fn);
++				skippedblocks++;
++			}
+ 			continue;
+ 		}
+ 
+-		blocks++;
+-
+ 		csum = pg_checksum_page(buf.data, blockno + segmentno * RELSEG_SIZE);
+ 		current_size += r;
+ 
+@@ -367,23 +414,34 @@ scan_file(const char *fn, BlockNumber se
+ 
+ 					/* Reset loop to validate the block again */
+ 					blockno--;
++					blocks--;
+ 					current_size -= r;
+ 
++					/*
++					 * Update the checkpoint LSN now. If we get a failure on
++					 * re-read, we would need to do this anyway, and doing it
++					 * now lowers the probability that we see the same torn
++					 * page on re-read.
++					 */
++					update_checkpoint_lsn();
++
+ 					continue;
+ 				}
+ 
+ 				/*
+ 				 * The checksum verification failed on retry as well.  Check if
+ 				 * the page has been modified since the checkpoint and skip it
+-				 * in this case.
++				 * in this case. As a sanity check, demand that the upper 32
++				 * bits of the LSN are identical in order to skip as a guard
++				 * against a corrupted LSN in the pageheader.
+ 				 */
+-				if (PageGetLSN(buf.data) > checkpointLSN)
++				if ((PageGetLSN(buf.data) > checkpointLSN) &&
++					(PageGetLSN(buf.data) >> 32 == checkpointLSN >> 32))
+ 				{
+ 					if (debug)
+ 						fprintf(stderr, _("%s: block %d in file \"%s\" with LSN %X/%X is newer than checkpoint LSN %X/%X, ignoring\n"),
+ 								progname, blockno, fn, (uint32) (PageGetLSN(buf.data) >> 32), (uint32) PageGetLSN(buf.data), (uint32) (checkpointLSN >> 32), (uint32) checkpointLSN);
+ 					block_retry = false;
+-					blocks--;
+ 					skippedblocks++;
+ 					continue;
+ 				}
+@@ -897,7 +955,6 @@ main(int argc, char *argv[])
+ 		{NULL, 0, NULL, 0}
+ 	};
+ 
+-	char	   *DataDir = NULL;
+ 	int			c;
+ 	int			option_index;
+ #if PG_VERSION_NUM >= 100000
+Index: pg-checksums/t/001_checksums.pl
+===================================================================
+--- pg-checksums.orig/t/001_checksums.pl
++++ pg-checksums/t/001_checksums.pl
+@@ -6,7 +6,7 @@ use Cwd;
+ use Config;
+ use PostgresNode;
+ use TestLib;
+-use Test::More tests => 49;
++use Test::More tests => 97;
+ 
+ program_help_ok('pg_checksums');
+ program_version_ok('pg_checksums');
+@@ -14,8 +14,8 @@ program_options_handling_ok('pg_checksum
+ 
+ my $tempdir = TestLib::tempdir;
+ 
+-# Initialize node
+-my $node = get_new_node('main');
++# Initialize node with checksums disabled.
++my $node = get_new_node('node_checksum');
+ $node->init;
+ 
+ $node->start;
+@@ -80,70 +80,112 @@ $node->stop;
+ $node->command_ok(['pg_checksums', '-a', '-D', $pgdata],
+         'pg_checksums are again activated in offline cluster');
+ 
+-#exit 0;
+ $node->start;
+ 
+ $node->command_ok(['pg_checksums', '-c', '-D', $pgdata],
+         'pg_checksums can be verified in online cluster');
+ 
+-# create table to corrupt and get their relfilenode
+-create_corruption($node, 'corrupt1', 'pg_default');
+-
+-$node->command_checks_all([ 'pg_checksums', '-c', '-D', $pgdata],
+-        1,
+-        [qr/Bad checksums:  1/s],
+-        [qr/checksum verification failed/s],
+-        'pg_checksums reports checksum mismatch'
+-);
+-
+-# drop corrupt table again and make sure there is no more corruption
+-$node->safe_psql('postgres', 'DROP TABLE corrupt1;');
+-$node->command_ok(['pg_checksums', '-c', '-D', $pgdata],
+-        'pg_checksums can be verified in online cluster: '.getcwd());
+-
+-
+-# create table to corrupt in a non-default tablespace and get their relfilenode
+-my $tablespace_dir = getcwd()."/tmp_check/ts_corrupt_dir";
++# Set page header and block size
++my $pageheader_size = 24;
++my $block_size = $node->safe_psql('postgres', 'SHOW block_size;');
++
++# Check corruption of table on default tablespace.
++check_relation_corruption($node, 'corrupt1', 'pg_default', $pageheader_size, "\0\0\0\0\0\0\0\0\0", "on tablespace pg_default");
++
++# Create tablespace to check corruptions in a non-default tablespace.
++my $basedir = $node->basedir;
++my $tablespace_dir = "$basedir/ts_corrupt_dir";
+ mkdir ($tablespace_dir);
+-$node->safe_psql('postgres', "CREATE TABLESPACE ts_corrupt LOCATION '".$tablespace_dir."';");
+-create_corruption($node, 'corrupt2', 'ts_corrupt');
+-
+-$node->command_checks_all([ 'pg_checksums', '-c', '-D', $pgdata],
+-        1,
+-        [qr/Bad checksums:  1/s],
+-        [qr/checksum verification failed/s],
+-        'pg_checksums reports checksum mismatch on non-default tablespace'
+-);
+-
+-# drop corrupt table again and make sure there is no more corruption
+-$node->safe_psql('postgres', 'DROP TABLE corrupt2;');
+-$node->command_ok(['pg_checksums', '-c', '-D', $pgdata],
+-        'pg_checksums can be verified in online cluster');
+-
+-# Utility routine to create a table with corrupted checksums.
+-# It stops the node (if running), and starts it again.
+-sub create_corruption
++$tablespace_dir = TestLib::real_dir($tablespace_dir);
++$node->safe_psql('postgres',
++    "CREATE TABLESPACE ts_corrupt LOCATION '$tablespace_dir';");
++check_relation_corruption($node, 'corrupt2', 'ts_corrupt', $pageheader_size, "\0\0\0\0\0\0\0\0\0", "on tablespace ts_corrupt");
++
++# Check corruption in the pageheader with random data in it
++my $random_data = join '', map { ("a".."z")[rand 26] } 1 .. $pageheader_size;
++check_relation_corruption($node, 'corrupt1', 'pg_default', 0, $random_data, "with random data in pageheader");
++
++# Check corruption when the pageheader has been zeroed-out completely
++my $zero_data = "\0"x$pageheader_size;
++check_relation_corruption($node, 'corrupt1', 'pg_default', 0, $zero_data, "with zeroed-out pageheader");
++
++# Utility routine to create and check a table with corrupted checksums
++# on a wanted tablespace.  Note that this stops and starts the node
++# multiple times to perform the checks, leaving the node started
++# at the end.
++sub check_relation_corruption
+ {
+ 	my $node = shift;
+ 	my $table = shift;
+ 	my $tablespace = shift;
+-
+-	my $query = "SELECT a INTO ".$table." FROM generate_series(1,10000) AS a; ALTER TABLE ".$table." SET (autovacuum_enabled=false), SET TABLESPACE ".$tablespace."; SELECT pg_relation_filepath('".$table."')";
+-	my $file_name = $node->safe_psql('postgres', $query);
+-
+-	# set page header and block sizes
+-	my $pageheader_size = 24;
+-	my $block_size = $node->safe_psql('postgres', 'SHOW block_size;');
++	my $offset = shift;
++	my $corrupted_data = shift;
++	my $description = shift;
++	my $pgdata = $node->data_dir;
++
++	$node->safe_psql('postgres',
++		"SELECT a INTO $table FROM generate_series(1,10000) AS a;
++		ALTER TABLE $table SET (autovacuum_enabled=false);");
++
++	$node->safe_psql('postgres',
++		"ALTER TABLE ".$table." SET TABLESPACE ".$tablespace.";");
++
++	my $file_corrupted = $node->safe_psql('postgres',
++		"SELECT pg_relation_filepath('$table');");
++	my $relfilenode_corrupted =  $node->safe_psql('postgres',
++		"SELECT relfilenode FROM pg_class WHERE relname = '$table';");
+ 
+ 	$node->stop;
+ 
+-	open my $file, '+<', "$pgdata/$file_name";
+-	seek($file, $pageheader_size, 0);
+-	syswrite($file, '\0\0\0\0\0\0\0\0\0');
++	# Checksums are correct for single relfilenode as the table is not
++	# corrupted yet.
++	command_ok(['pg_checksums',  '-c', '-D', $pgdata, '-r',
++			   $relfilenode_corrupted],
++		"succeeds for single relfilenode $description with offline cluster");
++
++	# Time to create some corruption
++	open my $file, '+<', "$pgdata/$file_corrupted";
++	seek($file, $offset, 0);
++	syswrite($file, $corrupted_data);
+ 	close $file;
+ 
++	# Checksum checks on single relfilenode fail
++	$node->command_checks_all([ 'pg_checksums', '-c', '-D', $pgdata,
++							  '-r', $relfilenode_corrupted],
++							  1,
++							  [qr/Bad checksums:.*1/],
++							  [qr/checksum verification failed/],
++							  "fails with corrupted data $description");
++
++	# Global checksum checks fail as well
++	$node->command_checks_all([ 'pg_checksums', '-c', '-D', $pgdata],
++							  1,
++							  [qr/Bad checksums:.*1/],
++							  [qr/checksum verification failed/],
++							  "fails with corrupted data for single relfilenode on tablespace $tablespace");
++
++	# Now check online as well
+ 	$node->start;
+ 
++	# Checksum checks on single relfilenode fail
++	$node->command_checks_all([ 'pg_checksums', '-c', '-D', $pgdata,
++							  '-r', $relfilenode_corrupted],
++							  1,
++							  [qr/Bad checksums:.*1/],
++							  [qr/checksum verification failed/],
++							  "fails with corrupted data $description");
++
++	# Global checksum checks fail as well
++	$node->command_checks_all([ 'pg_checksums', '-c', '-D', $pgdata],
++							  1,
++							  [qr/Bad checksums:.*1/],
++							  [qr/checksum verification failed/],
++							  "fails with corrupted data for single relfilenode on tablespace $tablespace");
++
++	# Drop corrupted table again and make sure there is no more corruption.
++	$node->safe_psql('postgres', "DROP TABLE $table;");
++	$node->command_ok(['pg_checksums', '-c', '-D', $pgdata],
++		"succeeds again after table drop on tablespace $tablespace");
+ 	return;
+ }
+ 
diff -Nru pg-checksums-0.8/debian/source/lintian-overrides pg-checksums-0.8/debian/source/lintian-overrides
--- pg-checksums-0.8/debian/source/lintian-overrides	2018-10-12 14:04:31.000000000 +0200
+++ pg-checksums-0.8/debian/source/lintian-overrides	1970-01-01 01:00:00.000000000 +0100
@@ -1,3 +0,0 @@
-# don't bug people uploading from @work
-source: changelog-should-mention-nmu
-source: source-nmu-has-incorrect-version-number

Reply to: