[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: Bug#486071: Pre-approval for heartbeat testing-proposed-updates upload (2.1.3-6lenny1)



On Fri, Feb 13, 2009 at 11:36:11AM +0100, Philipp Kern wrote:
> On Fri, Feb 13, 2009 at 10:54:02AM +0100, Luk Claes wrote:
> > Simon Horman wrote:
> > > I would like to upload a(nother) fresh version of heartbeat to fix
> > > a fairly severe bug. The fix has been in unstable since 2.1.3-7 and
> > > was included upstream at around the same time - it seems to be well tested.
> > Unfortunately this is too late to make it. Please request to include it
> > in r1 after the release, TIA.
> 
> Oh and please attach a debdiff from the version in Lenny to the one with
> your fix to the bug report.  Your upload should target
> 'stable-proposed-updates'.  TIA.

Thanks. As Lenny has now been released, I have made the upload.
The debdiff is below; this email should record it in the BTS.

-- 
Simon Horman
  VA Linux Systems Japan K.K., Sydney, Australia Satellite Office
  H: www.vergenet.net/~horms/             W: www.valinux.co.jp/en


diff -u heartbeat-2.1.3/version.Debian heartbeat-2.1.3/version.Debian
--- heartbeat-2.1.3/version.Debian
+++ heartbeat-2.1.3/version.Debian
@@ -1 +1 @@
-2.1.3-5
+2.1.3-6lenny1
diff -u heartbeat-2.1.3/debian/changelog heartbeat-2.1.3/debian/changelog
--- heartbeat-2.1.3/debian/changelog
+++ heartbeat-2.1.3/debian/changelog
@@ -1,3 +1,14 @@
+heartbeat (2.1.3-6lenny1) stable-proposed-updates; urgency=low
+
+  * dopd: fix basic failover; fix hb message corruption by fprintf(stderr)
+    Patch: fix-basic-failover-fix-hb-message-corruption-by-fprintf.patch
+    Upstream-Status: commit 47f60bebe7b25abd88ea7b5488e66dfe187416ae
+                     "dopd: fix basic failover; fix hb message corruption by
+                      fprintf(stderr)"
+    (closes: #486071)
+
+ -- Simon Horman <horms@debian.org>  Mon, 16 Feb 2009 02:54:43 +0000
+
 heartbeat (2.1.3-6lenny0) testing-proposed-updates; urgency=low
 
   * heartbeat-gui dependancy on python-xml
only in patch2:
unchanged:
--- heartbeat-2.1.3.orig/debian/patches/dopd-fix-basic-failover-fix-hb-message-corruption-by-fprintf_stderr_.patch
+++ heartbeat-2.1.3/debian/patches/dopd-fix-basic-failover-fix-hb-message-corruption-by-fprintf_stderr_.patch
@@ -0,0 +1,110 @@
+# HG changeset patch
+# User Rasto Levrinc <rasto@linbit.com>
+# Date 1206539836 -3600
+# Node ID 47f60bebe7b25abd88ea7b5488e66dfe187416ae
+# Parent  17c0cf487322287d0689a036c32f21b900ce5a80
+dopd: fix basic failover; fix hb message corruption by fprintf(stderr)
+
+check_drbd_peer() used to return FALSE for "node name not in node list",
+so drbd-peer-outdater returned "invalid nodename".
+Then the semantic changed, and check_drbd_peer learned about "dead" peers
+and returned FALSE for them as well. Which made basic failover impossible :(
+
+The return code was now changed to "peer unreachable" for a dead peer.
+And even for nodes which really are not in the host list (and thus could be
+classified as invalid), because, after all, that's what they are:
+unreachable.
+
+Node name comparison needs to be case insensitive; fixed.
+
+During testing with 15 concurrent drbd resources several dopd crashes have been
+observed, which after some debugging turned out to be simply a wrong assumption
+about the global availability of stderr: some fprintf(stderr, "debug message")
+had accidentally used the heartbeat communication channel file descriptor,
+which seriously confused the comm layer.
+All those fprintfs have now been changed to use cl_log.
+
+diff -r 17c0cf487322 -r 47f60bebe7b2 contrib/drbd-outdate-peer/dopd.c
+--- a/contrib/drbd-outdate-peer/dopd.c	Mon Mar 24 16:14:12 2008 +0100
++++ b/contrib/drbd-outdate-peer/dopd.c	Wed Mar 26 14:57:16 2008 +0100
+@@ -202,14 +202,17 @@
+ }
+ 
+ /* check_drbd_peer()
+- * walk the nodes and return TRUE if peer is not this node and it exists.
++ * walk the nodes and return
++ *  FALSE if peer is not found, not a "normal" node, or "dead"
++ *    (no point in trying to reach those nodes).
++ *  TRUE if peer is found to be alive and "normal".
+  */
+ gboolean
+ check_drbd_peer(const char *drbd_peer)
+ {
+ 	const char *node;
+ 	gboolean found = FALSE;
+-	if (!strcmp(drbd_peer, node_name)) {
++	if (!strcasecmp(drbd_peer, node_name)) {
+ 		cl_log(LOG_WARNING, "drbd peer node %s is me!\n", drbd_peer);
+ 		return FALSE;
+ 	}
+@@ -306,9 +309,9 @@
+ 			} else
+ 				pthread_mutex_unlock(&conn_mutex);
+ 		} else {
+-			/* wrong peer was specified,
+-			   send return code 20 to the client */
+-			send_to_client(curr_client, "20");
++			/* peer "dead" or not in node list.
++			 * return "peer could not be reached" */
++			send_to_client(curr_client, "5");
+ 		}
+ 
+ 		ha_msg_del(msg);
+diff -r 17c0cf487322 -r 47f60bebe7b2 contrib/drbd-outdate-peer/drbd-peer-outdater.c
+--- a/contrib/drbd-outdate-peer/drbd-peer-outdater.c	Mon Mar 24 16:14:12 2008 +0100
++++ b/contrib/drbd-outdate-peer/drbd-peer-outdater.c	Wed Mar 26 14:57:16 2008 +0100
+@@ -76,7 +76,7 @@
+ 
+ 	msg = msgfromIPC_noauth(server);
+ 	if (!msg) {
+-		fprintf(stderr, "no message from server or other "
++		cl_log(LOG_WARNING, "no message from server or other "
+ 				"instance is running\n");
+ 		if (client->mainloop != NULL &&
+ 		    g_main_is_running(client->mainloop))
+@@ -92,7 +92,7 @@
+ 	errno = 0;
+ 	rc = strtol(rc_string, &ep, 10);
+ 	if (errno != 0 || *ep != EOS) {
+-		fprintf(stderr, "unknown message: %s from server", rc_string);
++		cl_log(LOG_WARNING, "unknown message: %s from server", rc_string);
+ 		client->rc = 20; /* "officially undefined", unspecified error */
+ 		ha_msg_del(msg);
+ 		if (client->mainloop != NULL &&
+@@ -124,7 +124,7 @@
+ outdater_timeout_dispatch(gpointer user_data)
+ {
+ 	dop_client_t *client = (dop_client_t *)user_data;
+-	fprintf(stderr, "error: could not connect to dopd after %i seconds"
++	cl_log(LOG_WARNING, "error: could not connect to dopd after %i seconds"
+ 			": timeout reached\n", client->timeout);
+ 	if (client->mainloop != NULL && g_main_is_running(client->mainloop))
+ 		g_main_quit(client->mainloop);
+@@ -255,7 +255,7 @@
+ 					 (gpointer)new_client, &ipc_server);
+ 
+ 	if (ipc_server == NULL) {
+-		fprintf(stderr, "Could not connect to "T_OUTDATER" channel\n");
++		cl_log(LOG_WARNING, "Could not connect to "T_OUTDATER" channel\n");
+ 		dop_exit(new_client); /* unreachable */
+ 	}
+ 
+@@ -267,7 +267,7 @@
+ 	ha_msg_add(update, F_OUTDATER_RES, drbd_resource);
+ 
+ 	if (msg2ipcchan(update, ipc_server) != HA_OK) {
+-		fprintf(stderr, "Could not send message\n");
++		cl_log(LOG_WARNING, "Could not send message\n");
+ 		dop_exit(new_client);
+ 	}
+ 
only in patch2:
unchanged:
--- heartbeat-2.1.3.orig/debian/patches/series/2.1.3-6lenny1
+++ heartbeat-2.1.3/debian/patches/series/2.1.3-6lenny1
@@ -0,0 +1 @@
++ dopd-fix-basic-failover-fix-hb-message-corruption-by-fprintf_stderr_.patch
only in patch2:
unchanged:
--- heartbeat-2.1.3.orig/contrib/drbd-outdate-peer/drbd-peer-outdater.c
+++ heartbeat-2.1.3/contrib/drbd-outdate-peer/drbd-peer-outdater.c
@@ -76,7 +76,7 @@
 
 	msg = msgfromIPC_noauth(server);
 	if (!msg) {
-		fprintf(stderr, "no message from server or other "
+		cl_log(LOG_WARNING, "no message from server or other "
 				"instance is running\n");
 		if (client->mainloop != NULL &&
 		    g_main_is_running(client->mainloop))
@@ -92,7 +92,7 @@
 	errno = 0;
 	rc = strtol(rc_string, &ep, 10);
 	if (errno != 0 || *ep != EOS) {
-		fprintf(stderr, "unknown message: %s from server", rc_string);
+		cl_log(LOG_WARNING, "unknown message: %s from server", rc_string);
 		client->rc = 20; /* "officially undefined", unspecified error */
 		ha_msg_del(msg);
 		if (client->mainloop != NULL &&
@@ -124,7 +124,7 @@
 outdater_timeout_dispatch(gpointer user_data)
 {
 	dop_client_t *client = (dop_client_t *)user_data;
-	fprintf(stderr, "error: could not connect to dopd after %i seconds"
+	cl_log(LOG_WARNING, "error: could not connect to dopd after %i seconds"
 			": timeout reached\n", client->timeout);
 	if (client->mainloop != NULL && g_main_is_running(client->mainloop))
 		g_main_quit(client->mainloop);
@@ -255,7 +255,7 @@
 					 (gpointer)new_client, &ipc_server);
 
 	if (ipc_server == NULL) {
-		fprintf(stderr, "Could not connect to "T_OUTDATER" channel\n");
+		cl_log(LOG_WARNING, "Could not connect to "T_OUTDATER" channel\n");
 		dop_exit(new_client); /* unreachable */
 	}
 
@@ -267,7 +267,7 @@
 	ha_msg_add(update, F_OUTDATER_RES, drbd_resource);
 
 	if (msg2ipcchan(update, ipc_server) != HA_OK) {
-		fprintf(stderr, "Could not send message\n");
+		cl_log(LOG_WARNING, "Could not send message\n");
 		dop_exit(new_client);
 	}
 
only in patch2:
unchanged:
--- heartbeat-2.1.3.orig/contrib/drbd-outdate-peer/dopd.c
+++ heartbeat-2.1.3/contrib/drbd-outdate-peer/dopd.c
@@ -202,14 +202,17 @@
 }
 
 /* check_drbd_peer()
- * walk the nodes and return TRUE if peer is not this node and it exists.
+ * walk the nodes and return
+ *  FALSE if peer is not found, not a "normal" node, or "dead"
+ *    (no point in trying to reach those nodes).
+ *  TRUE if peer is found to be alive and "normal".
  */
 gboolean
 check_drbd_peer(const char *drbd_peer)
 {
 	const char *node;
 	gboolean found = FALSE;
-	if (!strcmp(drbd_peer, node_name)) {
+	if (!strcasecmp(drbd_peer, node_name)) {
 		cl_log(LOG_WARNING, "drbd peer node %s is me!\n", drbd_peer);
 		return FALSE;
 	}
@@ -306,9 +309,9 @@
 			} else
 				pthread_mutex_unlock(&conn_mutex);
 		} else {
-			/* wrong peer was specified,
-			   send return code 20 to the client */
-			send_to_client(curr_client, "20");
+			/* peer "dead" or not in node list.
+			 * return "peer could not be reached" */
+			send_to_client(curr_client, "5");
 		}
 
 		ha_msg_del(msg);


Reply to: