Bug#928771: unblock: sbd/1.4.0-18-g5e3283c-1
Package: release.debian.org
Severity: normal
User: release.debian.org@packages.debian.org
Usertags: unblock
Please unblock package sbd.
The new version contains upstream fixes for some use cases and updates
the package tests to work with the Corosync/Pacemaker versions in buster.
unblock sbd/1.4.0-18-g5e3283c-1
-- System Information:
Debian Release: buster/sid
APT prefers unstable
APT policy: (500, 'unstable')
Architecture: amd64 (x86_64)
Kernel: Linux 4.19.0-3-amd64 (SMP w/8 CPU cores)
Locale: LANG=en_US.UTF-8, LC_CTYPE=en_US.UTF-8 (charmap=UTF-8), LANGUAGE=en_US.UTF-8 (charmap=UTF-8)
Shell: /bin/sh linked to /bin/bash
Init: systemd (via /run/systemd/system)
LSM: AppArmor: enabled
diff -Nru sbd-1.4.0/debian/changelog sbd-1.4.0-18-g5e3283c/debian/changelog
--- sbd-1.4.0/debian/changelog 2019-01-15 09:25:28.000000000 +0100
+++ sbd-1.4.0-18-g5e3283c/debian/changelog 2019-05-08 10:55:44.000000000 +0200
@@ -1,3 +1,12 @@
+sbd (1.4.0-18-g5e3283c-1) unstable; urgency=medium
+
+ * New upstream version 1.4.0-18-g5e3283c (Closes: #925821)
+ * debian/sbd.lintian-overrides: update manpage line
+ * debian/patches: use /run for PIDFile location
+ * debian/tests: update for corosync v3
+
+ -- Valentin Vidic <vvidic@debian.org> Wed, 08 May 2019 10:55:44 +0200
+
sbd (1.4.0-1) unstable; urgency=medium
* New upstream version 1.4.0
diff -Nru sbd-1.4.0/debian/patches/pidfile-in-runstatedir.patch sbd-1.4.0-18-g5e3283c/debian/patches/pidfile-in-runstatedir.patch
--- sbd-1.4.0/debian/patches/pidfile-in-runstatedir.patch 1970-01-01 01:00:00.000000000 +0100
+++ sbd-1.4.0-18-g5e3283c/debian/patches/pidfile-in-runstatedir.patch 2019-05-08 10:55:20.000000000 +0200
@@ -0,0 +1,28 @@
+Description: Use /run for PIDFile location
+ systemd complains if PIDFile uses /var/run
+Author: Valentin Vidic <vvidic@debian.org>
+Last-Update: 2019-04-26
+---
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
+--- a/src/sbd.service.in
++++ b/src/sbd.service.in
+@@ -10,7 +10,7 @@
+
+ [Service]
+ Type=forking
+-PIDFile=@localstatedir@/run/sbd.pid
++PIDFile=@runstatedir@/sbd.pid
+ EnvironmentFile=-@CONFIGDIR@/sbd
+ ExecStart=@sbindir@/sbd $SBD_OPTS -p @localstatedir@/run/sbd.pid watch
+ ExecStop=@bindir@/kill -TERM $MAINPID
+--- a/src/sbd_remote.service.in
++++ b/src/sbd_remote.service.in
+@@ -8,7 +8,7 @@
+
+ [Service]
+ Type=forking
+-PIDFile=@localstatedir@/run/sbd.pid
++PIDFile=@runstatedir@/sbd.pid
+ EnvironmentFile=-@CONFIGDIR@/sbd
+ ExecStart=@sbindir@/sbd $SBD_OPTS -p @localstatedir@/run/sbd.pid watch
+ ExecStop=@bindir@/kill -TERM $MAINPID
diff -Nru sbd-1.4.0/debian/patches/series sbd-1.4.0-18-g5e3283c/debian/patches/series
--- sbd-1.4.0/debian/patches/series 1970-01-01 01:00:00.000000000 +0100
+++ sbd-1.4.0-18-g5e3283c/debian/patches/series 2019-05-08 10:55:20.000000000 +0200
@@ -0,0 +1 @@
+pidfile-in-runstatedir.patch
diff -Nru sbd-1.4.0/debian/sbd.lintian-overrides sbd-1.4.0-18-g5e3283c/debian/sbd.lintian-overrides
--- sbd-1.4.0/debian/sbd.lintian-overrides 2019-01-15 09:12:00.000000000 +0100
+++ sbd-1.4.0-18-g5e3283c/debian/sbd.lintian-overrides 2019-05-08 10:55:01.000000000 +0200
@@ -1 +1 @@
-manpage-has-errors-from-man usr/share/man/man8/sbd.8.gz 185: warning [p 1, 8.7i]: can't break line
+manpage-has-errors-from-man usr/share/man/man8/sbd.8.gz 189: warning [p 1, 8.7i]: can't break line
diff -Nru sbd-1.4.0/debian/tests/control sbd-1.4.0-18-g5e3283c/debian/tests/control
--- sbd-1.4.0/debian/tests/control 2019-01-15 09:12:00.000000000 +0100
+++ sbd-1.4.0-18-g5e3283c/debian/tests/control 2019-05-08 10:55:28.000000000 +0200
@@ -14,10 +14,10 @@
Restrictions: needs-root, allow-stderr, isolation-machine
Tests: regression
-Depends: @, pacemaker, crmsh
+Depends: @
Restrictions: needs-root, isolation-machine, breaks-testbed
Tests: fence-external
-Depends: @, pacemaker, crmsh, fence-agents
+Depends: @
Restrictions: needs-root, isolation-machine, breaks-testbed
Tests: fence-agents
diff -Nru sbd-1.4.0/debian/tests/fence-agents sbd-1.4.0-18-g5e3283c/debian/tests/fence-agents
--- sbd-1.4.0/debian/tests/fence-agents 2019-01-15 09:12:00.000000000 +0100
+++ sbd-1.4.0-18-g5e3283c/debian/tests/fence-agents 2019-05-08 10:55:28.000000000 +0200
@@ -14,21 +14,24 @@
LOOP=$(losetup --find --show $DISK)
echo "=== create ==="
+hostname node1 # must match corosync for fence to work
sbd -d $LOOP create
-echo "SBD_OPTS='-d $LOOP -W -W'" > /etc/default/sbd
+sed -i "s|^#\\?\\(SBD_DEVICE=\\).*|\\1$LOOP|" /etc/default/sbd
+sed -i "s|^\\(SBD_WATCHDOG_DEV=\\).*|\\1/dev/null|" /etc/default/sbd
echo "=== cluster ==="
-service corosync start
-service pacemaker start
-sleep 60
+apt-get --yes --quiet install pacemaker crmsh fence-agents
service sbd status
-crm status
-echo "=== crm ==="
-HOSTNAME=$(uname -n)
-crm configure primitive sbd stonith:fence_sbd params devices=$LOOP plug=$HOSTNAME sbd_path=/usr/sbin/sbd
+echo -n "Waiting for cluster to start... "
+for x in `seq 60 -1 1`; do echo -n "$x "; sleep 1; done; echo
+crm configure primitive sbd stonith:fence_sbd params devices=$LOOP plug=node1 sbd_path=/usr/sbin/sbd
crm configure show
+echo -n "Waiting for resource to start... "
+for x in `seq 10 -1 1`; do echo -n "$x "; sleep 1; done; echo
+crm status
+
echo "=== fence ==="
/tmp/autopkgtest-reboot-prepare fenced
-crm --force node fence $HOSTNAME
+crm --force node fence node1
diff -Nru sbd-1.4.0/debian/tests/fence-external sbd-1.4.0-18-g5e3283c/debian/tests/fence-external
--- sbd-1.4.0/debian/tests/fence-external 2019-01-15 09:12:00.000000000 +0100
+++ sbd-1.4.0-18-g5e3283c/debian/tests/fence-external 2019-05-08 10:55:28.000000000 +0200
@@ -14,20 +14,24 @@
LOOP=$(losetup --find --show $DISK)
echo "=== create ==="
+hostname node1 # must match corosync for fence to work
sbd -d $LOOP create
-echo "SBD_OPTS='-d $LOOP -W -W'" > /etc/default/sbd
+sed -i "s|^#\\?\\(SBD_DEVICE=\\).*|\\1$LOOP|" /etc/default/sbd
+sed -i "s|^\\(SBD_WATCHDOG_DEV=\\).*|\\1/dev/null|" /etc/default/sbd
echo "=== cluster ==="
-service corosync start
-service pacemaker start
-sleep 60
+apt-get --yes --quiet install pacemaker crmsh
service sbd status
-crm status
-echo "=== crm ==="
+echo -n "Waiting for cluster to start... "
+for x in `seq 60 -1 1`; do echo -n "$x "; sleep 1; done; echo
crm configure primitive sbd stonith:external/sbd params sbd_device=$LOOP
crm configure show
+echo -n "Waiting for resource to start... "
+for x in `seq 10 -1 1`; do echo -n "$x "; sleep 1; done; echo
+crm status
+
echo "=== fence ==="
/tmp/autopkgtest-reboot-prepare fenced
-crm --force node fence $(uname -n)
+crm --force node fence node1
diff -Nru sbd-1.4.0/man/sbd.8.pod sbd-1.4.0-18-g5e3283c/man/sbd.8.pod
--- sbd-1.4.0/man/sbd.8.pod 2019-01-14 14:27:27.000000000 +0100
+++ sbd-1.4.0-18-g5e3283c/man/sbd.8.pod 2019-04-16 14:38:22.000000000 +0200
@@ -493,7 +493,7 @@
introduce an additional single point of failure then.
If the SBD device is not accessible, the daemon will fail to start and
-inhibit openais startup.
+inhibit startup of cluster services.
=item Two devices
diff -Nru sbd-1.4.0/src/sbd-cluster.c sbd-1.4.0-18-g5e3283c/src/sbd-cluster.c
--- sbd-1.4.0/src/sbd-cluster.c 2019-01-14 14:27:27.000000000 +0100
+++ sbd-1.4.0-18-g5e3283c/src/sbd-cluster.c 2019-04-16 14:38:22.000000000 +0200
@@ -174,6 +174,25 @@
return TRUE;
}
+static void
+cmap_destroy(void)
+{
+ if (cmap_source) {
+ g_source_destroy(cmap_source);
+ cmap_source = NULL;
+ }
+
+ if (track_handle) {
+ cmap_track_delete(cmap_handle, track_handle);
+ track_handle = 0;
+ }
+
+ if (cmap_handle) {
+ cmap_finalize(cmap_handle);
+ cmap_handle = 0;
+ }
+}
+
static gboolean
sbd_get_two_node(void)
{
@@ -217,18 +236,7 @@
return TRUE;
out:
- if (cmap_source) {
- g_source_destroy(cmap_source);
- cmap_source = NULL;
- }
- if (track_handle) {
- cmap_track_delete(cmap_handle, track_handle);
- track_handle = 0;
- }
- if (cmap_handle) {
- cmap_finalize(cmap_handle);
- cmap_handle = 0;
- }
+ cmap_destroy();
return FALSE;
}
@@ -327,6 +335,12 @@
{
cl_log(LOG_WARNING, "Lost connection to %s", name_for_cluster_type(get_cluster_type()));
+ if (get_cluster_type() != pcmk_cluster_unknown) {
+#if SUPPORT_COROSYNC && CHECK_TWO_NODE
+ cmap_destroy();
+#endif
+ }
+
set_servant_health(pcmk_health_unclean, LOG_ERR, "Cluster connection terminated");
notify_parent();
diff -Nru sbd-1.4.0/src/sbd-common.c sbd-1.4.0-18-g5e3283c/src/sbd-common.c
--- sbd-1.4.0/src/sbd-common.c 2019-01-14 14:27:27.000000000 +0100
+++ sbd-1.4.0-18-g5e3283c/src/sbd-common.c 2019-04-16 14:38:22.000000000 +0200
@@ -568,13 +568,13 @@
#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK)
#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data)
-static unsigned char
+static void
sbd_stack_hogger(unsigned char * inbuf, int kbytes)
{
unsigned char buf[1024];
if(kbytes <= 0) {
- return HOG_CHAR;
+ return;
}
if (inbuf == NULL) {
@@ -584,10 +584,10 @@
}
if (kbytes > 0) {
- return sbd_stack_hogger(buf, kbytes-1);
- } else {
- return buf[sizeof(buf)-1];
+ sbd_stack_hogger(buf, kbytes-1);
}
+
+ return;
}
static void
diff -Nru sbd-1.4.0/src/sbd.h sbd-1.4.0-18-g5e3283c/src/sbd.h
--- sbd-1.4.0/src/sbd.h 2019-01-14 14:27:27.000000000 +0100
+++ sbd-1.4.0-18-g5e3283c/src/sbd.h 2019-04-16 14:38:22.000000000 +0200
@@ -54,10 +54,13 @@
/* FIXME: should add dynamic check of SIG_XX >= SIGRTMAX */
/* exit status for disk-servant */
-#define EXIT_MD_IO_FAIL 20
-#define EXIT_MD_REQUEST_RESET 21
-#define EXIT_MD_REQUEST_SHUTOFF 22
-#define EXIT_MD_REQUEST_CRASHDUMP 23
+#define EXIT_MD_SERVANT_IO_FAIL 20
+#define EXIT_MD_SERVANT_REQUEST_RESET 21
+#define EXIT_MD_SERVANT_REQUEST_SHUTOFF 22
+#define EXIT_MD_SERVANT_REQUEST_CRASHDUMP 23
+
+/* exit status for pcmk-servant */
+#define EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN 30
#define HOG_CHAR 0xff
#define SECTOR_NAME_MAX 63
@@ -175,7 +178,7 @@
int dump_headers(struct servants_list_item *servants);
unsigned long get_first_msgwait(struct servants_list_item *servants);
int messenger(const char *name, const char *msg, struct servants_list_item *servants);
-int servant(const char *diskname, int mode, const void* argp);
+int servant_md(const char *diskname, int mode, const void* argp);
#endif
int servant_pcmk(const char *diskname, int mode, const void* argp);
diff -Nru sbd-1.4.0/src/sbd-inquisitor.c sbd-1.4.0-18-g5e3283c/src/sbd-inquisitor.c
--- sbd-1.4.0/src/sbd-inquisitor.c 2019-01-14 14:27:27.000000000 +0100
+++ sbd-1.4.0-18-g5e3283c/src/sbd-inquisitor.c 2019-04-16 14:38:22.000000000 +0200
@@ -42,19 +42,36 @@
struct servants_list_item *newbie;
if (lookup_servant_by_dev(devname)) {
- cl_log(LOG_DEBUG, "Servant %s already exists", devname);
- return;
+ cl_log(LOG_DEBUG, "Servant %s already exists", devname);
+ return;
}
newbie = malloc(sizeof(*newbie));
- if (!newbie) {
- fprintf(stderr, "malloc failed in recruit_servant.\n");
- exit(1);
+ if (newbie) {
+ memset(newbie, 0, sizeof(*newbie));
+ newbie->devname = strdup(devname);
+ newbie->pid = pid;
+ newbie->first_start = 1;
+ }
+ if (!newbie || !newbie->devname) {
+ fprintf(stderr, "heap allocation failed in recruit_servant.\n");
+ exit(1);
+ }
+
+ /* some sanity-check on our newbie */
+ if (sbd_is_disk(newbie)) {
+ cl_log(LOG_INFO, "Monitoring %s", devname);
+ disk_count++;
+ } else if (sbd_is_pcmk(newbie) || sbd_is_cluster(newbie)) {
+ /* alive just after pcmk and cluster servants have shown up */
+ newbie->outdated = 1;
+ } else {
+ /* toss our newbie */
+ cl_log(LOG_ERR, "Refusing to recruit unrecognized servant %s", devname);
+ free((void *) newbie->devname);
+ free(newbie);
+ return;
}
- memset(newbie, 0, sizeof(*newbie));
- newbie->devname = strdup(devname);
- newbie->pid = pid;
- newbie->first_start = 1;
if (!s) {
servants_leader = newbie;
@@ -65,12 +82,6 @@
}
servant_count++;
- if(sbd_is_disk(newbie)) {
- cl_log(LOG_INFO, "Monitoring %s", devname);
- disk_count++;
- } else {
- newbie->outdated = 1;
- }
}
int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp)
@@ -148,7 +159,7 @@
if (sbd_is_disk(s)) {
#if SUPPORT_SHARED_DISK
DBGLOG(LOG_INFO, "Starting servant for device %s", s->devname);
- s->pid = assign_servant(s->devname, servant, start_mode, s);
+ s->pid = assign_servant(s->devname, servant_md, start_mode, s);
#else
cl_log(LOG_ERR, "Shared disk functionality not supported");
return;
@@ -479,19 +490,19 @@
if (sbd_is_disk(s)) {
if (WIFEXITED(status)) {
switch(WEXITSTATUS(status)) {
- case EXIT_MD_IO_FAIL:
+ case EXIT_MD_SERVANT_IO_FAIL:
DBGLOG(LOG_INFO, "Servant for %s requests to be disowned",
s->devname);
break;
- case EXIT_MD_REQUEST_RESET:
+ case EXIT_MD_SERVANT_REQUEST_RESET:
cl_log(LOG_WARNING, "%s requested a reset", s->devname);
do_reset();
break;
- case EXIT_MD_REQUEST_SHUTOFF:
+ case EXIT_MD_SERVANT_REQUEST_SHUTOFF:
cl_log(LOG_WARNING, "%s requested a shutoff", s->devname);
do_off();
break;
- case EXIT_MD_REQUEST_CRASHDUMP:
+ case EXIT_MD_SERVANT_REQUEST_CRASHDUMP:
cl_log(LOG_WARNING, "%s requested a crashdump", s->devname);
do_crashdump();
break;
@@ -499,6 +510,22 @@
break;
}
}
+ } else if (sbd_is_pcmk(s)) {
+ if (WIFEXITED(status)) {
+ switch(WEXITSTATUS(status)) {
+ case EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN:
+ DBGLOG(LOG_INFO, "PCMK-Servant has exited gracefully");
+ /* revert to state prior to pacemaker-detection */
+ s->restarts = 0;
+ s->restart_blocked = 0;
+ cluster_appeared = 0;
+ s->outdated = 1;
+ s->t_last.tv_sec = 0;
+ break;
+ default:
+ break;
+ }
+ }
}
cleanup_servant_by_pid(pid);
}
@@ -753,54 +780,56 @@
int
parse_device_line(const char *line)
{
- int lpc = 0;
- int last = 0;
- int max = 0;
+ size_t lpc = 0;
+ size_t last = 0;
+ size_t max = 0;
int found = 0;
+ bool skip_space = true;
+ int space_run = 0;
- if(line) {
- max = strlen(line);
+ if (!line) {
+ return 0;
}
- if (max <= 0) {
- return found;
- }
+ max = strlen(line);
- cl_log(LOG_DEBUG, "Processing %d bytes: [%s]", max, line);
- /* Skip initial whitespace */
- for (lpc = 0; lpc <= max && isspace(line[lpc]); lpc++) {
- last = lpc + 1;
- }
+ cl_log(LOG_DEBUG, "Processing %d bytes: [%s]", (int) max, line);
- /* Now the actual content */
for (lpc = 0; lpc <= max; lpc++) {
- int a_space = isspace(line[lpc]);
-
- if (a_space && lpc < max && isspace(line[lpc + 1])) {
- /* fast-forward to the end of the spaces */
-
- } else if (a_space || line[lpc] == ';' || line[lpc] == 0) {
- int rc = 1;
- char *entry = NULL;
+ if (isspace(line[lpc])) {
+ if (skip_space) {
+ last = lpc + 1;
+ } else {
+ space_run++;
+ }
+ continue;
+ }
+ skip_space = false;
+ if (line[lpc] == ';' || line[lpc] == 0) {
+ int rc = 0;
+ char *entry = calloc(1, 1 + lpc - last);
- if (lpc > last) {
- entry = calloc(1, 1 + lpc - last);
+ if (entry) {
rc = sscanf(line + last, "%[^;]", entry);
+ } else {
+ fprintf(stderr, "Heap allocation failed parsing device-line.\n");
+ exit(1);
}
- if (entry == NULL) {
- /* Skip */
- } else if (rc != 1) {
- cl_log(LOG_WARNING, "Could not parse (%d %d): %s", last, lpc, line + last);
+ if (rc != 1) {
+ cl_log(LOG_WARNING, "Could not parse: '%s'", line + last);
} else {
+ entry[strlen(entry)-space_run] = '\0';
cl_log(LOG_DEBUG, "Adding '%s'", entry);
recruit_servant(entry, 0);
found++;
}
free(entry);
+ skip_space = true;
last = lpc + 1;
}
+ space_run = 0;
}
return found;
}
@@ -861,7 +890,7 @@
int devices = parse_device_line(value);
if(devices < 1) {
fprintf(stderr, "Invalid device line: %s\n", value);
- exit_status = -2;
+ exit_status = -2;
goto out;
}
#else
@@ -1059,7 +1088,8 @@
break;
case 'h':
usage();
- return (0);
+ goto out;
+ break;
default:
exit_status = -2;
goto out;
@@ -1212,6 +1242,9 @@
}
out:
+ if (timeout_action) {
+ free(timeout_action);
+ }
if (exit_status < 0) {
if (exit_status == -2) {
usage();
diff -Nru sbd-1.4.0/src/sbd-md.c sbd-1.4.0-18-g5e3283c/src/sbd-md.c
--- sbd-1.4.0/src/sbd-md.c 2019-01-14 14:27:27.000000000 +0100
+++ sbd-1.4.0-18-g5e3283c/src/sbd-md.c 2019-04-16 14:38:22.000000000 +0200
@@ -162,9 +162,9 @@
memset(&st->io, 0, sizeof(struct iocb));
if (rw) {
- io_prep_pwrite(&st->io, st->devfd, data, sector_size, sector_size * sector);
+ io_prep_pwrite(&st->io, st->devfd, data, sector_size, (long long) sector_size * sector);
} else {
- io_prep_pread(&st->io, st->devfd, data, sector_size, sector_size * sector);
+ io_prep_pread(&st->io, st->devfd, data, sector_size, (long long) sector_size * sector);
}
if (io_submit(st->ioctx, 1, ios) != 1) {
@@ -373,7 +373,6 @@
struct sector_header_s *s_header;
struct sector_node_s *s_node;
struct sector_mbox_s *s_mbox;
- struct stat s;
char uuid[37];
int i;
int rc = 0;
@@ -394,10 +393,6 @@
uuid_generate(s_header->uuid);
uuid_unparse_lower(s_header->uuid, uuid);
- fstat(st->devfd, &s);
- /* printf("st_size = %ld, st_blksize = %ld, st_blocks = %ld\n",
- s.st_size, s.st_blksize, s.st_blocks); */
-
cl_log(LOG_INFO, "Creating version %d.%d header on device %d (uuid: %s)",
s_header->version, s_header->minor_version,
st->devfd, uuid);
@@ -1031,7 +1026,7 @@
return 0;
}
-int servant(const char *diskname, int mode, const void* argp)
+int servant_md(const char *diskname, int mode, const void* argp)
{
struct sector_mbox_s *s_mbox = NULL;
struct sector_node_s *s_node = NULL;
@@ -1046,11 +1041,6 @@
char uuid[37];
const struct servants_list_item *s = argp;
- if (!diskname) {
- cl_log(LOG_ERR, "Empty disk name %s.", diskname);
- return -1;
- }
-
cl_log(LOG_INFO, "Servant starting for device %s", diskname);
/* Block most of the signals */
@@ -1066,19 +1056,19 @@
st = open_device(diskname, LOG_WARNING);
if (!st) {
- exit(EXIT_MD_IO_FAIL);
+ exit(EXIT_MD_SERVANT_IO_FAIL);
}
s_header = header_get(st);
if (!s_header) {
cl_log(LOG_ERR, "Not a valid header on %s", diskname);
- exit(EXIT_MD_IO_FAIL);
+ exit(EXIT_MD_SERVANT_IO_FAIL);
}
if (servant_check_timeout_inconsistent(s_header) < 0) {
cl_log(LOG_ERR, "Timeouts on %s do not match first device",
diskname);
- exit(EXIT_MD_IO_FAIL);
+ exit(EXIT_MD_SERVANT_IO_FAIL);
}
if (s_header->minor_version > 0) {
@@ -1091,14 +1081,14 @@
cl_log(LOG_ERR,
"No slot allocated, and automatic allocation failed for disk %s.",
diskname);
- rc = EXIT_MD_IO_FAIL;
+ rc = EXIT_MD_SERVANT_IO_FAIL;
goto out;
}
s_node = sector_alloc();
if (slot_read(st, mbox, s_node) < 0) {
cl_log(LOG_ERR, "Unable to read node entry on %s",
diskname);
- exit(EXIT_MD_IO_FAIL);
+ exit(EXIT_MD_SERVANT_IO_FAIL);
}
cl_log(LOG_NOTICE, "Monitoring slot %d on disk %s", mbox, diskname);
@@ -1114,7 +1104,7 @@
if (mode > 0) {
if (mbox_read(st, mbox, s_mbox) < 0) {
cl_log(LOG_ERR, "mbox read failed during start-up in servant.");
- rc = EXIT_MD_IO_FAIL;
+ rc = EXIT_MD_SERVANT_IO_FAIL;
goto out;
}
if (s_mbox->cmd != SBD_MSG_EXIT &&
@@ -1130,7 +1120,7 @@
DBGLOG(LOG_INFO, "First servant start - zeroing inbox");
memset(s_mbox, 0, sizeof(*s_mbox));
if (mbox_write(st, mbox, s_mbox) < 0) {
- rc = EXIT_MD_IO_FAIL;
+ rc = EXIT_MD_SERVANT_IO_FAIL;
goto out;
}
}
@@ -1159,28 +1149,28 @@
s_header_retry = header_get(st);
if (!s_header_retry) {
cl_log(LOG_ERR, "No longer found a valid header on %s", diskname);
- exit(EXIT_MD_IO_FAIL);
+ exit(EXIT_MD_SERVANT_IO_FAIL);
}
if (memcmp(s_header, s_header_retry, sizeof(*s_header)) != 0) {
cl_log(LOG_ERR, "Header on %s changed since start-up!", diskname);
- exit(EXIT_MD_IO_FAIL);
+ exit(EXIT_MD_SERVANT_IO_FAIL);
}
free(s_header_retry);
s_node_retry = sector_alloc();
if (slot_read(st, mbox, s_node_retry) < 0) {
cl_log(LOG_ERR, "slot read failed in servant.");
- exit(EXIT_MD_IO_FAIL);
+ exit(EXIT_MD_SERVANT_IO_FAIL);
}
if (memcmp(s_node, s_node_retry, sizeof(*s_node)) != 0) {
cl_log(LOG_ERR, "Node entry on %s changed since start-up!", diskname);
- exit(EXIT_MD_IO_FAIL);
+ exit(EXIT_MD_SERVANT_IO_FAIL);
}
free(s_node_retry);
if (mbox_read(st, mbox, s_mbox) < 0) {
cl_log(LOG_ERR, "mbox read failed in servant.");
- exit(EXIT_MD_IO_FAIL);
+ exit(EXIT_MD_SERVANT_IO_FAIL);
}
if (s_mbox->cmd > 0) {
@@ -1195,14 +1185,14 @@
sigqueue(ppid, SIG_TEST, signal_value);
break;
case SBD_MSG_RESET:
- exit(EXIT_MD_REQUEST_RESET);
+ exit(EXIT_MD_SERVANT_REQUEST_RESET);
case SBD_MSG_OFF:
- exit(EXIT_MD_REQUEST_SHUTOFF);
+ exit(EXIT_MD_SERVANT_REQUEST_SHUTOFF);
case SBD_MSG_EXIT:
sigqueue(ppid, SIG_EXITREQ, signal_value);
break;
case SBD_MSG_CRASHDUMP:
- exit(EXIT_MD_REQUEST_CRASHDUMP);
+ exit(EXIT_MD_SERVANT_REQUEST_CRASHDUMP);
default:
/* FIXME:
An "unknown" message might result
diff -Nru sbd-1.4.0/src/sbd-pacemaker.c sbd-1.4.0-18-g5e3283c/src/sbd-pacemaker.c
--- sbd-1.4.0/src/sbd-pacemaker.c 2019-01-14 14:27:27.000000000 +0100
+++ sbd-1.4.0-18-g5e3283c/src/sbd-pacemaker.c 2019-04-16 14:38:22.000000000 +0200
@@ -103,6 +103,9 @@
static long last_refresh = 0;
+static int pcmk_clean_shutdown = 0;
+static int pcmk_shutdown = 0;
+
static gboolean
mon_timer_reconnect(gpointer data)
{
@@ -128,10 +131,26 @@
{
if (cib) {
cib->cmds->signoff(cib);
+ /* retrigger as last one might have been skipped */
+ mon_refresh_state(NULL);
+ if (pcmk_clean_shutdown) {
+ /* assume a graceful pacemaker-shutdown */
+ clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN);
+ }
+ /* getting here we aren't sure about the pacemaker-state
+ so try to use the timeout to reconnect and get
+ everything sorted out again
+ */
+ pcmk_shutdown = 0;
set_servant_health(pcmk_health_transient, LOG_WARNING, "Disconnected from CIB");
timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL);
}
cib_connected = 0;
+ /* no sense in looking into outdated cib, trying to apply patch, ... */
+ if (current_cib) {
+ free_xml(current_cib);
+ current_cib = NULL;
+ }
return;
}
@@ -171,7 +190,7 @@
mon_timer_notify(gpointer data)
{
static int counter = 0;
- int counter_max = timeout_watchdog / timeout_loop;
+ int counter_max = timeout_watchdog / timeout_loop / 2;
if (timer_id_notify > 0) {
g_source_remove(timer_id_notify);
@@ -257,7 +276,7 @@
static int updates = 0;
static int ever_had_quorum = FALSE;
- node_t *node = pe_find_node(data_set->nodes, local_uname);
+ node_t *node = NULL;
updates++;
@@ -267,11 +286,15 @@
return;
}
+ node = pe_find_node(data_set->nodes, local_uname);
- if (node == NULL) {
+ if ((node == NULL) || (node->details == NULL)) {
set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: %s is UNKNOWN", local_uname);
+ notify_parent();
+ return;
+ }
- } else if (node->details->online == FALSE) {
+ if (node->details->online == FALSE) {
set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: OFFLINE");
} else if (node->details->unclean) {
@@ -280,11 +303,6 @@
} else if (node->details->pending) {
set_servant_health(pcmk_health_pending, LOG_WARNING, "Node state: pending");
-#if 0
- } else if (node->details->shutdown) {
- set_servant_health(pcmk_health_shutdown, LOG_WARNING, "Node state: shutting down");
-#endif
-
} else if (data_set->flags & pe_flag_have_quorum) {
set_servant_health(pcmk_health_online, LOG_INFO, "Node state: online");
ever_had_quorum = TRUE;
@@ -315,6 +333,12 @@
}
}
+ if (node->details->shutdown) {
+ pcmk_shutdown = 1;
+ }
+ if (pcmk_shutdown && !(node->details->running_rsc)) {
+ pcmk_clean_shutdown = 1;
+ }
notify_parent();
return;
}
@@ -339,7 +363,7 @@
static mainloop_timer_t *refresh_timer = NULL;
if(refresh_timer == NULL) {
- refresh_timer = mainloop_timer_add("refresh", 2000, FALSE, mon_trigger_refresh, NULL);
+ refresh_timer = mainloop_timer_add("refresh", reconnect_msec, FALSE, mon_trigger_refresh, NULL);
refresh_trigger = mainloop_add_trigger(G_PRIORITY_LOW, mon_refresh_state, refresh_timer);
}
@@ -369,9 +393,9 @@
}
/* Refresh
- * - immediately if the last update was more than 5s ago
+ * - immediately if the last update was more than 1s ago
* - every 10 updates
- * - at most 2s after the last update
+ * - at most 1s after the last update
*/
if (updates > 10 || (now - last_refresh) > (reconnect_msec / 1000)) {
mon_refresh_state(refresh_timer);
diff -Nru sbd-1.4.0/src/sbd.sysconfig sbd-1.4.0-18-g5e3283c/src/sbd.sysconfig
--- sbd-1.4.0/src/sbd.sysconfig 2019-01-14 14:27:27.000000000 +0100
+++ sbd-1.4.0-18-g5e3283c/src/sbd.sysconfig 2019-04-16 14:38:22.000000000 +0200
@@ -68,6 +68,9 @@
# If your sbd device(s) reside on a multipath setup or iSCSI, this
# should be the time required to detect a path failure.
#
+# Be aware that watchdog timeout set in the on-disk metadata takes
+# precedence.
+#
SBD_WATCHDOG_TIMEOUT=5
## Type: string
Reply to: