[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

[Nbd] [PATCH] nbd: improve request timeouts handling (rebased)



[rebased against the master, no other changes]

The main idea behind it is to be able to quickly detect broken replica
and switch over to another when used with any sort of mirror type device
built on top of any number of nbd devices.

Before this change a request would time out causing the socket to be
shut down and the device to fail in case of a dead server or removed
network cable only if:

  a) either the timer around kernel_sendmsg() kicked in
  b) or the TCP failures on retransmission finally caused an error
     on the socket, likely blocked on kernel_recvmsg() at this time,
     waiting for replies from the server

Case a) depends mostly on the size of requests issued and on the maximum
size of the socket buffer -- a lot of read request headers or small
write requests could be "sent" without triggering the requested timeout

Case b) timeout is independent of nbd-client -t <timeout> option
as there is no TCP_USER_TIMEOUT set on the client socket by default.
And even if such timeout was set it would not solve the problem of
an nbd-client hung on receiving replies for much longer time without
setting TCP keep-alives... and that would be the third, independent
timeout setting required to make it work "almost" as expected...

So, instead, take the big hammer approach and:

  *) trace the number of outstanding requests sent to the server
     (nbd->inflight)
  *) enable the timer (nbd->req_timer) before the first request
     is submitted and leave it enabled
  *) when sending next request do not touch the timer (it is up
     to the receiving side to control it at this point)
  *) on receive side update the timer every time a response
     is collected but there are more to read from the server
  *) disable the timer whenever the inflight counter drops
     to zero or an error (leading to the socket shutdown)
     is returned

This patch does NOT prevent the server to process a request for longer
than the timeout specified if only it replies to any other request
submitted within the timeout (the server still may reply to a batch
of requests in any order).

Only the nbd->xmit_timeout != 0 code path is changed so the patch should
not affect nbd connections running without an explicit timeout set
on the nbd-client command line.

There is also no way to enable or disable the timeout on an active
(nbd->pid != 0) nbd device, it is however possible to change its value.
Otherwise the inflight request counter would have to affect the nbd
devices enabled without nbd-client -t <timeout>.

Also move nbd->pid modifications behind nbd->tx_lock wherever possible
to avoid races between the concurrent nbd-client invocations.

Signed-off-by: Michal Belczyk <belczyk@...1274...>
---
 drivers/block/nbd.c | 150 +++++++++++++++++++++++++++++++++++++++++-----------
 include/linux/nbd.h |   6 +++
 2 files changed, 125 insertions(+), 31 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 2dc3b51..a20d5eb 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -140,11 +140,24 @@ static void sock_shutdown(struct nbd_device *nbd, int lock)
 
 static void nbd_xmit_timeout(unsigned long arg)
 {
-	struct task_struct *task = (struct task_struct *)arg;
+	struct nbd_device *nbd = (struct nbd_device *)arg;
+	struct task_struct *task_ary[2];
+	unsigned long flags;
+	int i;
 
-	printk(KERN_WARNING "nbd: killing hung xmit (%s, pid: %d)\n",
-		task->comm, task->pid);
-	force_sig(SIGKILL, task);
+	spin_lock_irqsave(&nbd->timer_lock, flags);
+	nbd->timedout = 1;
+	task_ary[0] = nbd->sender;
+	task_ary[1] = nbd->receiver;
+	for (i = 0; i < 2; i++) {
+		if (task_ary[i] == NULL)
+			continue;
+		printk(KERN_WARNING "nbd: killing hung xmit (%s, pid: %d)\n",
+			task_ary[i]->comm, task_ary[i]->pid);
+		force_sig(SIGKILL, task_ary[i]);
+		break;
+	}
+	spin_unlock_irqrestore(&nbd->timer_lock, flags);
 }
 
 /*
@@ -158,7 +171,7 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
 	struct msghdr msg;
 	struct kvec iov;
 	sigset_t blocked, oldset;
-	unsigned long pflags = current->flags;
+	unsigned long flags, pflags = current->flags;
 
 	if (unlikely(!sock)) {
 		dev_err(disk_to_dev(nbd->disk),
@@ -183,23 +196,39 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
 		msg.msg_controllen = 0;
 		msg.msg_flags = msg_flags | MSG_NOSIGNAL;
 
-		if (send) {
-			struct timer_list ti;
-
-			if (nbd->xmit_timeout) {
-				init_timer(&ti);
-				ti.function = nbd_xmit_timeout;
-				ti.data = (unsigned long)current;
-				ti.expires = jiffies + nbd->xmit_timeout;
-				add_timer(&ti);
+		if (nbd->xmit_timeout) {
+			spin_lock_irqsave(&nbd->timer_lock, flags);
+			if (nbd->timedout) {
+				spin_unlock_irqrestore(&nbd->timer_lock, flags);
+				printk(KERN_WARNING
+					"nbd (pid %d: %s) timed out\n",
+					task_pid_nr(current), current->comm);
+				result = -EINTR;
+				sock_shutdown(nbd, !send);
+				break;
 			}
+			if (send)
+				nbd->sender = current;
+			else
+				nbd->receiver = current;
+			spin_unlock_irqrestore(&nbd->timer_lock, flags);
+		}
+
+		if (send)
 			result = kernel_sendmsg(sock, &msg, &iov, 1, size);
-			if (nbd->xmit_timeout)
-				del_timer_sync(&ti);
-		} else
+		else
 			result = kernel_recvmsg(sock, &msg, &iov, 1, size,
 						msg.msg_flags);
 
+		if (nbd->xmit_timeout) {
+			spin_lock_irqsave(&nbd->timer_lock, flags);
+			if (send)
+				nbd->sender = NULL;
+			else
+				nbd->receiver = NULL;
+			spin_unlock_irqrestore(&nbd->timer_lock, flags);
+		}
+
 		if (signal_pending(current)) {
 			siginfo_t info;
 			printk(KERN_WARNING "nbd (pid %d: %s) got signal %d\n",
@@ -226,12 +255,12 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
 }
 
 static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
-		int flags)
+		int msg_flags)
 {
 	int result;
 	void *kaddr = kmap(bvec->bv_page);
 	result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
-			   bvec->bv_len, flags);
+			   bvec->bv_len, msg_flags);
 	kunmap(bvec->bv_page);
 	return result;
 }
@@ -239,9 +268,9 @@ static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
 /* always call with the tx_lock held */
 static int nbd_send_req(struct nbd_device *nbd, struct request *req)
 {
-	int result, flags;
+	int result, msg_flags;
 	struct nbd_request request;
-	unsigned long size = blk_rq_bytes(req);
+	unsigned long flags, size = blk_rq_bytes(req);
 
 	request.magic = htonl(NBD_REQUEST_MAGIC);
 	request.type = htonl(nbd_cmd(req));
@@ -256,6 +285,19 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
 	}
 	memcpy(request.handle, &req, sizeof(req));
 
+	if (nbd->xmit_timeout) {
+		spin_lock_irqsave(&nbd->timer_lock, flags);
+		if (!nbd->inflight) {
+			nbd->req_timer.function = nbd_xmit_timeout;
+			nbd->req_timer.data = (unsigned long)nbd;
+			nbd->req_timer.expires = jiffies + nbd->xmit_timeout;
+			add_timer(&nbd->req_timer);
+		}
+		nbd->inflight++;
+		BUG_ON(nbd->inflight <= 0);
+		spin_unlock_irqrestore(&nbd->timer_lock, flags);
+	}
+
 	dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n",
 			nbd->disk->disk_name, req,
 			nbdcmd_to_ascii(nbd_cmd(req)),
@@ -277,12 +319,12 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
 		 * whether to set MSG_MORE or not...
 		 */
 		rq_for_each_segment(bvec, req, iter) {
-			flags = 0;
+			msg_flags = 0;
 			if (!rq_iter_last(req, iter))
-				flags = MSG_MORE;
+				msg_flags = MSG_MORE;
 			dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n",
 					nbd->disk->disk_name, req, bvec->bv_len);
-			result = sock_send_bvec(nbd, bvec, flags);
+			result = sock_send_bvec(nbd, bvec, msg_flags);
 			if (result <= 0) {
 				dev_err(disk_to_dev(nbd->disk),
 					"Send data failed (result %d)\n",
@@ -294,6 +336,14 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
 	return 0;
 
 error_out:
+	if (nbd->xmit_timeout) {
+		spin_lock_irqsave(&nbd->timer_lock, flags);
+		nbd->inflight--;
+		BUG_ON(nbd->inflight < 0);
+		if (!nbd->inflight)
+			del_timer_sync(&nbd->req_timer);
+		spin_unlock_irqrestore(&nbd->timer_lock, flags);
+	}
 	return -EIO;
 }
 
@@ -415,24 +465,41 @@ static struct device_attribute pid_attr = {
 static int nbd_do_it(struct nbd_device *nbd)
 {
 	struct request *req;
+	unsigned long flags;
 	int ret;
 
 	BUG_ON(nbd->magic != NBD_MAGIC);
 
 	sk_set_memalloc(nbd->sock->sk);
-	nbd->pid = task_pid_nr(current);
 	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
 	if (ret) {
 		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
-		nbd->pid = 0;
 		return ret;
 	}
 
-	while ((req = nbd_read_stat(nbd)) != NULL)
-		nbd_end_request(req);
+	for (;;) {
+		req = nbd_read_stat(nbd);
+		if (nbd->xmit_timeout) {
+			spin_lock_irqsave(&nbd->timer_lock, flags);
+			if (req != NULL) {
+				nbd->inflight--;
+				BUG_ON(nbd->inflight < 0);
+			}
+			if (req != NULL && nbd->inflight)
+				mod_timer(&nbd->req_timer,
+					  jiffies + nbd->xmit_timeout);
+			else
+				del_timer_sync(&nbd->req_timer);
+			spin_unlock_irqrestore(&nbd->timer_lock, flags);
+		}
+		if (req != NULL) {
+			nbd_end_request(req);
+			continue;
+		}
+		break;
+	}
 
 	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
-	nbd->pid = 0;
 	return 0;
 }
 
@@ -680,9 +747,20 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		set_capacity(nbd->disk, nbd->bytesize >> 9);
 		return 0;
 
-	case NBD_SET_TIMEOUT:
-		nbd->xmit_timeout = arg * HZ;
+	case NBD_SET_TIMEOUT: {
+		int xt;
+
+		xt = arg * HZ;
+		if (xt < 0)
+			return -EINVAL;
+		if (nbd->pid &&
+		    ((!nbd->xmit_timeout && xt) || (nbd->xmit_timeout && !xt)))
+			return -EBUSY;
+		dev_info(disk_to_dev(nbd->disk), "NBD_SET_TIMEOUT: %d -> %d\n",
+			nbd->xmit_timeout / HZ, xt / HZ);
+		nbd->xmit_timeout = xt;
 		return 0;
+	}
 
 	case NBD_SET_FLAGS:
 		nbd->flags = arg;
@@ -705,6 +783,11 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		if (!nbd->file)
 			return -EINVAL;
 
+		nbd->pid = task_pid_nr(current);
+		nbd->inflight = 0;
+		nbd->timedout = 0;
+		nbd->sender = NULL;
+		nbd->receiver = NULL;
 		mutex_unlock(&nbd->tx_lock);
 
 		if (nbd->flags & NBD_FLAG_READ_ONLY)
@@ -721,6 +804,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 					nbd->disk->disk_name);
 		if (IS_ERR(thread)) {
 			mutex_lock(&nbd->tx_lock);
+			nbd->pid = 0;
 			return PTR_ERR(thread);
 		}
 		wake_up_process(thread);
@@ -728,6 +812,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		kthread_stop(thread);
 
 		mutex_lock(&nbd->tx_lock);
+		nbd->pid = 0;
 		if (error)
 			return error;
 		sock_shutdown(nbd, 0);
@@ -742,6 +827,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 			fput(file);
 		nbd->flags = 0;
 		nbd->bytesize = 0;
+		nbd->xmit_timeout = 0;
 		bdev->bd_inode->i_size = 0;
 		set_capacity(nbd->disk, 0);
 		if (max_part > 0)
@@ -886,6 +972,8 @@ static int __init nbd_init(void)
 		init_waitqueue_head(&nbd_dev[i].waiting_wq);
 		nbd_dev[i].blksize = 1024;
 		nbd_dev[i].bytesize = 0;
+		spin_lock_init(&nbd_dev[i].timer_lock);
+		init_timer(&nbd_dev[i].req_timer);
 		disk->major = NBD_MAJOR;
 		disk->first_minor = i << part_shift;
 		disk->fops = &nbd_fops;
diff --git a/include/linux/nbd.h b/include/linux/nbd.h
index ae4981e..56238ef 100644
--- a/include/linux/nbd.h
+++ b/include/linux/nbd.h
@@ -42,6 +42,12 @@ struct nbd_device {
 	pid_t pid; /* pid of nbd-client, if attached */
 	int xmit_timeout;
 	int disconnect; /* a disconnect has been requested by user */
+	spinlock_t timer_lock;
+	struct timer_list req_timer;
+	int inflight;
+	int timedout;
+	struct task_struct *sender;
+	struct task_struct *receiver;
 };
 
 #endif
-- 
1.8.1.4


-- 
Michal Belczyk Sr.



Reply to: