Hi Michal,
First of all, thanks for resending this. It works for my test setup but
I have some comments inline below.
On Thu, Feb 26, 2015 at 05:37:12PM +0100, Michal Belczyk wrote:
> The main idea behind it is to be able to quickly detect broken replica
> and switch over to another when used with any sort of mirror type device
> built on top of any number of nbd devices.
>
> Before this change a request would time out causing the socket to be
> shut down and the device to fail in case of a dead server or removed
> network cable only if:
>
> a) either the timer around kernel_sendmsg() kicked in
> b) or the TCP failures on retransmission finally caused an error
> on the socket, likely blocked on kernel_recvmsg() at this time,
> waiting for replies from the server
>
> Case a) depends mostly on the size of requests issued and on the maximum
> size of the socket buffer -- a lot of read request headers or small
> write requests could be "sent" without triggering the requested timeout
>
> Case b) timeout is independent of nbd-client -t <timeout> option
> as there is no TCP_USER_TIMEOUT set on the client socket by default.
> And even if such timeout was set it would not solve the problem of
> an nbd-client hung on receiving replies for much longer time without
> setting TCP keep-alives... and that would be the third, independent
> timeout setting required to make it work "almost" as expected...
>
> So, instead, take the big hammer approach and:
>
> *) trace the number of outstanding requests sent to the server
> (nbd->inflight)
> *) enable the timer (nbd->req_timer) before the first request
> is submitted and leave it enabled
> *) when sending next request do not touch the timer (it is up
> to the receiving side to control it at this point)
> *) on receive side update the timer every time a response
> is collected but there are more to read from the server
> *) disable the timer whenever the inflight counter drops
> to zero or an error (leading to the socket shutdown)
> is returned
>
> This patch does NOT prevent the server to process a request for longer
> than the timeout specified if only it replies to any other request
> submitted within the timeout (the server still may reply to a batch
> of requests in any order).
>
> Only the nbd->xmit_timeout != 0 code path is changed so the patch should
> not affect nbd connections running without an explicit timeout set
> on the nbd-client command line.
>
> There is also no way to enable or disable the timeout on an active
> (nbd->pid != 0) nbd device, it is however possible to change its value.
> Otherwise the inflight request counter would have to affect the nbd
> devices enabled without nbd-client -t <timeout>.
>
> Also move nbd->pid modifications behind nbd->tx_lock wherever possible
> to avoid races between the concurrent nbd-client invocations.
This commit message is really good.
>
> Signed-off-by: Michal Belczyk <belczyk@...1274...>
> ---
> drivers/block/nbd.c | 150 +++++++++++++++++++++++++++++++++++++++++-----------
> include/linux/nbd.h | 6 +++
> 2 files changed, 125 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
> index 4bc2a5c..cc4a98a 100644
> --- a/drivers/block/nbd.c
> +++ b/drivers/block/nbd.c
> @@ -140,11 +140,24 @@ static void sock_shutdown(struct nbd_device *nbd, int lock)
>
> static void nbd_xmit_timeout(unsigned long arg)
> {
> - struct task_struct *task = (struct task_struct *)arg;
> + struct nbd_device *nbd = (struct nbd_device *)arg;
> + struct task_struct *task_ary[2];
> + unsigned long flags;
> + int i;
>
> - printk(KERN_WARNING "nbd: killing hung xmit (%s, pid: %d)\n",
> - task->comm, task->pid);
> - force_sig(SIGKILL, task);
> + spin_lock_irqsave(&nbd->timer_lock, flags);
> + nbd->timedout = 1;
> + task_ary[0] = nbd->sender;
> + task_ary[1] = nbd->receiver;
> + for (i = 0; i < 2; i++) {
> + if (task_ary[i] == NULL)
> + continue;
> + printk(KERN_WARNING "nbd: killing hung xmit (%s, pid: %d)\n",
> + task_ary[i]->comm, task_ary[i]->pid);
> + force_sig(SIGKILL, task_ary[i]);
> + break;
> + }
> + spin_unlock_irqrestore(&nbd->timer_lock, flags);
> }
>
> /*
> @@ -158,7 +171,7 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
> struct msghdr msg;
> struct kvec iov;
> sigset_t blocked, oldset;
> - unsigned long pflags = current->flags;
> + unsigned long flags, pflags = current->flags;
I would prefer variable declerations in seperate lines, especially if
there is an assignment involved, for readability.
>
> if (unlikely(!sock)) {
> dev_err(disk_to_dev(nbd->disk),
> @@ -183,23 +196,39 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
> msg.msg_controllen = 0;
> msg.msg_flags = msg_flags | MSG_NOSIGNAL;
>
> - if (send) {
> - struct timer_list ti;
> -
> - if (nbd->xmit_timeout) {
> - init_timer(&ti);
> - ti.function = nbd_xmit_timeout;
> - ti.data = (unsigned long)current;
> - ti.expires = jiffies + nbd->xmit_timeout;
> - add_timer(&ti);
> + if (nbd->xmit_timeout) {
> + spin_lock_irqsave(&nbd->timer_lock, flags);
> + if (nbd->timedout) {
> + spin_unlock_irqrestore(&nbd->timer_lock, flags);
> + printk(KERN_WARNING
> + "nbd (pid %d: %s) timed out\n",
> + task_pid_nr(current), current->comm);
> + result = -EINTR;
> + sock_shutdown(nbd, !send);
> + break;
I am not a big fan of having this block with a sock_shutdown at two
points in sock_xmit, here as reaction on nbd->timedout and below as
handler of the signal received.
> }
> + if (send)
> + nbd->sender = current;
> + else
> + nbd->receiver = current;
> + spin_unlock_irqrestore(&nbd->timer_lock, flags);
> + }
> +
What happens if the timer interrupt is happens right at this point?
The other process may just fall out of the kernel_send/recvmsg() call
through the signal and call into sock_shutdown(). There it would wait
for the tx_lock (assuming it is receiving). At the same time this
process would continue into kernel_sendmsg. There is no active timer
anymore and the other process is waiting in the lock.
Is there something I am missing or may this be a problem?
> + if (send)
> result = kernel_sendmsg(sock, &msg, &iov, 1, size);
> - if (nbd->xmit_timeout)
> - del_timer_sync(&ti);
> - } else
> + else
> result = kernel_recvmsg(sock, &msg, &iov, 1, size,
> msg.msg_flags);
>
> + if (nbd->xmit_timeout) {
> + spin_lock_irqsave(&nbd->timer_lock, flags);
> + if (send)
> + nbd->sender = NULL;
> + else
> + nbd->receiver = NULL;
> + spin_unlock_irqrestore(&nbd->timer_lock, flags);
> + }
> +
> if (signal_pending(current)) {
> siginfo_t info;
> printk(KERN_WARNING "nbd (pid %d: %s) got signal %d\n",
> @@ -226,12 +255,12 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
> }
>
> static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
> - int flags)
> + int msg_flags)
> {
> int result;
> void *kaddr = kmap(bvec->bv_page);
> result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
> - bvec->bv_len, flags);
> + bvec->bv_len, msg_flags);
This renaming from flags to msg_flags seems unrelated. I also don't see
why this is necessary. Please split this into a separate patch if it is
necessary.
> kunmap(bvec->bv_page);
> return result;
> }
> @@ -239,9 +268,9 @@ static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
> /* always call with the tx_lock held */
> static int nbd_send_req(struct nbd_device *nbd, struct request *req)
> {
> - int result, flags;
> + int result, msg_flags;
Same as above, please rename msg_flags in a separate patch.
Perhaps 'irq_flags' for the new 'flags' variable would be better?
> struct nbd_request request;
> - unsigned long size = blk_rq_bytes(req);
> + unsigned long flags, size = blk_rq_bytes(req);
>
> memset(&request, 0, sizeof(request));
> request.magic = htonl(NBD_REQUEST_MAGIC);
> @@ -253,6 +282,19 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
> }
> memcpy(request.handle, &req, sizeof(req));
>
> + if (nbd->xmit_timeout) {
> + spin_lock_irqsave(&nbd->timer_lock, flags);
> + if (!nbd->inflight) {
> + nbd->req_timer.function = nbd_xmit_timeout;
> + nbd->req_timer.data = (unsigned long)nbd;
Isn't it possible to assign the above two at initialization?
> + nbd->req_timer.expires = jiffies + nbd->xmit_timeout;
> + add_timer(&nbd->req_timer);
> + }
> + nbd->inflight++;
> + BUG_ON(nbd->inflight <= 0);
> + spin_unlock_irqrestore(&nbd->timer_lock, flags);
> + }
> +
> dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n",
> nbd->disk->disk_name, req,
> nbdcmd_to_ascii(nbd_cmd(req)),
> @@ -274,12 +316,12 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
> * whether to set MSG_MORE or not...
> */
> rq_for_each_segment(bvec, req, iter) {
> - flags = 0;
> + msg_flags = 0;
> if (!rq_iter_last(bvec, iter))
> - flags = MSG_MORE;
> + msg_flags = MSG_MORE;
> dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n",
> nbd->disk->disk_name, req, bvec.bv_len);
> - result = sock_send_bvec(nbd, &bvec, flags);
> + result = sock_send_bvec(nbd, &bvec, msg_flags);
> if (result <= 0) {
> dev_err(disk_to_dev(nbd->disk),
> "Send data failed (result %d)\n",
> @@ -291,6 +333,14 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
> return 0;
>
> error_out:
> + if (nbd->xmit_timeout) {
> + spin_lock_irqsave(&nbd->timer_lock, flags);
> + nbd->inflight--;
> + BUG_ON(nbd->inflight < 0);
nbd->inflight was checked above already with BUG_ON().
> + if (!nbd->inflight)
> + del_timer_sync(&nbd->req_timer);
> + spin_unlock_irqrestore(&nbd->timer_lock, flags);
> + }
> return -EIO;
> }
>
> @@ -412,24 +462,41 @@ static struct device_attribute pid_attr = {
> static int nbd_do_it(struct nbd_device *nbd)
> {
> struct request *req;
> + unsigned long flags;
> int ret;
>
> BUG_ON(nbd->magic != NBD_MAGIC);
>
> sk_set_memalloc(nbd->sock->sk);
> - nbd->pid = task_pid_nr(current);
> ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
> if (ret) {
> dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
> - nbd->pid = 0;
> return ret;
> }
>
> - while ((req = nbd_read_stat(nbd)) != NULL)
> - nbd_end_request(req);
> + for (;;) {
> + req = nbd_read_stat(nbd);
> + if (nbd->xmit_timeout) {
> + spin_lock_irqsave(&nbd->timer_lock, flags);
> + if (req != NULL) {
> + nbd->inflight--;
> + BUG_ON(nbd->inflight < 0);
> + }
> + if (req != NULL && nbd->inflight)
> + mod_timer(&nbd->req_timer,
> + jiffies + nbd->xmit_timeout);
> + else
> + del_timer_sync(&nbd->req_timer);
> + spin_unlock_irqrestore(&nbd->timer_lock, flags);
> + }
> + if (req != NULL) {
> + nbd_end_request(req);
> + continue;
> + }
> + break;
> + }
Please make this loop a bit more readable. Perhaps something like this?
while ((req = nbd_read_stat(nbd)) != NULL) {
nbd_end_request(req);
if (nbd->xmit_timeout) {
...
}
}
if (nbd->xmit_timeout) {
lock();
del_timer_sync();
unlock();
}
>
> device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
> - nbd->pid = 0;
Why is this removed?
> return 0;
> }
>
> @@ -669,9 +736,20 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
> set_capacity(nbd->disk, nbd->bytesize >> 9);
> return 0;
>
> - case NBD_SET_TIMEOUT:
> - nbd->xmit_timeout = arg * HZ;
> + case NBD_SET_TIMEOUT: {
> + int xt;
> +
> + xt = arg * HZ;
> + if (xt < 0)
> + return -EINVAL;
> + if (nbd->pid &&
Could you use nbd->sock here please? This doesn't really depend on a pid
does it?
> + ((!nbd->xmit_timeout && xt) || (nbd->xmit_timeout && !xt)))
> + return -EBUSY;
> + dev_info(disk_to_dev(nbd->disk), "NBD_SET_TIMEOUT: %d -> %d\n",
> + nbd->xmit_timeout / HZ, xt / HZ);
Please use dev_dbg here.
> + nbd->xmit_timeout = xt;
> return 0;
> + }
>
> case NBD_SET_FLAGS:
> nbd->flags = arg;
> @@ -694,6 +772,11 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
> if (!nbd->sock)
> return -EINVAL;
>
> + nbd->pid = task_pid_nr(current);
> + nbd->inflight = 0;
> + nbd->timedout = 0;
> + nbd->sender = NULL;
> + nbd->receiver = NULL;
> mutex_unlock(&nbd->tx_lock);
>
> if (nbd->flags & NBD_FLAG_READ_ONLY)
> @@ -710,6 +793,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
> nbd->disk->disk_name);
> if (IS_ERR(thread)) {
> mutex_lock(&nbd->tx_lock);
> + nbd->pid = 0;
I can't really see why it is necessary to get nbd->pid locked? And if
you need this, please split it into a separate patch.
> return PTR_ERR(thread);
> }
> wake_up_process(thread);
> @@ -717,6 +801,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
> kthread_stop(thread);
>
> mutex_lock(&nbd->tx_lock);
> + nbd->pid = 0;
> if (error)
> return error;
> sock_shutdown(nbd, 0);
> @@ -731,6 +816,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
> sockfd_put(sock);
> nbd->flags = 0;
> nbd->bytesize = 0;
> + nbd->xmit_timeout = 0;
> bdev->bd_inode->i_size = 0;
> set_capacity(nbd->disk, 0);
> if (max_part > 0)
> @@ -874,6 +960,8 @@ static int __init nbd_init(void)
> init_waitqueue_head(&nbd_dev[i].waiting_wq);
> nbd_dev[i].blksize = 1024;
> nbd_dev[i].bytesize = 0;
> + spin_lock_init(&nbd_dev[i].timer_lock);
> + init_timer(&nbd_dev[i].req_timer);
> disk->major = NBD_MAJOR;
> disk->first_minor = i << part_shift;
> disk->fops = &nbd_fops;
> diff --git a/include/linux/nbd.h b/include/linux/nbd.h
> index f62f78a..c1280ca 100644
> --- a/include/linux/nbd.h
> +++ b/include/linux/nbd.h
> @@ -41,6 +41,12 @@ struct nbd_device {
> pid_t pid; /* pid of nbd-client, if attached */
> int xmit_timeout;
> int disconnect; /* a disconnect has been requested by user */
> + spinlock_t timer_lock;
> + struct timer_list req_timer;
> + int inflight;
> + int timedout;
Do we really need 'timedout'? or is it possible to use the pending
signal to immediately shutdown the nbd?
Best Regards,
Markus
--
Pengutronix e.K. | |
Industrial Linux Solutions | http://www.pengutronix.de/ |
Peiner Str. 6-8, 31137 Hildesheim, Germany | Phone: +49-5121-206917-0 |
Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917-5555 |
Attachment:
signature.asc
Description: Digital signature