On Tue, 2015-10-06 at 20:03 +0200, Markus Pargmann wrote: > The timeout handling introduced in > > 7e2893a16d3e (nbd: Fix timeout detection) > introduces a race condition which may lead to killing of tasks that are > not in nbd context anymore. This was not observed or reproducable yet. > > This patch adds locking to critical use of task_recv and task_send to > avoid killing tasks that already left the NBD thread functions. This > lock is only acquired if a timeout occures or the nbd device > starts/stops. > > Reported-by: Ben Hutchings <ben@...1505...> > Signed-off-by: Markus Pargmann <mpa@...1897...> Reviewed-by: Ben Hutchings <ben@...1505...> You could add 'Fixes: 7e2893a16d3e ("nbd: Fix timeout detection")' to the commit message as well. nbd_dbg_tasks_show() can still race with thread exit and two tasks can race to become the receive thread, but those aren't new bugs. Ben. > --- > drivers/block/nbd.c | 36 ++++++++++++++++++++++++++++++------ > 1 file changed, 30 insertions(+), 6 deletions(-) > > diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c > index 039c7c4f0539..1a70852ac808 100644 > --- a/drivers/block/nbd.c > +++ b/drivers/block/nbd.c > @@ -60,6 +60,7 @@ struct nbd_device { > > > bool disconnect; /* a disconnect has been requested by user */ > > > > struct timer_list timeout_timer; > +> > spinlock_t tasks_lock; > > > struct task_struct *task_recv; > > > struct task_struct *task_send; > > @@ -140,21 +141,23 @@ static void sock_shutdown(struct nbd_device *nbd) > static void nbd_xmit_timeout(unsigned long arg) > { > > > struct nbd_device *nbd = (struct nbd_device *)arg; > -> > struct task_struct *task; > +> > unsigned long flags; > > > > if (list_empty(&nbd->queue_head)) > > > > return; > > > > nbd->disconnect = true; > > -> > task = READ_ONCE(nbd->task_recv); > -> > if (task) > -> > > force_sig(SIGKILL, task); > +> > spin_lock_irqsave(&nbd->tasks_lock, flags); > + > +> > if (nbd->task_recv) > +> > > force_sig(SIGKILL, nbd->task_recv); > > -> > task = 
READ_ONCE(nbd->task_send); > -> > if (task) > +> > if (nbd->task_send) > > > > force_sig(SIGKILL, nbd->task_send); > > +> > spin_unlock_irqrestore(&nbd->tasks_lock, flags); > + > > > dev_err(nbd_to_dev(nbd), "Connection timed out, killed receiver and sender, shutting down connection\n"); > } > > @@ -403,17 +406,24 @@ static int nbd_thread_recv(struct nbd_device *nbd) > { > > > struct request *req; > > > int ret; > +> > unsigned long flags; > > > > BUG_ON(nbd->magic != NBD_MAGIC); > > > > sk_set_memalloc(nbd->sock->sk); > > +> > spin_lock_irqsave(&nbd->tasks_lock, flags); > > > nbd->task_recv = current; > +> > spin_unlock_irqrestore(&nbd->tasks_lock, flags); > > > > ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); > > > if (ret) { > > > > dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); > + > +> > > spin_lock_irqsave(&nbd->tasks_lock, flags); > > > > nbd->task_recv = NULL; > +> > > spin_unlock_irqrestore(&nbd->tasks_lock, flags); > + > > > > return ret; > > > } > > @@ -429,7 +439,9 @@ static int nbd_thread_recv(struct nbd_device *nbd) > > > > device_remove_file(disk_to_dev(nbd->disk), &pid_attr); > > +> > spin_lock_irqsave(&nbd->tasks_lock, flags); > > > nbd->task_recv = NULL; > +> > spin_unlock_irqrestore(&nbd->tasks_lock, flags); > > > > if (signal_pending(current)) { > > > > siginfo_t info; > @@ -534,8 +546,11 @@ static int nbd_thread_send(void *data) > { > > > struct nbd_device *nbd = data; > > > struct request *req; > +> > unsigned long flags; > > +> > spin_lock_irqsave(&nbd->tasks_lock, flags); > > > nbd->task_send = current; > +> > spin_unlock_irqrestore(&nbd->tasks_lock, flags); > > > > set_user_nice(current, MIN_NICE); > > > while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { > @@ -572,7 +587,15 @@ static int nbd_thread_send(void *data) > > > > nbd_handle_req(nbd, req); > > > } > > +> > spin_lock_irqsave(&nbd->tasks_lock, flags); > > > nbd->task_send = NULL; > +> > spin_unlock_irqrestore(&nbd->tasks_lock, flags); 
> + > +> > /* Clear maybe pending signals */ > +> > if (signal_pending(current)) { > +> > > siginfo_t info; > +> > > dequeue_signal_lock(current, &current->blocked, &info); > +> > } > > > > return 0; > } > > @@ -1027,6 +1050,7 @@ static int __init nbd_init(void) > > > > nbd_dev[i].magic = NBD_MAGIC; > > > > INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); > > > > spin_lock_init(&nbd_dev[i].queue_lock); > +> > > spin_lock_init(&nbd_dev[i].tasks_lock); > > > > INIT_LIST_HEAD(&nbd_dev[i].queue_head); > > > > mutex_init(&nbd_dev[i].tx_lock); > > > > init_timer(&nbd_dev[i].timeout_timer); -- Ben Hutchings Once a job is fouled up, anything done to improve it makes it worse.
Attachment:
signature.asc
Description: This is a digitally signed message part