On Tue, 2015-10-06 at 20:03 +0200, Markus Pargmann wrote: > The timeout handling introduced in > > 7e2893a16d3e (nbd: Fix timeout detection) > introduces a race condition which may lead to killing of tasks that are > not in nbd context anymore. This was not observed or reproducable yet. > > This patch adds locking to critical use of task_recv and task_send to > avoid killing tasks that already left the NBD thread functions. This > lock is only acquired if a timeout occures or the nbd device > starts/stops. > > Reported-by: Ben Hutchings <ben@...1505...> > Signed-off-by: Markus Pargmann <mpa@...1897...> Reviewed-by: Ben Hutchings <ben@...1505...> You could add 'Fixes: 7e2893a16d3e ("nbd: Fix timeout detection")' to the commit message as well. nbd_dbg_tasks_show() can still race with thread exit and two tasks can race to become the receive thread, but those aren't new bugs. Ben. > --- > drivers/block/nbd.c | 36 ++++++++++++++++++++++++++++++------ > 1 file changed, 30 insertions(+), 6 deletions(-) > > diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c > index 039c7c4f0539..1a70852ac808 100644 > --- a/drivers/block/nbd.c > +++ b/drivers/block/nbd.c > @@ -60,6 +60,7 @@ struct nbd_device { > > > bool disconnect; /* a disconnect has been requested by user */ > > > > struct timer_list timeout_timer; > +> > spinlock_t tasks_lock; > > > struct task_struct *task_recv; > > > struct task_struct *task_send; > > @@ -140,21 +141,23 @@ static void sock_shutdown(struct nbd_device *nbd) > static void nbd_xmit_timeout(unsigned long arg) > { > > > struct nbd_device *nbd = (struct nbd_device *)arg; > -> > struct task_struct *task; > +> > unsigned long flags; > > > > if (list_empty(&nbd->queue_head)) > > > > return; > > > > nbd->disconnect = true; > > -> > task = READ_ONCE(nbd->task_recv); > -> > if (task) > -> > > force_sig(SIGKILL, task); > +> > spin_lock_irqsave(&nbd->tasks_lock, flags); > + > +> > if (nbd->task_recv) > +> > > force_sig(SIGKILL, nbd->task_recv); > > -> > task = 
READ_ONCE(nbd->task_send); > -> > if (task) > +> > if (nbd->task_send) > > > > force_sig(SIGKILL, nbd->task_send); > > +> > spin_unlock_irqrestore(&nbd->tasks_lock, flags); > + > > > dev_err(nbd_to_dev(nbd), "Connection timed out, killed receiver and sender, shutting down connection\n"); > } > > @@ -403,17 +406,24 @@ static int nbd_thread_recv(struct nbd_device *nbd) > { > > > struct request *req; > > > int ret; > +> > unsigned long flags; > > > > BUG_ON(nbd->magic != NBD_MAGIC); > > > > sk_set_memalloc(nbd->sock->sk); > > +> > spin_lock_irqsave(&nbd->tasks_lock, flags); > > > nbd->task_recv = current; > +> > spin_unlock_irqrestore(&nbd->tasks_lock, flags); > > > > ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); > > > if (ret) { > > > > dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); > + > +> > > spin_lock_irqsave(&nbd->tasks_lock, flags); > > > > nbd->task_recv = NULL; > +> > > spin_unlock_irqrestore(&nbd->tasks_lock, flags); > + > > > > return ret; > > > } > > @@ -429,7 +439,9 @@ static int nbd_thread_recv(struct nbd_device *nbd) > > > > device_remove_file(disk_to_dev(nbd->disk), &pid_attr); > > +> > spin_lock_irqsave(&nbd->tasks_lock, flags); > > > nbd->task_recv = NULL; > +> > spin_unlock_irqrestore(&nbd->tasks_lock, flags); > > > > if (signal_pending(current)) { > > > > siginfo_t info; > @@ -534,8 +546,11 @@ static int nbd_thread_send(void *data) > { > > > struct nbd_device *nbd = data; > > > struct request *req; > +> > unsigned long flags; > > +> > spin_lock_irqsave(&nbd->tasks_lock, flags); > > > nbd->task_send = current; > +> > spin_unlock_irqrestore(&nbd->tasks_lock, flags); > > > > set_user_nice(current, MIN_NICE); > > > while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { > @@ -572,7 +587,15 @@ static int nbd_thread_send(void *data) > > > > nbd_handle_req(nbd, req); > > > } > > +> > spin_lock_irqsave(&nbd->tasks_lock, flags); > > > nbd->task_send = NULL; > +> > spin_unlock_irqrestore(&nbd->tasks_lock, flags); 
> + > +> > /* Clear maybe pending signals */ > +> > if (signal_pending(current)) { > +> > > siginfo_t info; > +> > > dequeue_signal_lock(current, &current->blocked, &info); > +> > } > > > > return 0; > } > > @@ -1027,6 +1050,7 @@ static int __init nbd_init(void) > > > > nbd_dev[i].magic = NBD_MAGIC; > > > > INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); > > > > spin_lock_init(&nbd_dev[i].queue_lock); > +> > > spin_lock_init(&nbd_dev[i].tasks_lock); > > > > INIT_LIST_HEAD(&nbd_dev[i].queue_head); > > > > mutex_init(&nbd_dev[i].tx_lock); > > > > init_timer(&nbd_dev[i].timeout_timer); -- Ben Hutchings Once a job is fouled up, anything done to improve it makes it worse.
Attachment:
signature.asc
Description: This is a digitally signed message part