
Bug#607879: System hangs up with mmap.c:873!



On Sat, 2011-01-15 at 14:42 +0000, Ben Hutchings wrote:
> On Sat, 2011-01-15 at 15:36 +0100, Daniel Baumann wrote:
> > On 01/15/2011 03:29 PM, Ben Hutchings wrote:
> > > Daniel, please identify the bug fix we need.
> > 
> > http://lists.debian.org/debian-live/2011/01/msg00093.html
> 
> Well, we might as well just remove aufs then.

(No, that's not serious.)

The attached patch series might deal with this bug.  These patches are
simply cherry-picked from aufs upstream and modified as necessary to
compile.  I leave any further building and testing to you.

Ben.

-- 
Ben Hutchings
Once a job is fouled up, anything done to improve it makes it worse.
From: J. R. Okajima <hooanon05@yahoo.co.jp>
Date: Mon, 1 Mar 2010 17:52:50 +0900
Subject: [PATCH 1/4] Revert "aufs: narrow down the BKL region"

commit e62ca9737674cf9b70a961cb8d1efed4a7cff976 in aufs2-2.6

This reverts commit d84deeb079e09b33c2339bc7a54cf7d15c3b8a85.
The BKL does not help multi-threaded applications.
Lockdep reports a circular locking dependency involving mmap_sem here. It may
be correct, but I am not sure whether it is a false positive in the real world.

Reported-by: "James ." <jazzrz86@gmail.com>
Signed-off-by: J. R. Okajima <hooanon05@yahoo.co.jp>
---
 fs/aufs/f_op.c |   14 ++++----------
 1 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/fs/aufs/f_op.c b/fs/aufs/f_op.c
index 3e9e47d..4c4ef82 100644
--- a/fs/aufs/f_op.c
+++ b/fs/aufs/f_op.c
@@ -597,11 +597,8 @@ static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
 	up_write(&current->mm->mmap_sem);
 	si_read_lock(sb, AuLock_FLUSH);
 	err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
-	if (unlikely(err)) {
-		down_write(&current->mm->mmap_sem);
-		unlock_kernel();
+	if (unlikely(err))
 		goto out;
-	}
 
 	mmapped = !!au_test_mmapped(file);
 	if (wlock) {
@@ -609,16 +606,11 @@ static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
 
 		err = au_ready_to_write(file, -1, &pin);
 		di_downgrade_lock(dentry, AuLock_IR);
-		if (unlikely(err)) {
-			down_write(&current->mm->mmap_sem);
-			unlock_kernel();
+		if (unlikely(err))
 			goto out_unlock;
-		}
 		au_unpin(&pin);
 	} else
 		di_downgrade_lock(dentry, AuLock_IR);
-	down_write(&current->mm->mmap_sem);
-	unlock_kernel();
 
 	h_file = au_h_fptr(file, au_fbstart(file));
 	if (!mmapped && au_test_fs_bad_mapping(h_file->f_dentry->d_sb)) {
@@ -670,6 +662,8 @@ static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
 	fi_write_unlock(file);
  out:
 	si_read_unlock(sb);
+	down_write(&current->mm->mmap_sem);
+	unlock_kernel();
 	return err;
 }
 
-- 
1.7.2.3

From: J. R. Okajima <hooanon05@yahoo.co.jp>
Date: Mon, 1 Mar 2010 23:13:34 +0900
Subject: [PATCH 2/4] Revert "aufs: bugfix, unlock mmap_sem temporary using BKL"

commit 639e607997502dfe7dbe140c8de5d81ba99d4240 in aufs2-2.6

This reverts commit 4b70e6f04d4292d8b5ce6cd7ac7371e68eab9175.
The BKL does not help multi-threaded applications.
Lockdep reports a circular locking dependency involving mmap_sem here. It may
be correct, but I am not sure whether it is a false positive in the real world.

Reported-by: "James ." <jazzrz86@gmail.com>
---
 fs/aufs/f_op.c |   15 ---------------
 1 files changed, 0 insertions(+), 15 deletions(-)

diff --git a/fs/aufs/f_op.c b/fs/aufs/f_op.c
index 4c4ef82..6f89992 100644
--- a/fs/aufs/f_op.c
+++ b/fs/aufs/f_op.c
@@ -25,7 +25,6 @@
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/security.h>
-#include <linux/smp_lock.h>
 #include "aufs.h"
 
 /* common function to regular file and dir */
@@ -583,18 +582,6 @@ static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
 	dentry = file->f_dentry;
 	wlock = !!(file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED);
 	sb = dentry->d_sb;
-	/*
-	 * Very ugly BKL approach to keep the order of locks.
-	 * Here mm->mmap_sem is acquired by our caller.
-	 *
-	 * native readdir, i_mutex, copy_to_user, mmap_sem
-	 * aufs readdir, i_mutex, rwsem, nested-i_mutex, copy_to_user, mmap_sem
-	 * aufs mmap, mmap_sem, rwsem
-	 *
-	 * Unlock it temporary.
-	 */
-	lock_kernel();
-	up_write(&current->mm->mmap_sem);
 	si_read_lock(sb, AuLock_FLUSH);
 	err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
 	if (unlikely(err))
@@ -662,8 +649,6 @@ static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
 	fi_write_unlock(file);
  out:
 	si_read_unlock(sb);
-	down_write(&current->mm->mmap_sem);
-	unlock_kernel();
 	return err;
 }
 
-- 
1.7.2.3
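
For reference, the circular dependency that lockdep reports in these two
reverts reduces to a classic AB-BA inversion between mmap_sem and the aufs
rwsem (see the lock-ordering comment removed by the second patch).  Below is a
minimal sketch of that shape only, not aufs code; fs_rwsem, path_readdir() and
path_mmap() are hypothetical stand-ins:

#include <linux/rwsem.h>
#include <linux/sched.h>

static DECLARE_RWSEM(fs_rwsem);   /* stand-in for the aufs rwsem */

/* readdir-like path: fs_rwsem first, then mmap_sem is taken for the page
 * fault triggered by copy_to_user() */
static void path_readdir(void)
{
        down_read(&fs_rwsem);
        down_read(&current->mm->mmap_sem);      /* mmap_sem after rwsem */
        up_read(&current->mm->mmap_sem);
        up_read(&fs_rwsem);
}

/* mmap-like path: the caller of ->mmap takes mmap_sem (modelled here by the
 * down_write()), then the rwsem is wanted */
static void path_mmap(void)
{
        down_write(&current->mm->mmap_sem);
        down_read(&fs_rwsem);                   /* rwsem after mmap_sem */
        up_read(&fs_rwsem);
        up_write(&current->mm->mmap_sem);
}

If the two paths run concurrently, each can block on the lock the other
already holds; wrapping the mmap_sem dance in the BKL does not break that
cycle, which is why both commits are reverted here.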

From: J. R. Okajima <hooanon05@yahoo.co.jp>
Date: Mon, 8 Mar 2010 23:45:56 +0900
Subject: [PATCH 3/4] aufs: bugfix, another approach to keep the lock order of mmap_sem

commit d986fa5a8557f6861fcac4106b6d75301bf5d118 in aufs2-2.6

The previous approach
	4b70e6f aufs: bugfix, unlock mmap_sem temporary using BKL
was bad and has already been reverted.
This approach is ugly too, but it works.

- Split aufs_mmap() into two parts.
- The first part handles copying up, which requires the rwsem, and is
  executed by the aufsd workqueue.
- The second part is generic_file_mmap() and the vm_ops customization, and
  is executed in the original context.
- To protect the vm_ops customization from a race between two mmaps,
  introduce a new mutex in au_finfo. It is locked in the first phase and
  released in the second. This is the ugliest part of this approach; we
  would use fi_rwsem for this purpose if we could, but rwsem has no
  'set_owner' method while mutex does.

Signed-off-by: J. R. Okajima <hooanon05@yahoo.co.jp>
[bwh: Adjust for 2.6.32]
---
 fs/aufs/f_op.c  |  113 +++++++++++++++++++++++++++++++++++++++++++------------
 fs/aufs/file.h  |    3 +
 fs/aufs/finfo.c |   17 ++++++++
 3 files changed, 109 insertions(+), 24 deletions(-)

diff --git a/fs/aufs/f_op.c b/fs/aufs/f_op.c
index 6f89992..32cc36f 100644
--- a/fs/aufs/f_op.c
+++ b/fs/aufs/f_op.c
@@ -77,6 +77,7 @@ int au_do_open_nondir(struct file *file, int flags)
 	finfo = au_fi(file);
 	finfo->fi_h_vm_ops = NULL;
 	finfo->fi_vm_ops = NULL;
+	mutex_init(&finfo->fi_mmap); /* regular file only? */
 	bindex = au_dbstart(dentry);
 	/* O_TRUNC is processed already */
 	BUG_ON(au_test_ro(dentry->d_sb, bindex, dentry->d_inode)
@@ -544,7 +545,7 @@ static int au_custom_vm_ops(struct au_finfo *finfo, struct vm_area_struct *vma)
 	int err;
 	struct vm_operations_struct *h_ops;
 
-	AuRwMustAnyLock(&finfo->fi_rwsem);
+	MtxMustLock(&finfo->fi_mmap);
 
 	err = 0;
 	h_ops = finfo->fi_h_vm_ops;
@@ -570,49 +571,116 @@ static int au_custom_vm_ops(struct au_finfo *finfo, struct vm_area_struct *vma)
 	return err;
 }
 
-static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
+/*
+ * This is another ugly approach to keep the lock order, particularly
+ * mm->mmap_sem and aufs rwsem. The previous approach was reverted and you can
+ * find it in git-log, if you want.
+ *
+ * native readdir: i_mutex, copy_to_user, mmap_sem
+ * aufs readdir: i_mutex, rwsem, nested-i_mutex, copy_to_user, mmap_sem
+ *
+ * Before aufs_mmap() mmap_sem is acquired already, but aufs_mmap() has to
+ * acquire aufs rwsem. It introduces a circular locking dependency.
+ * To address this problem, aufs_mmap() delegates the part which requires aufs
+ * rwsem to its internal workqueue.
+ */
+
+/* very ugly approach */
+#ifdef CONFIG_DEBUG_MUTEXES
+#include <../kernel/mutex-debug.h>
+#else
+#include <../kernel/mutex.h>
+#endif
+
+struct au_mmap_pre_args {
+	/* input */
+	struct file *file;
+	struct vm_area_struct *vma;
+
+	/* output */
+	int *errp;
+	struct file *h_file;
+	int mmapped;
+};
+
+static int au_mmap_pre(struct file *file, struct vm_area_struct *vma,
+		       struct file **h_file, int *mmapped)
 {
 	int err;
-	unsigned char wlock, mmapped;
+	const unsigned char wlock
+		= !!(file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED);
 	struct dentry *dentry;
 	struct super_block *sb;
-	struct file *h_file;
-	struct vm_operations_struct *vm_ops;
 
 	dentry = file->f_dentry;
-	wlock = !!(file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED);
 	sb = dentry->d_sb;
-	si_read_lock(sb, AuLock_FLUSH);
+	si_read_lock(sb, !AuLock_FLUSH);
 	err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
 	if (unlikely(err))
 		goto out;
 
-	mmapped = !!au_test_mmapped(file);
+	*mmapped = !!au_test_mmapped(file);
 	if (wlock) {
 		struct au_pin pin;
 
 		err = au_ready_to_write(file, -1, &pin);
-		di_downgrade_lock(dentry, AuLock_IR);
+		di_write_unlock(dentry);
 		if (unlikely(err))
 			goto out_unlock;
 		au_unpin(&pin);
 	} else
-		di_downgrade_lock(dentry, AuLock_IR);
+		di_write_unlock(dentry);
+	*h_file = au_h_fptr(file, au_fbstart(file));
+	get_file(*h_file);
+	au_fi_mmap_lock(file);
 
-	h_file = au_h_fptr(file, au_fbstart(file));
-	if (!mmapped && au_test_fs_bad_mapping(h_file->f_dentry->d_sb)) {
+out_unlock:
+	fi_write_unlock(file);
+out:
+	si_read_unlock(sb);
+	return err;
+}
+
+static void au_call_mmap_pre(void *args)
+{
+	struct au_mmap_pre_args *a = args;
+	*a->errp = au_mmap_pre(a->file, a->vma, &a->h_file, &a->mmapped);
+}
+
+static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int err, wkq_err;
+	struct au_finfo *finfo;
+	struct dentry *h_dentry;
+	struct vm_operations_struct *vm_ops;
+	struct au_mmap_pre_args args = {
+		.file		= file,
+		.vma		= vma,
+		.errp		= &err
+	};
+
+	wkq_err = au_wkq_wait(au_call_mmap_pre, &args);
+	if (unlikely(wkq_err))
+		err = wkq_err;
+	if (unlikely(err))
+		goto out;
+	finfo = au_fi(file);
+
+	h_dentry = args.h_file->f_dentry;
+	if (!args.mmapped && au_test_fs_bad_mapping(h_dentry->d_sb)) {
 		/*
 		 * by this assignment, f_mapping will differs from aufs inode
 		 * i_mapping.
 		 * if someone else mixes the use of f_dentry->d_inode and
 		 * f_mapping->host, then a problem may arise.
 		 */
-		file->f_mapping = h_file->f_mapping;
+		file->f_mapping = args.h_file->f_mapping;
 	}
 
 	vm_ops = NULL;
-	if (!mmapped) {
-		vm_ops = au_vm_ops(h_file, vma);
+	if (!args.mmapped) {
+		vm_ops = au_vm_ops(args.h_file, vma);
 		err = PTR_ERR(vm_ops);
 		if (IS_ERR(vm_ops))
 			goto out_unlock;
@@ -630,25 +698,22 @@ static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
 		goto out_unlock;
 
 	vma->vm_ops = &aufs_vm_ops;
-	if (!mmapped) {
-		struct au_finfo *finfo = au_fi(file);
-
+	if (!args.mmapped) {
 		finfo->fi_h_vm_ops = vm_ops;
 		mutex_init(&finfo->fi_vm_mtx);
 	}
 
-	err = au_custom_vm_ops(au_fi(file), vma);
+	err = au_custom_vm_ops(finfo, vma);
 	if (unlikely(err))
 		goto out_unlock;
 
-	vfsub_file_accessed(h_file);
-	fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode);
+	vfsub_file_accessed(args.h_file);
+	fsstack_copy_attr_atime(file->f_dentry->d_inode, h_dentry->d_inode);
 
  out_unlock:
-	di_read_unlock(dentry, AuLock_IR);
-	fi_write_unlock(file);
+	au_fi_mmap_unlock(file);
+	fput(args.h_file);
  out:
-	si_read_unlock(sb);
 	return err;
 }
 
diff --git a/fs/aufs/file.h b/fs/aufs/file.h
index 4aaea9e..1d1d9d6 100644
--- a/fs/aufs/file.h
+++ b/fs/aufs/file.h
@@ -49,6 +49,7 @@ struct au_finfo {
 			struct vm_operations_struct	*fi_h_vm_ops;
 			struct vm_operations_struct	*fi_vm_ops;
 			struct mutex			fi_vm_mtx;
+			struct mutex			fi_mmap;
 		};
 
 		/* dir only */
@@ -114,6 +115,8 @@ void au_set_h_fptr(struct file *file, aufs_bindex_t bindex,
 		   struct file *h_file);
 
 void au_update_figen(struct file *file);
+void au_fi_mmap_lock(struct file *file);
+void au_fi_mmap_unlock(struct file *file);
 
 void au_finfo_fin(struct file *file);
 int au_finfo_init(struct file *file);
diff --git a/fs/aufs/finfo.c b/fs/aufs/finfo.c
index 4ab55e4..24ed4a1 100644
--- a/fs/aufs/finfo.c
+++ b/fs/aufs/finfo.c
@@ -56,6 +56,23 @@ void au_update_figen(struct file *file)
 
 /* ---------------------------------------------------------------------- */
 
+void au_fi_mmap_lock(struct file *file)
+{
+	FiMustWriteLock(file);
+	lockdep_off();
+	mutex_lock(&au_fi(file)->fi_mmap);
+	lockdep_on();
+}
+
+void au_fi_mmap_unlock(struct file *file)
+{
+	lockdep_off();
+	mutex_unlock(&au_fi(file)->fi_mmap);
+	lockdep_on();
+}
+
+/* ---------------------------------------------------------------------- */
+
 void au_finfo_fin(struct file *file)
 {
 	struct au_finfo *finfo;
-- 
1.7.2.3
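
The delegation that patch 3 introduces (au_mmap_pre() executed through
au_wkq_wait()) follows a generic pattern: pack the arguments into a struct,
run the half that needs the filesystem rwsem on a worker thread that does not
hold mmap_sem, and wait for its completion before finishing the mmap in the
original context.  Below is a minimal, generic sketch of that pattern only;
it is not aufs code, and the my_* names are hypothetical:

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/completion.h>

struct my_pre_args {
        struct work_struct      wk;
        struct completion       done;
        struct file             *file; /* input */
        int                     err;   /* output */
};

static void my_pre_work(struct work_struct *wk)
{
        struct my_pre_args *a = container_of(wk, struct my_pre_args, wk);

        /* Runs in the worker, which holds no mmap_sem, so taking the
         * filesystem rwsem here cannot recreate the
         * mmap_sem -> rwsem -> mmap_sem cycle. */
        a->err = 0;             /* ... take rwsem, revalidate, drop it ... */
        complete(&a->done);
}

static int my_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct my_pre_args a = { .file = file };

        INIT_WORK(&a.wk, my_pre_work);
        init_completion(&a.done);
        schedule_work(&a.wk);           /* caller still holds mmap_sem */
        wait_for_completion(&a.done);   /* worker never touches mmap_sem */
        if (a.err)
                return a.err;
        /* second phase, back in the original context */
        return generic_file_mmap(file, vma);
}

aufs additionally hands back h_file and the "already mmapped" flag through
the args struct and serializes the vm_ops setup with the new fi_mmap mutex,
but the control flow is the one sketched above.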

From: J. R. Okajima <hooanon05@yahoo.co.jp>
Date: Tue, 29 Jun 2010 14:32:26 +0900
Subject: [PATCH 4/4] aufs: bugfix, separate the workqueue for preprocessing mmap

commit b0372e021a903e33f39dd515ceebd8506b1c52aa in aufs2-2.6

A variation of the common AB-BA deadlock problem.

ProcessA:
- aufs_mmap
  + pre-process using a workqueue
  + wait until the workqueue task returns

Workqueue task for ProcessA:
- acquire aufs rwsem

ProcessB:
- lookup or readdir in aufs
  + acquire aufs rwsem
  + assign a new inode number
  + write the value to the XINO file using a workqueue
  + wait until the workqueue task returns

Since the workqueue handles requests one at a time, both ProcessA and
ProcessB wait forever.

This bug was introduced by the commit
d986fa5 2010-03-08
	aufs: bugfix, another approach to keep the lock order of mmap_sem
which added the most recently introduced workqueue task; it is the only task
that acquires such a lock from within the workqueue.
To fix it, introduce another workqueue used only for preprocessing mmap.
This makes the approach even uglier, but I have no other option.

Reported-by: Oliver Welter <mail@oliwel.de>
Signed-off-by: J. R. Okajima <hooanon05@yahoo.co.jp>
[bwh: Adjust context for Debian's 2.6.32]
---
 fs/aufs/f_op.c            |    2 +-
 fs/aufs/wkq.c             |   62 ++++++++++++++++++++++++++++++++++++--------
 fs/aufs/wkq.h             |   13 ++++++++-
 include/linux/aufs_type.h |    1 +
 4 files changed, 64 insertions(+), 14 deletions(-)

diff --git a/fs/aufs/f_op.c b/fs/aufs/f_op.c
index 6abec9c..99982e5 100644
--- a/fs/aufs/f_op.c
+++ b/fs/aufs/f_op.c
@@ -666,7 +666,7 @@ static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
 		.errp		= &err
 	};
 
-	wkq_err = au_wkq_wait(au_call_mmap_pre, &args);
+	wkq_err = au_wkq_wait_pre(au_call_mmap_pre, &args);
 	if (unlikely(wkq_err))
 		err = wkq_err;
 	if (unlikely(err))
diff --git a/fs/aufs/wkq.c b/fs/aufs/wkq.c
index 307b1c4..c867e70 100644
--- a/fs/aufs/wkq.c
+++ b/fs/aufs/wkq.c
@@ -24,8 +24,23 @@
 #include <linux/module.h>
 #include "aufs.h"
 
-/* internal workqueue named AUFS_WKQ_NAME */
-static struct workqueue_struct *au_wkq;
+/* internal workqueue named AUFS_WKQ_NAME and AUFS_WKQ_PRE_NAME */
+enum {
+	AuWkq_INORMAL,
+	AuWkq_IPRE
+};
+
+static struct {
+	char *name;
+	struct workqueue_struct *wkq;
+} au_wkq[] = {
+	[AuWkq_INORMAL] = {
+		.name = AUFS_WKQ_NAME
+	},
+	[AuWkq_IPRE] = {
+		.name = AUFS_WKQ_PRE_NAME
+	}
+};
 
 struct au_wkinfo {
 	struct work_struct wk;
@@ -96,29 +111,34 @@ static void au_wkq_comp_free(struct completion *comp __maybe_unused)
 }
 #endif /* 4KSTACKS */
 
-static void au_wkq_run(struct au_wkinfo *wkinfo, int do_wait)
+static void au_wkq_run(struct au_wkinfo *wkinfo, unsigned int flags)
 {
+	struct workqueue_struct *wkq;
+
 	au_dbg_verify_kthread();
 	INIT_WORK(&wkinfo->wk, wkq_func);
-	if (do_wait)
-		queue_work(au_wkq, &wkinfo->wk);
-	else
+	if (flags & AuWkq_WAIT) {
+		wkq = au_wkq[AuWkq_INORMAL].wkq;
+		if (flags & AuWkq_PRE)
+			wkq = au_wkq[AuWkq_IPRE].wkq;
+		queue_work(wkq, &wkinfo->wk);
+	} else
 		schedule_work(&wkinfo->wk);
 }
 
-int au_wkq_wait(au_wkq_func_t func, void *args)
+int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args)
 {
 	int err;
 	AuWkqCompDeclare(comp);
 	struct au_wkinfo wkinfo = {
-		.flags	= AuWkq_WAIT,
+		.flags	= flags,
 		.func	= func,
 		.args	= args
 	};
 
 	err = au_wkq_comp_alloc(&wkinfo, &comp);
 	if (!err) {
-		au_wkq_run(&wkinfo, AuWkq_WAIT);
+		au_wkq_run(&wkinfo, flags);
 		/* no timeout, no interrupt */
 		wait_for_completion(wkinfo.comp);
 		au_wkq_comp_free(comp);
@@ -170,11 +190,29 @@ void au_nwt_init(struct au_nowait_tasks *nwt)
 
 void au_wkq_fin(void)
 {
-	destroy_workqueue(au_wkq);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(au_wkq); i++)
+		if (au_wkq[i].wkq)
+			destroy_workqueue(au_wkq[i].wkq);
 }
 
 int __init au_wkq_init(void)
 {
-	au_wkq = create_workqueue(AUFS_WKQ_NAME);
-	return 0;
+	int err, i;
+
+	err = 0;
+	for (i = 0; !err && i < ARRAY_SIZE(au_wkq); i++) {
+		au_wkq[i].wkq = create_workqueue(au_wkq[i].name);
+		if (IS_ERR(au_wkq[i].wkq))
+			err = PTR_ERR(au_wkq[i].wkq);
+		else if (!au_wkq[i].wkq)
+			err = -ENOMEM;
+		if (unlikely(err))
+			au_wkq[i].wkq = NULL;
+	}
+	if (unlikely(err))
+		au_wkq_fin();
+
+	return err;
 }
diff --git a/fs/aufs/wkq.h b/fs/aufs/wkq.h
index 3fe36b3..3e6f5b5 100644
--- a/fs/aufs/wkq.h
+++ b/fs/aufs/wkq.h
@@ -48,12 +48,13 @@ typedef void (*au_wkq_func_t)(void *args);
 
 /* wkq flags */
 #define AuWkq_WAIT	1
+#define AuWkq_PRE	(1 << 1)
 #define au_ftest_wkq(flags, name)	((flags) & AuWkq_##name)
 #define au_fset_wkq(flags, name)	{ (flags) |= AuWkq_##name; }
 #define au_fclr_wkq(flags, name)	{ (flags) &= ~AuWkq_##name; }
 
 /* wkq.c */
-int au_wkq_wait(au_wkq_func_t func, void *args);
+int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args);
 int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb);
 void au_nwt_init(struct au_nowait_tasks *nwt);
 int __init au_wkq_init(void);
@@ -61,6 +62,16 @@ void au_wkq_fin(void);
 
 /* ---------------------------------------------------------------------- */
 
+static inline int au_wkq_wait_pre(au_wkq_func_t func, void *args)
+{
+	return au_wkq_do_wait(AuWkq_WAIT | AuWkq_PRE, func, args);
+}
+
+static inline int au_wkq_wait(au_wkq_func_t func, void *args)
+{
+	return au_wkq_do_wait(AuWkq_WAIT, func, args);
+}
+
 static inline int au_test_wkq(struct task_struct *tsk)
 {
 	return !tsk->mm
diff --git a/include/linux/aufs_type.h b/include/linux/aufs_type.h
index 3ca3948..b13cfc1 100644
--- a/include/linux/aufs_type.h
+++ b/include/linux/aufs_type.h
@@ -75,6 +75,7 @@ typedef __s16 aufs_bindex_t;
 #define AUFS_RDBLK_DEF		512 /* bytes */
 #define AUFS_RDHASH_DEF		32
 #define AUFS_WKQ_NAME		AUFS_NAME "d"
+#define AUFS_WKQ_PRE_NAME	AUFS_WKQ_NAME "_pre"
 #define AUFS_MFS_SECOND_DEF	30 /* seconds */
 #define AUFS_PLINK_WARN		100 /* number of plinks */
 
-- 
1.7.2.3
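
The deadlock fixed by patch 4 has the usual "work item waits for a lock whose
holder waits for the same single worker" shape.  Below is a minimal sketch of
that shape only, assuming a queue with one worker thread (e.g. created with
create_singlethread_workqueue()); one_wq, fs_rwsem and the functions are
hypothetical names, not aufs code:

#include <linux/workqueue.h>
#include <linux/completion.h>
#include <linux/rwsem.h>

static struct workqueue_struct *one_wq; /* single shared queue, created at
                                         * init with
                                         * create_singlethread_workqueue() */
static DECLARE_RWSEM(fs_rwsem);         /* stand-in for the aufs rwsem */

struct one_shot {
        struct work_struct      wk;
        struct completion       done;
};

/* queued by "ProcessA" (mmap pre-processing), which then blocks in
 * wait_for_completion() on it */
static void work_taking_rwsem(struct work_struct *wk)
{
        struct one_shot *w = container_of(wk, struct one_shot, wk);

        down_read(&fs_rwsem);   /* blocks: process_b() below holds it ... */
        up_read(&fs_rwsem);
        complete(&w->done);
}

/* queued by "ProcessB" (XINO write); it never runs because the only worker
 * thread is stuck in work_taking_rwsem() */
static void work_behind_it(struct work_struct *wk)
{
        struct one_shot *w = container_of(wk, struct one_shot, wk);

        complete(&w->done);
}

/* ProcessB: holds fs_rwsem and waits for its own item on the same queue */
static void process_b(void)
{
        struct one_shot b;

        down_write(&fs_rwsem);
        INIT_WORK(&b.wk, work_behind_it);
        init_completion(&b.done);
        queue_work(one_wq, &b.wk);
        wait_for_completion(&b.done);   /* ... so this never returns and
                                         * fs_rwsem is never released:
                                         * deadlock */
        up_write(&fs_rwsem);
}

Moving the mmap pre-processing onto its own queue (AUFS_WKQ_PRE_NAME) gives
it a worker that can never be occupied by the XINO write, which breaks the
cycle.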
