[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#655353: marked as done (XFS lockups on 2.6.32 [task xfssyncd blocked for more than 120 seconds])



Your message dated Wed, 11 Jan 2012 06:14:08 -0600
with message-id <20120111121408.GD32173@burratino>
and subject line Re: XFS lockups [task xfssyncd blocked for more than 120 seconds]
has caused the Debian Bug report #655353,
regarding XFS lockups on 2.6.32 [task xfssyncd blocked for more than 120 seconds]
to be marked as done.

This means that you claim that the problem has been dealt with.
If this is not the case it is now your responsibility to reopen the
Bug report if necessary, and/or fix the problem forthwith.

(NB: If you are a system administrator and have no idea what this
message is talking about, this may indicate a serious mail system
misconfiguration somewhere. Please contact owner@bugs.debian.org
immediately.)


-- 
655353: http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=655353
Debian Bug Tracking System
Contact owner@bugs.debian.org with problems
--- Begin Message ---
Source: linux-2.6
Version: 2.6.32-39
Severity: normal


Hello,

Since the upgrade from lenny to squeeze I am getting in several servers random kernel lockups related to the XFS filesystem. The errors looks the same on all servers. All servers are running Debian kernel 2.6.32-39




Server 1 running Debian/Squeeze with linux-image-2.6.32-5-686/2.6.32-39
-----------------------------------------------------------------------------------------

[539160.520045] INFO: task xfssyncd:985 blocked for more than 120 seconds.
[539160.520077] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[539160.520123] xfssyncd      D f9197020     0   985      2 0x00000000
[539160.520129]  c340fb40 00000046 f6c32200 f9197020 00000000 c141d100 c141d100 c14186ac
[539160.520137]  c340fcfc c2108100 00000001 0000a0bf 00000000 00000000 00000010 f6a7f280
[539160.520145]  c21036ac c340fcfc 0806c251 f6a38b90 00000082 00000000 00000000 00000000
[539160.520154] Call Trace:
[539160.520179]  [<c126d9b9>] ? schedule_timeout+0x20/0xb0
[539160.520184]  [<c1138433>] ? __lookup_tag+0x8e/0xee
[539160.520189]  [<c11385b1>] ? radix_tree_gang_lookup_tag+0x8a/0xd5
[539160.520193]  [<c126d8c2>] ? wait_for_common+0xa4/0x100
[539160.520199]  [<c102d500>] ? default_wake_function+0x0/0x8
[539160.520225]  [<f8324433>] ? xfs_reclaim_inode+0x85/0xc8 [xfs]
[539160.520259]  [<f8324c26>] ? xfs_inode_ag_walk+0x73/0xbd [xfs]
[539160.520280]  [<f83243ae>] ? xfs_reclaim_inode+0x0/0xc8 [xfs]
[539160.520302]  [<f8324cc8>] ? xfs_inode_ag_iterator+0x58/0x8f [xfs]
[539160.520324]  [<f83243ae>] ? xfs_reclaim_inode+0x0/0xc8 [xfs]
[539160.520345]  [<f8324de4>] ? xfs_reclaim_inodes+0x12/0x16 [xfs]
[539160.520367]  [<f8324e0b>] ? xfs_sync_worker+0x23/0x5c [xfs]
[539160.520388]  [<f83246d7>] ? xfssyncd+0x134/0x17d [xfs]
[539160.520410]  [<f83245a3>] ? xfssyncd+0x0/0x17d [xfs]
[539160.520415]  [<c1043dfc>] ? kthread+0x61/0x66
[539160.520418]  [<c1043d9b>] ? kthread+0x0/0x66
[539160.520423]  [<c1003d47>] ? kernel_thread_helper+0x7/0x10


Server 2 running Debian/Squeeze with linux-image-2.6.32-5-vserver-amd64/2.6.32-39
-----------------------------------------------------------------------------------------

[864600.444518] INFO: task xfssyncd:933 blocked for more than 120 seconds.
[864600.444563] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[864600.444608] xfssyncd      D 0000000000000000     0   933      2 0x00000000
[864600.444612]  ffff88022f06b980 0000000000000046 0000000000000000 ffff880050925e40
[864600.444617]  0000000000002000 ffffffff8118af2b 000000000000f9e0 ffff88022c607fd8
[864600.444620]  0000000000015780 0000000000015780 ffff88022f195640 ffff88022f195938
[864600.444625] Call Trace:
[864600.444632]  [<ffffffff8118af2b>] ? generic_make_request+0x299/0x2f9
[864600.444637]  [<ffffffff8130a2fd>] ? schedule_timeout+0x2e/0xdd
[864600.444659]  [<ffffffffa01cea3d>] ? _xfs_buf_ioapply+0x204/0x22f [xfs]
[864600.444663]  [<ffffffff8119e9c8>] ? __lookup_tag+0xad/0x11b
[864600.444666]  [<ffffffff8130a1b6>] ? wait_for_common+0xde/0x15b
[864600.444671]  [<ffffffff8104a705>] ? default_wake_function+0x0/0x9
[864600.444685]  [<ffffffffa01d6058>] ? xfs_reclaim_inode+0x95/0xe0 [xfs]
[864600.444698]  [<ffffffffa01d6975>] ? xfs_inode_ag_walk+0x92/0xef [xfs]
[864600.444711]  [<ffffffffa01d5fc3>] ? xfs_reclaim_inode+0x0/0xe0 [xfs]
[864600.444724]  [<ffffffffa01d6a43>] ? xfs_inode_ag_iterator+0x71/0xb2 [xfs]
[864600.444737]  [<ffffffffa01d5fc3>] ? xfs_reclaim_inode+0x0/0xe0 [xfs]
[864600.444751]  [<ffffffffa01d6bd0>] ? xfs_sync_worker+0x26/0x5f [xfs]
[864600.444764]  [<ffffffffa01d6338>] ? xfssyncd+0x150/0x1bb [xfs]
[864600.444777]  [<ffffffffa01d61e8>] ? xfssyncd+0x0/0x1bb [xfs]
[864600.444780]  [<ffffffff81065bf5>] ? kthread+0x79/0x81
[864600.444784]  [<ffffffff81011baa>] ? child_rip+0xa/0x20
[864600.444788]  [<ffffffff8118bd3d>] ? generic_unplug_device+0x0/0x34
[864600.444791]  [<ffffffff81065b7c>] ? kthread+0x0/0x81
[864600.444794]  [<ffffffff81011ba0>] ? child_rip+0x0/0x20


Server 3 running Debian/Squeeze with linux-image-2.6.32-5-vserver-amd64/2.6.32-39
-----------------------------------------------------------------------------------------

[ 5520.764046] INFO: task xfssyncd:1376 blocked for more than 120 seconds.
[ 5520.764080] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5520.764126] xfssyncd      D 0000000000000000     0  1376      2 0x00000000
[ 5520.764132]  ffff88007fb5d640 0000000000000046 0000000000000000 ffff88001f342540
[ 5520.764138]  00000000000000ff ffffffff8118af2b 000000000000f9e0 ffff88007e291fd8
[ 5520.764143]  0000000000015780 0000000000015780 ffff880037baab20 ffff880037baae18
[ 5520.764148] Call Trace:
[ 5520.764159]  [<ffffffff8118af2b>] ? generic_make_request+0x299/0x2f9
[ 5520.764166]  [<ffffffff8130a2fd>] ? schedule_timeout+0x2e/0xdd
[ 5520.764200]  [<ffffffffa0268a3d>] ? _xfs_buf_ioapply+0x204/0x22f [xfs]
[ 5520.764205]  [<ffffffff8119e9c8>] ? __lookup_tag+0xad/0x11b
[ 5520.764210]  [<ffffffff8130a1b6>] ? wait_for_common+0xde/0x15b
[ 5520.764216]  [<ffffffff8104a705>] ? default_wake_function+0x0/0x9
[ 5520.764234]  [<ffffffffa0270058>] ? xfs_reclaim_inode+0x95/0xe0 [xfs]
[ 5520.764251]  [<ffffffffa0270975>] ? xfs_inode_ag_walk+0x92/0xef [xfs]
[ 5520.764268]  [<ffffffffa026ffc3>] ? xfs_reclaim_inode+0x0/0xe0 [xfs]
[ 5520.764285]  [<ffffffffa0270a43>] ? xfs_inode_ag_iterator+0x71/0xb2 [xfs]
[ 5520.764301]  [<ffffffffa026ffc3>] ? xfs_reclaim_inode+0x0/0xe0 [xfs]
[ 5520.764318]  [<ffffffffa0270bd0>] ? xfs_sync_worker+0x26/0x5f [xfs]
[ 5520.764335]  [<ffffffffa0270338>] ? xfssyncd+0x150/0x1bb [xfs]
[ 5520.764351]  [<ffffffffa02701e8>] ? xfssyncd+0x0/0x1bb [xfs]
[ 5520.764356]  [<ffffffff81065bf5>] ? kthread+0x79/0x81
[ 5520.764362]  [<ffffffff81011baa>] ? child_rip+0xa/0x20
[ 5520.764366]  [<ffffffff81065b7c>] ? kthread+0x0/0x81
[ 5520.764369]  [<ffffffff81011ba0>] ? child_rip+0x0/0x20


Server 4 running Debian/Squeeze with linux-image-2.6.32-5-vserver-amd64/2.6.32-39
-----------------------------------------------------------------------------------------

[279117.747342] INFO: task xfssyncd:1985 blocked for more than 120 seconds.
[279117.747401] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[279117.747463] xfssyncd      D 0000000000000000     0  1985      2 0x00000000
[279117.747466]  ffff88023f06c0b0 0000000000000046 0000000000800012 ffffffff8118a673
[279117.747468]  ffff88023f0a6c00 000000000000f9e0 ffff88023cd93fd8 0000000000015780
[279117.747470]  0000000000015780 ffff88023c1547e0 ffff88023c154ad8 0000000298bc2318
[279117.747473] Call Trace:
[279117.747479]  [<ffffffff8118a673>] ? generic_make_request+0x299/0x2f9
[279117.747483]  [<ffffffff810114ce>] ? common_interrupt+0xe/0x13
[279117.747501]  [<ffffffffa01b45f4>] ? xfs_ioend_wait+0x84/0x9c [xfs]
[279117.747503]  [<ffffffff8106599e>] ? autoremove_wake_function+0x0/0x2e
[279117.747513]  [<ffffffffa0199460>] ? xfs_ilock_nowait+0x32/0x92 [xfs]
[279117.747520]  [<ffffffffa01bce36>] ? xfs_sync_inode_data+0x91/0xa8 [xfs]
[279117.747528]  [<ffffffffa01bcf61>] ? xfs_inode_ag_walk+0x92/0xef [xfs]
[279117.747535]  [<ffffffffa01bcda5>] ? xfs_sync_inode_data+0x0/0xa8 [xfs]
[279117.747542]  [<ffffffffa01bd02f>] ? xfs_inode_ag_iterator+0x71/0xb2 [xfs]
[279117.747549]  [<ffffffffa01bcda5>] ? xfs_sync_inode_data+0x0/0xa8 [xfs]
[279117.747557]  [<ffffffffa01bd2fe>] ? xfs_sync_data+0x20/0x42 [xfs]
[279117.747564]  [<ffffffffa01bd344>] ? xfs_flush_inodes_work+0x24/0x31 [xfs]
[279117.747571]  [<ffffffffa01bc924>] ? xfssyncd+0x150/0x1bb [xfs]
[279117.747578]  [<ffffffffa01bc7d4>] ? xfssyncd+0x0/0x1bb [xfs]
[279117.747580]  [<ffffffff810656d1>] ? kthread+0x79/0x81
[279117.747582]  [<ffffffff81011baa>] ? child_rip+0xa/0x20
[279117.747584]  [<ffffffff81065658>] ? kthread+0x0/0x81
[279117.747585]  [<ffffffff81011ba0>] ? child_rip+0x0/0x20





A bit of googling [1] suggests that perhaps the commit 17b3847 [2] could have fixed this


Also I can say that I have switched some weeks ago one of this servers to a vainilla/vserver 3.1 Kernel and since then I didn't saw this lockup anymore.




-----------------------------------------------------------------------------------------
[1] http://comments.gmane.org/gmane.comp.file-systems.xfs.general/41907
[2] https://git.kernel.org/linus/17b3847


-- 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Carlos Alberto Lopez Perez                           http://neutrino.es
Igalia - Free Software Engineering                http://www.igalia.com
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Attachment: signature.asc
Description: OpenPGP digital signature


--- End Message ---
--- Begin Message ---
Version: 3.1.8-2

Hi Carlos,

Carlos Alberto Lopez Perez wrote:

> Since the upgrade from lenny to squeeze I am getting in several
> servers random kernel lockups related to the XFS filesystem. The
> errors looks the same on all servers. All servers are running Debian
> kernel 2.6.32-39

Thanks for reporting this.

Are the hangs noticeable when using the system?  What is happening in
other tasks?  (It should be possible to tell by using sysrq + w when
in that state; you might have to enable it first using

	echo 1 >/proc/sys/kernel/sysrq
	# or
	echo 446 >/proc/sys/kernel/sysrq

or you can trigger it directly with "echo w >/proc/sysrq-trigger" if
able to run commands.  See Documentation/sysrq.txt in the kernel
source or the documentation directory of a linux-doc-<version> package
for details.)

[...]
> A bit of googling [1] suggests that perhaps the commit 17b3847 [2]
> could have fixed this

Hm.  Mike's example you pointed to is stuck in xfs_log_reserve while
most of your examples are stuck in xfs_reclaim_inode.  But if you'd
like to try it, you can test the patch below[*] against linux 2.6.32.y
(either from kernel.org or a squeeze kernel).  [3] explains how to
patch and build a custom kernel.

[...]
> Also I can say that I have switched some weeks ago one of this
> servers to a vainilla/vserver 3.1 Kernel and since then I didn't saw
> this lockup anymore.

Marking accordingly.

Please try 3.0.0-6 from http://snapshot.debian.org/ so we can see if
the fix was introduced in 3.1.  If that one does not produce the
lockup and you have time for it, it would also be useful to try some
kernels halfway between 2.6.32 and 3.0 to see when the fix was
introduced.

Thanks for a pleasant report, and hope that helps.

Sincerely,
Jonathan

[3] http://kernel-handbook.alioth.debian.org/ch-common-tasks.html
[*]

From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 11 Oct 2011 15:14:09 +0000
Subject: xfs: force the log if we encounter pinned buffers in .iop_pushbuf

commit 17b38471c3c07a49f0bbc2ecc2e92050c164e226 upstream.

We need to check for pinned buffers even in .iop_pushbuf given that inode
items flush into the same buffers that may be pinned directly due operations
on the unlinked inode list operating directly on buffers.  To do this add a
return value to .iop_pushbuf that tells the AIL push about this and use
the existing log force mechanisms to unpin it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Stefan Priebe <s.priebe@profihost.ag>
Tested-by: Stefan Priebe <s.priebe@profihost.ag>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
Only compile-tested.

 fs/xfs/quota/xfs_dquot_item.c |   11 +++++++----
 fs/xfs/xfs_inode_item.c       |   14 +++++++++-----
 fs/xfs/xfs_trans.h            |    2 +-
 fs/xfs/xfs_trans_ail.c        |    9 +++++++--
 4 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index d0d4a9a0bbd7..38cc06f94d47 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -205,7 +205,7 @@ xfs_qm_dqunpin_wait(
  * search the buffer cache can be a time consuming thing, and AIL lock is a
  * spinlock.
  */
-STATIC void
+STATIC bool
 xfs_qm_dquot_logitem_pushbuf(
 	xfs_dq_logitem_t    *qip)
 {
@@ -213,6 +213,7 @@ xfs_qm_dquot_logitem_pushbuf(
 	xfs_mount_t	*mp;
 	xfs_buf_t	*bp;
 	uint		dopush;
+	bool		ret = true;
 
 	dqp = qip->qli_dquot;
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
@@ -233,7 +234,7 @@ xfs_qm_dquot_logitem_pushbuf(
 	    ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
 		qip->qli_pushbuf_flag = 0;
 		xfs_dqunlock(dqp);
-		return;
+		return true;
 	}
 	mp = dqp->q_mount;
 	bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
@@ -249,6 +250,7 @@ xfs_qm_dquot_logitem_pushbuf(
 			if (XFS_BUF_ISPINNED(bp)) {
 				xfs_log_force(mp, (xfs_lsn_t)0,
 					      XFS_LOG_FORCE);
+				ret = false;
 			}
 			if (dopush) {
 				int	error;
@@ -269,11 +271,12 @@ xfs_qm_dquot_logitem_pushbuf(
 			xfs_dqunlock(dqp);
 			xfs_buf_relse(bp);
 		}
-		return;
+		return ret;
 	}
 
 	qip->qli_pushbuf_flag = 0;
 	xfs_dqunlock(dqp);
+	return true;
 }
 
 /*
@@ -402,7 +405,7 @@ static struct xfs_item_ops xfs_dquot_item_ops = {
 	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_qm_dquot_logitem_committed,
 	.iop_push	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push,
-	.iop_pushbuf	= (void(*)(xfs_log_item_t*))
+	.iop_pushbuf	= (bool(*)(xfs_log_item_t*))
 					xfs_qm_dquot_logitem_pushbuf,
 	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_qm_dquot_logitem_committing
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 9794b876d6ff..dd0c2db25a00 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -751,7 +751,7 @@ xfs_inode_item_committed(
  * We aren't holding the AIL lock (or the flush lock) when this gets called,
  * so it is inherently race-y.
  */
-STATIC void
+STATIC bool
 xfs_inode_item_pushbuf(
 	xfs_inode_log_item_t	*iip)
 {
@@ -759,6 +759,7 @@ xfs_inode_item_pushbuf(
 	xfs_mount_t	*mp;
 	xfs_buf_t	*bp;
 	uint		dopush;
+	bool		ret = true;
 
 	ip = iip->ili_inode;
 
@@ -779,7 +780,7 @@ xfs_inode_item_pushbuf(
 	    ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
 		iip->ili_pushbuf_flag = 0;
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-		return;
+		return true;
 	}
 
 	mp = ip->i_mount;
@@ -804,6 +805,7 @@ xfs_inode_item_pushbuf(
 			if (XFS_BUF_ISPINNED(bp)) {
 				xfs_log_force(mp, (xfs_lsn_t)0,
 					      XFS_LOG_FORCE);
+				ret = false;
 			}
 			if (dopush) {
 				int	error;
@@ -818,9 +820,11 @@ xfs_inode_item_pushbuf(
 		} else {
 			iip->ili_pushbuf_flag = 0;
 			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			if (XFS_BUF_ISPINNED(bp))
+				ret = false;
 			xfs_buf_relse(bp);
 		}
-		return;
+		return ret;
 	}
 	/*
 	 * We have to be careful about resetting pushbuf flag too early (above).
@@ -832,7 +836,7 @@ xfs_inode_item_pushbuf(
 	 */
 	iip->ili_pushbuf_flag = 0;
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-	return;
+	return true;
 }
 
 
@@ -903,7 +907,7 @@ static struct xfs_item_ops xfs_inode_item_ops = {
 	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_inode_item_committed,
 	.iop_push	= (void(*)(xfs_log_item_t*))xfs_inode_item_push,
-	.iop_pushbuf	= (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf,
+	.iop_pushbuf	= (bool(*)(xfs_log_item_t*))xfs_inode_item_pushbuf,
 	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_inode_item_committing
 };
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index a0574f593f52..e5282706fbd9 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -792,7 +792,7 @@ typedef struct xfs_item_ops {
 	void (*iop_unlock)(xfs_log_item_t *);
 	xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
 	void (*iop_push)(xfs_log_item_t *);
-	void (*iop_pushbuf)(xfs_log_item_t *);
+	bool (*iop_pushbuf)(xfs_log_item_t *);
 	void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
 } xfs_item_ops_t;
 
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2ffc570679be..43d9b8adaf31 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -306,8 +306,13 @@ xfsaild_push(
 
 		case XFS_ITEM_PUSHBUF:
 			XFS_STATS_INC(xs_push_ail_pushbuf);
-			IOP_PUSHBUF(lip);
-			last_pushed_lsn = lsn;
+
+			if (!IOP_PUSHBUF(lip)) {
+				stuck++;
+				flush_log = 1;
+			} else {
+				last_pushed_lsn = lsn;
+			}
 			break;
 
 		case XFS_ITEM_PINNED:
-- 
1.7.8.3



--- End Message ---

Reply to: