[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#642911: TX watchdog timeout on RTL8168B



Hi,

Gerd, Lucas, can you test the attached patch?  It works like this:

  # prerequisites
  apt-get install git build-essential
  kernelrepos=git://git.kernel.org/pub/scm/linux/kernel/git/

  # get the Linux kernel if you don't already have it
  git clone $kernelrepos/torvalds/linux.git

  # enter Linux kernel repository
  cd linux

  # fetch point releases
  git remote add -f stable $kernelrepos/stable/linux-stable.git

  # try 3.0.y
  git reset --hard
  git checkout stable/linux-3.0.y
  make localmodconfig
  make deb-pkg; # optionally with -j<num> for parallel build
  dpkg -i ../<name of package>
  reboot
  ... test test test ...; # hopefully it gives hangs or transmit timeouts

  # test with patch
  git am -3sc thepatch
  make deb-pkg; # maybe with -j4
  dpkg -i ../<name of package>
  reboot
  ... test test test ...

Results from testing the squeeze kernel following instructions from [1]
would also be interesting.

Thanks to Sioçnarf for hints.

Hopeful,
Jonathan

[1] http://kernel-handbook.alioth.debian.org/ch-common-tasks.html
or the corresponding page in the debian-kernel-handbook package
From: françois romieu <romieu@fr.zoreil.com>
Date: Sun, 4 Dec 2011 20:30:45 +0000
Subject: r8169: Rx FIFO overflow fixes.

commit 811fd3010cf512f2e23e6c4c912aad54516dc706 upstream.

Realtek has specified that the post 8168c gigabit chips and the post
8105e fast ethernet chips recover automatically from a Rx FIFO overflow.
The driver does not need to clear the RxFIFOOver bit of IntrStatus and
it should rather avoid messing it.

The implementation deserves some explanation:
1. events outside of the intr_event bit mask are now ignored. It enforces
   a no-processing policy for the events that either should not be there
   or should be ignored.

2. RxFIFOOver was already ignored in rtl_cfg_infos[RTL_CFG_1] for the
   whole 8168 line of chips with two exceptions:
   - RTL_GIGA_MAC_VER_22 since b5ba6d12bdac21bc0620a5089e0f24e362645efd
     ("use RxFIFO overflow workaround for 8168c chipset.").
     This one should now be correctly handled.
   - RTL_GIGA_MAC_VER_11 (8168b) which requires a different Rx FIFO
     overflow processing.

   Though it does not conform to Realtek suggestion above, the updated
   driver includes no change for RTL_GIGA_MAC_VER_12 and RTL_GIGA_MAC_VER_17.
   Both are 8168b. RTL_GIGA_MAC_VER_12 is common and a bit old so I'd rather
   wait for experimental evidence that the change suggested by Realtek really
   helps or does not hurt in unexpected ways.

   Removed case statements in rtl8169_interrupt are only 8168 relevant.

3. RxFIFOOver is masked for post 8105e 810x chips, namely the sole 8105e
   (RTL_GIGA_MAC_VER_30) itself.

[jrnieder@gmail.com: As f60ac8e7ab7c ("r8169: prevent RxFIFO induced
 loops in the irq handler") explains, while the RxFIFOOver
 interruption is masked for most 8168, nothing prevents it from
 appearing in the irq status word.  This patch enforces a
 no-processing policy for such events by forgetting them before we
 have a chance to consider acting and by not acking them (which tends
 to have unpleasant results like making the chip lock up).

 To keep the patch minimal, but at a cost of leaving behind some
 confusing dead code, this backport leaves out the "remove redundant
 case statements" part.]

Signed-off-by: Francois Romieu <romieu@fr.zoreil.com>
Cc: hayeswang <hayeswang@realtek.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
Cc: Ivan Vecera <ivecera@redhat.com>
---
 drivers/net/r8169.c |   36 ++++++++++++++++++++++++------------
 1 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 5f838ef92494..cc57cb91e756 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -1043,17 +1043,21 @@ static u8 rtl8168d_efuse_read(void __iomem *ioaddr, int reg_addr)
 	return value;
 }
 
-static void rtl8169_irq_mask_and_ack(void __iomem *ioaddr)
+static void rtl8169_irq_mask_and_ack(struct rtl8169_private *tp)
 {
-	RTL_W16(IntrMask, 0x0000);
+	void __iomem *ioaddr = tp->mmio_addr;
 
-	RTL_W16(IntrStatus, 0xffff);
+	RTL_W16(IntrMask, 0x0000);
+	RTL_W16(IntrStatus, tp->intr_event);
+	RTL_R8(ChipCmd);
 }
 
-static void rtl8169_asic_down(void __iomem *ioaddr)
+static void rtl8169_asic_down(struct rtl8169_private *tp)
 {
+	void __iomem *ioaddr = tp->mmio_addr;
+
 	RTL_W8(ChipCmd, 0x00);
-	rtl8169_irq_mask_and_ack(ioaddr);
+	rtl8169_irq_mask_and_ack(tp);
 	RTL_R16(CPlusCmd);
 }
 
@@ -3611,7 +3615,7 @@ static void rtl8169_hw_reset(struct rtl8169_private *tp)
 	void __iomem *ioaddr = tp->mmio_addr;
 
 	/* Disable interrupts */
-	rtl8169_irq_mask_and_ack(ioaddr);
+	rtl8169_irq_mask_and_ack(tp);
 
 	if (tp->mac_version == RTL_GIGA_MAC_VER_27 ||
 	    tp->mac_version == RTL_GIGA_MAC_VER_28 ||
@@ -4093,8 +4097,7 @@ static void rtl_hw_start_8168(struct net_device *dev)
 	RTL_W16(IntrMitigate, 0x5151);
 
 	/* Work around for RxFIFO overflow. */
-	if (tp->mac_version == RTL_GIGA_MAC_VER_11 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_22) {
+	if (tp->mac_version == RTL_GIGA_MAC_VER_11) {
 		tp->intr_event |= RxFIFOOver | PCSTimeout;
 		tp->intr_event &= ~RxOverflow;
 	}
@@ -4276,6 +4279,11 @@ static void rtl_hw_start_8101(struct net_device *dev)
 	void __iomem *ioaddr = tp->mmio_addr;
 	struct pci_dev *pdev = tp->pci_dev;
 
+	if (tp->mac_version >= RTL_GIGA_MAC_VER_30) {
+		tp->intr_event &= ~RxFIFOOver;
+		tp->napi_event &= ~RxFIFOOver;
+	}
+
 	if (tp->mac_version == RTL_GIGA_MAC_VER_13 ||
 	    tp->mac_version == RTL_GIGA_MAC_VER_16) {
 		int cap = tp->pcie_cap;
@@ -4539,7 +4547,7 @@ static void rtl8169_wait_for_quiescence(struct net_device *dev)
 	/* Wait for any pending NAPI task to complete */
 	napi_disable(&tp->napi);
 
-	rtl8169_irq_mask_and_ack(ioaddr);
+	rtl8169_irq_mask_and_ack(tp);
 
 	tp->intr_mask = 0xffff;
 	RTL_W16(IntrMask, tp->intr_event);
@@ -5001,13 +5009,17 @@ static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance)
 	 */
 	status = RTL_R16(IntrStatus);
 	while (status && status != 0xffff) {
+		status &= tp->intr_event;
+		if (!status)
+			break;
+
 		handled = 1;
 
 		/* Handle all of the error cases first. These will reset
 		 * the chip, so just exit the loop.
 		 */
 		if (unlikely(!netif_running(dev))) {
-			rtl8169_asic_down(ioaddr);
+			rtl8169_asic_down(tp);
 			break;
 		}
 
@@ -5130,7 +5142,7 @@ static void rtl8169_down(struct net_device *dev)
 
 	spin_lock_irq(&tp->lock);
 
-	rtl8169_asic_down(ioaddr);
+	rtl8169_asic_down(tp);
 	/*
 	 * At this point device interrupts can not be enabled in any function,
 	 * as netif_running is not true (rtl8169_interrupt, rtl8169_reset_task,
@@ -5384,7 +5396,7 @@ static void rtl_shutdown(struct pci_dev *pdev)
 
 	spin_lock_irq(&tp->lock);
 
-	rtl8169_asic_down(ioaddr);
+	rtl8169_asic_down(tp);
 
 	spin_unlock_irq(&tp->lock);
 
-- 
1.7.9.1


Reply to: