[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#662348: i915: please save more useful debugging info when GPU locks up



Jonathan Nieder wrote:

> Ben, would the attached patches make sense for squeeze?  (Warning:
> untested.  I don't have a machine using Intel graphics.)

Attached this time.
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 18 Feb 2010 10:24:56 +0000
Subject: drm/i915: Record batch buffer following GPU error

commit 9df30794f609d9412f14cfd0eb7b45dd64d0b14e upstream.

In order to improve our diagnostic capabilities following a GPU hang
and subsequent reset, we need to record the batch buffer that triggered
the error. We assume that the current batch buffer, plus a few details
about what else is on the active list, will be sufficient -- at the very
least an improvement over nothing.

The extra information is stored in /debug/dri/.../i915_error_state
following an error, and may be decoded using
intel_gpu_tools/tools/intel_error_decode.

v2: Avoid excessive work under spinlocks.
v3: Include ringbuffer for later analysis.
v4: Use kunmap correctly and record more buffer state.
v5: Search ringbuffer for current batch buffer
v6: Use a work fn for the impossible IRQ error case.
v7: Avoid non-atomic paths whilst in IRQ context.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Eric Anholt <eric@anholt.net>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c |   85 +++++++++++++
 drivers/gpu/drm/i915/i915_dma.c     |    2 +
 drivers/gpu/drm/i915/i915_drv.h     |   21 ++++
 drivers/gpu/drm/i915/i915_irq.c     |  224 +++++++++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_reg.h     |    1 +
 5 files changed, 326 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 1372796497c0..8b427c005766 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -350,6 +350,36 @@ static int i915_ringbuffer_info(struct seq_file *m, void *data)
 	return 0;
 }
 
+static const char *pin_flag(int pinned)
+{
+	if (pinned > 0)
+		return " P";
+	else if (pinned < 0)
+		return " p";
+	else
+		return "";
+}
+
+static const char *tiling_flag(int tiling)
+{
+	switch (tiling) {
+	default:
+	case I915_TILING_NONE: return "";
+	case I915_TILING_X: return " X";
+	case I915_TILING_Y: return " Y";
+	}
+}
+
+static const char *dirty_flag(int dirty)
+{
+	return dirty ? " dirty" : "";
+}
+
+static const char *purgeable_flag(int purgeable)
+{
+	return purgeable ? " purgeable" : "";
+}
+
 static int i915_error_state(struct seq_file *m, void *unused)
 {
 	struct drm_info_node *node = (struct drm_info_node *) m->private;
@@ -357,6 +387,7 @@ static int i915_error_state(struct seq_file *m, void *unused)
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct drm_i915_error_state *error;
 	unsigned long flags;
+	int i, page, offset, elt;
 
 	spin_lock_irqsave(&dev_priv->error_lock, flags);
 	if (!dev_priv->first_error) {
@@ -368,6 +399,7 @@ static int i915_error_state(struct seq_file *m, void *unused)
 
 	seq_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
 		   error->time.tv_usec);
+	seq_printf(m, "PCI ID: 0x%04x\n", dev->pci_device);
 	seq_printf(m, "EIR: 0x%08x\n", error->eir);
 	seq_printf(m, "  PGTBL_ER: 0x%08x\n", error->pgtbl_er);
 	seq_printf(m, "  INSTPM: 0x%08x\n", error->instpm);
@@ -379,6 +411,59 @@ static int i915_error_state(struct seq_file *m, void *unused)
 		seq_printf(m, "  INSTPS: 0x%08x\n", error->instps);
 		seq_printf(m, "  INSTDONE1: 0x%08x\n", error->instdone1);
 	}
+	seq_printf(m, "seqno: 0x%08x\n", error->seqno);
+
+	if (error->active_bo_count) {
+		seq_printf(m, "Buffers [%d]:\n", error->active_bo_count);
+
+		for (i = 0; i < error->active_bo_count; i++) {
+			seq_printf(m, "  %08x %8zd %08x %08x %08x%s%s%s%s",
+				   error->active_bo[i].gtt_offset,
+				   error->active_bo[i].size,
+				   error->active_bo[i].read_domains,
+				   error->active_bo[i].write_domain,
+				   error->active_bo[i].seqno,
+				   pin_flag(error->active_bo[i].pinned),
+				   tiling_flag(error->active_bo[i].tiling),
+				   dirty_flag(error->active_bo[i].dirty),
+				   purgeable_flag(error->active_bo[i].purgeable));
+
+			if (error->active_bo[i].name)
+				seq_printf(m, " (name: %d)", error->active_bo[i].name);
+			if (error->active_bo[i].fence_reg != I915_FENCE_REG_NONE)
+				seq_printf(m, " (fence: %d)", error->active_bo[i].fence_reg);
+
+			seq_printf(m, "\n");
+		}
+	}
+
+	for (i = 0; i < ARRAY_SIZE(error->batchbuffer); i++) {
+		if (error->batchbuffer[i]) {
+			struct drm_i915_error_object *obj = error->batchbuffer[i];
+
+			seq_printf(m, "--- gtt_offset = 0x%08x\n", obj->gtt_offset);
+			offset = 0;
+			for (page = 0; page < obj->page_count; page++) {
+				for (elt = 0; elt < PAGE_SIZE/4; elt++) {
+					seq_printf(m, "%08x :  %08x\n", offset, obj->pages[page][elt]);
+					offset += 4;
+				}
+			}
+		}
+	}
+
+	if (error->ringbuffer) {
+		struct drm_i915_error_object *obj = error->ringbuffer;
+
+		seq_printf(m, "--- ringbuffer = 0x%08x\n", obj->gtt_offset);
+		offset = 0;
+		for (page = 0; page < obj->page_count; page++) {
+			for (elt = 0; elt < PAGE_SIZE/4; elt++) {
+				seq_printf(m, "%08x :  %08x\n", offset, obj->pages[page][elt]);
+				offset += 4;
+			}
+		}
+	}
 
 out:
 	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index be27accf9792..143fb17052bd 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1554,6 +1554,8 @@ int i915_driver_unload(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
+	i915_destroy_error_state(dev);
+
 	destroy_workqueue(dev_priv->wq);
 	del_timer_sync(&dev_priv->hangcheck_timer);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e0acd0023c23..b41644d7e78a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -152,7 +152,27 @@ struct drm_i915_error_state {
 	u32 instps;
 	u32 instdone1;
 	u32 seqno;
+	u64 bbaddr;
 	struct timeval time;
+	struct drm_i915_error_object {
+		int page_count;
+		u32 gtt_offset;
+		u32 *pages[0];
+	} *ringbuffer, *batchbuffer[2];
+	struct drm_i915_error_buffer {
+		size_t size;
+		u32 name;
+		u32 seqno;
+		u32 gtt_offset;
+		u32 read_domains;
+		u32 write_domain;
+		u32 fence_reg;
+		s32 pinned:2;
+		u32 tiling:2;
+		u32 dirty:1;
+		u32 purgeable:1;
+	} *active_bo;
+	u32 active_bo_count;
 };
 
 struct drm_i915_display_funcs {
@@ -770,6 +790,7 @@ extern int i965_reset(struct drm_device *dev, u8 flags);
 
 /* i915_irq.c */
 void i915_hangcheck_elapsed(unsigned long data);
+void i915_destroy_error_state(struct drm_device *dev);
 extern int i915_irq_emit(struct drm_device *dev, void *data,
 			 struct drm_file *file_priv);
 extern int i915_irq_wait(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 345b60f86b52..5a115b69f4be 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -376,6 +376,121 @@ static void i915_error_work_func(struct work_struct *work)
 	}
 }
 
+static struct drm_i915_error_object *
+i915_error_object_create(struct drm_device *dev,
+			 struct drm_gem_object *src)
+{
+	struct drm_i915_error_object *dst;
+	struct drm_i915_gem_object *src_priv;
+	int page, page_count;
+
+	if (src == NULL)
+		return NULL;
+
+	src_priv = src->driver_private;
+	if (src_priv->pages == NULL)
+		return NULL;
+
+	page_count = src->size / PAGE_SIZE;
+
+	dst = kmalloc(sizeof(*dst) + page_count * sizeof (u32 *), GFP_ATOMIC);
+	if (dst == NULL)
+		return NULL;
+
+	for (page = 0; page < page_count; page++) {
+		void *s, *d = kmalloc(PAGE_SIZE, GFP_ATOMIC);
+		if (d == NULL)
+			goto unwind;
+		s = kmap_atomic(src_priv->pages[page], KM_USER0);
+		memcpy(d, s, PAGE_SIZE);
+		kunmap_atomic(s, KM_USER0);
+		dst->pages[page] = d;
+	}
+	dst->page_count = page_count;
+	dst->gtt_offset = src_priv->gtt_offset;
+
+	return dst;
+
+unwind:
+	while (page--)
+		kfree(dst->pages[page]);
+	kfree(dst);
+	return NULL;
+}
+
+static void
+i915_error_object_free(struct drm_i915_error_object *obj)
+{
+	int page;
+
+	if (obj == NULL)
+		return;
+
+	for (page = 0; page < obj->page_count; page++)
+		kfree(obj->pages[page]);
+
+	kfree(obj);
+}
+
+static void
+i915_error_state_free(struct drm_device *dev,
+		      struct drm_i915_error_state *error)
+{
+	i915_error_object_free(error->batchbuffer[0]);
+	i915_error_object_free(error->batchbuffer[1]);
+	i915_error_object_free(error->ringbuffer);
+	kfree(error->active_bo);
+	kfree(error);
+}
+
+static u32
+i915_get_bbaddr(struct drm_device *dev, u32 *ring)
+{
+	u32 cmd;
+
+	if (IS_I830(dev) || IS_845G(dev))
+		cmd = MI_BATCH_BUFFER;
+	else if (IS_I965G(dev))
+		cmd = (MI_BATCH_BUFFER_START | (2 << 6) |
+		       MI_BATCH_NON_SECURE_I965);
+	else
+		cmd = (MI_BATCH_BUFFER_START | (2 << 6));
+
+	return ring[0] == cmd ? ring[1] : 0;
+}
+
+static u32
+i915_ringbuffer_last_batch(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	u32 head, bbaddr;
+	u32 *ring;
+
+	/* Locate the current position in the ringbuffer and walk back
+	 * to find the most recently dispatched batch buffer.
+	 */
+	bbaddr = 0;
+	head = I915_READ(PRB0_HEAD) & HEAD_ADDR;
+	ring = (u32 *)(dev_priv->ring.virtual_start + head);
+
+	while (--ring >= (u32 *)dev_priv->ring.virtual_start) {
+		bbaddr = i915_get_bbaddr(dev, ring);
+		if (bbaddr)
+			break;
+	}
+
+	if (bbaddr == 0) {
+		ring = (u32 *)(dev_priv->ring.virtual_start + dev_priv->ring.Size);
+		while (--ring >= (u32 *)dev_priv->ring.virtual_start) {
+			bbaddr = i915_get_bbaddr(dev, ring);
+			if (bbaddr)
+				break;
+		}
+	}
+
+	return bbaddr;
+}
+
 /**
  * i915_capture_error_state - capture an error record for later analysis
  * @dev: drm device
@@ -388,19 +503,26 @@ static void i915_error_work_func(struct work_struct *work)
 static void i915_capture_error_state(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_i915_gem_object *obj_priv;
 	struct drm_i915_error_state *error;
+	struct drm_gem_object *batchbuffer[2];
 	unsigned long flags;
+	u32 bbaddr;
+	int count;
 
 	spin_lock_irqsave(&dev_priv->error_lock, flags);
-	if (dev_priv->first_error)
-		goto out;
+	error = dev_priv->first_error;
+	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
+	if (error)
+		return;
 
 	error = kmalloc(sizeof(*error), GFP_ATOMIC);
 	if (!error) {
-		DRM_DEBUG_DRIVER("out ot memory, not capturing error state\n");
-		goto out;
+		DRM_DEBUG_DRIVER("out of memory, not capturing error state\n");
+		return;
 	}
 
+	error->seqno = i915_get_gem_seqno(dev);
 	error->eir = I915_READ(EIR);
 	error->pgtbl_er = I915_READ(PGTBL_ER);
 	error->pipeastat = I915_READ(PIPEASTAT);
@@ -411,6 +533,7 @@ static void i915_capture_error_state(struct drm_device *dev)
 		error->ipehr = I915_READ(IPEHR);
 		error->instdone = I915_READ(INSTDONE);
 		error->acthd = I915_READ(ACTHD);
+		error->bbaddr = 0;
 	} else {
 		error->ipeir = I915_READ(IPEIR_I965);
 		error->ipehr = I915_READ(IPEHR_I965);
@@ -418,14 +541,101 @@ static void i915_capture_error_state(struct drm_device *dev)
 		error->instps = I915_READ(INSTPS);
 		error->instdone1 = I915_READ(INSTDONE1);
 		error->acthd = I915_READ(ACTHD_I965);
+		error->bbaddr = I915_READ64(BB_ADDR);
+	}
+
+	bbaddr = i915_ringbuffer_last_batch(dev);
+
+	/* Grab the current batchbuffer, most likely to have crashed. */
+	batchbuffer[0] = NULL;
+	batchbuffer[1] = NULL;
+	count = 0;
+	list_for_each_entry(obj_priv, &dev_priv->mm.active_list, list) {
+		struct drm_gem_object *obj = obj_priv->obj;
+
+		if (batchbuffer[0] == NULL &&
+		    bbaddr >= obj_priv->gtt_offset &&
+		    bbaddr < obj_priv->gtt_offset + obj->size)
+			batchbuffer[0] = obj;
+
+		if (batchbuffer[1] == NULL &&
+		    error->acthd >= obj_priv->gtt_offset &&
+		    error->acthd < obj_priv->gtt_offset + obj->size &&
+		    batchbuffer[0] != obj)
+			batchbuffer[1] = obj;
+
+		count++;
+	}
+
+	/* We need to copy these to an anonymous buffer as the simplest
+	 * method to avoid being overwritten by userpace.
+	 */
+	error->batchbuffer[0] = i915_error_object_create(dev, batchbuffer[0]);
+	error->batchbuffer[1] = i915_error_object_create(dev, batchbuffer[1]);
+
+	/* Record the ringbuffer */
+	error->ringbuffer = i915_error_object_create(dev, dev_priv->ring.ring_obj);
+
+	/* Record buffers on the active list. */
+	error->active_bo = NULL;
+	error->active_bo_count = 0;
+
+	if (count)
+		error->active_bo = kmalloc(sizeof(*error->active_bo)*count,
+					   GFP_ATOMIC);
+
+	if (error->active_bo) {
+		int i = 0;
+		list_for_each_entry(obj_priv, &dev_priv->mm.active_list, list) {
+			struct drm_gem_object *obj = obj_priv->obj;
+
+			error->active_bo[i].size = obj->size;
+			error->active_bo[i].name = obj->name;
+			error->active_bo[i].seqno = obj_priv->last_rendering_seqno;
+			error->active_bo[i].gtt_offset = obj_priv->gtt_offset;
+			error->active_bo[i].read_domains = obj->read_domains;
+			error->active_bo[i].write_domain = obj->write_domain;
+			error->active_bo[i].fence_reg = obj_priv->fence_reg;
+			error->active_bo[i].pinned = 0;
+			if (obj_priv->pin_count > 0)
+				error->active_bo[i].pinned = 1;
+			if (obj_priv->user_pin_count > 0)
+				error->active_bo[i].pinned = -1;
+			error->active_bo[i].tiling = obj_priv->tiling_mode;
+			error->active_bo[i].dirty = obj_priv->dirty;
+			error->active_bo[i].purgeable = obj_priv->madv != I915_MADV_WILLNEED;
+
+			if (++i == count)
+				break;
+		}
+		error->active_bo_count = i;
 	}
 
 	do_gettimeofday(&error->time);
 
-	dev_priv->first_error = error;
-
-out:
+	spin_lock_irqsave(&dev_priv->error_lock, flags);
+	if (dev_priv->first_error == NULL) {
+		dev_priv->first_error = error;
+		error = NULL;
+	}
 	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
+
+	if (error)
+		i915_error_state_free(dev, error);
+}
+
+void i915_destroy_error_state(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_i915_error_state *error;
+
+	spin_lock(&dev_priv->error_lock);
+	error = dev_priv->first_error;
+	dev_priv->first_error = NULL;
+	spin_unlock(&dev_priv->error_lock);
+
+	if (error)
+		i915_error_state_free(dev, error);
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index b463a0baa924..b2eafa2eb6bf 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -399,6 +399,7 @@
 #define   CM0_COLOR_EVICT_DISABLE (1<<3)
 #define   CM0_DEPTH_WRITE_DISABLE (1<<1)
 #define   CM0_RC_OP_FLUSH_DISABLE (1<<0)
+#define BB_ADDR		0x02140 /* 8 bytes */
 #define GFX_FLSH_CNTL	0x02170 /* 915+ only */
 #define ECOSKPD		0x021d0
 #define   ECO_GATING_CX_ONLY	(1<<3)
-- 
1.7.9.2

From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 27 May 2010 13:18:12 +0100
Subject: drm/i915: Only print an message if there was an error

commit 35aed2e6be2feaa227fe5c7a0b7c286c4fe71592 upstream.

Only report an error if the GPU has actually detected one, otherwise we
are just hung.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Eric Anholt <eric@anholt.net>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
---
 drivers/gpu/drm/i915/i915_irq.c |   38 ++++++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 5a115b69f4be..2dc066550785 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -638,24 +638,13 @@ void i915_destroy_error_state(struct drm_device *dev)
 		i915_error_state_free(dev, error);
 }
 
-/**
- * i915_handle_error - handle an error interrupt
- * @dev: drm device
- *
- * Do some basic checking of regsiter state at error interrupt time and
- * dump it to the syslog.  Also call i915_capture_error_state() to make
- * sure we get a record and make it available in debugfs.  Fire a uevent
- * so userspace knows something bad happened (should trigger collection
- * of a ring dump etc.).
- */
-static void i915_handle_error(struct drm_device *dev, bool wedged)
+static void i915_report_and_clear_eir(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 eir = I915_READ(EIR);
-	u32 pipea_stats = I915_READ(PIPEASTAT);
-	u32 pipeb_stats = I915_READ(PIPEBSTAT);
 
-	i915_capture_error_state(dev);
+	if (!eir)
+		return;
 
 	printk(KERN_ERR "render error detected, EIR: 0x%08x\n",
 	       eir);
@@ -701,6 +690,9 @@ static void i915_handle_error(struct drm_device *dev, bool wedged)
 	}
 
 	if (eir & I915_ERROR_MEMORY_REFRESH) {
+		u32 pipea_stats = I915_READ(PIPEASTAT);
+		u32 pipeb_stats = I915_READ(PIPEBSTAT);
+
 		printk(KERN_ERR "memory refresh error\n");
 		printk(KERN_ERR "PIPEASTAT: 0x%08x\n",
 		       pipea_stats);
@@ -757,6 +749,24 @@ static void i915_handle_error(struct drm_device *dev, bool wedged)
 		I915_WRITE(EMR, I915_READ(EMR) | eir);
 		I915_WRITE(IIR, I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT);
 	}
+}
+
+/**
+ * i915_handle_error - handle an error interrupt
+ * @dev: drm device
+ *
+ * Do some basic checking of regsiter state at error interrupt time and
+ * dump it to the syslog.  Also call i915_capture_error_state() to make
+ * sure we get a record and make it available in debugfs.  Fire a uevent
+ * so userspace knows something bad happened (should trigger collection
+ * of a ring dump etc.).
+ */
+static void i915_handle_error(struct drm_device *dev, bool wedged)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	i915_capture_error_state(dev);
+	i915_report_and_clear_eir(dev);
 
 	if (wedged) {
 		atomic_set(&dev_priv->mm.wedged, 1);
-- 
1.7.9.2

From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Fri, 20 Aug 2010 18:18:48 +0200
Subject: drm/i915: unload: fix error_work races

commit bc0c7f14432f7f94b16f972f2d23b8c1248249b4 upstream.

This is the first patch to clean up module unload races due to
outstanding timers/work. Preparatory step: Thou shalt not destroy
the workqueue when new work might still get enqued.

Now error_work gets queued by the hangcheck timer and only (atomically)
reads the chip wedged status. So cancel it right after the hangcheck
timer is killed. But the hangcheck is armed by interrupts, so move
everything after irqs are disabled.

Also change a del_timer to a del_timer_sync in the ums gem code, the
hangcheck timer is self-rearming.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
---
 drivers/gpu/drm/i915/i915_dma.c |    8 +++++---
 drivers/gpu/drm/i915/i915_gem.c |    2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 143fb17052bd..27071e87af53 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1556,9 +1556,6 @@ int i915_driver_unload(struct drm_device *dev)
 
 	i915_destroy_error_state(dev);
 
-	destroy_workqueue(dev_priv->wq);
-	del_timer_sync(&dev_priv->hangcheck_timer);
-
 	io_mapping_free(dev_priv->mm.gtt_mapping);
 	if (dev_priv->mm.gtt_mtrr >= 0) {
 		mtrr_del(dev_priv->mm.gtt_mtrr, dev->agp->base,
@@ -1580,6 +1577,9 @@ int i915_driver_unload(struct drm_device *dev)
 		vga_client_register(dev->pdev, NULL, NULL, NULL);
 	}
 
+	del_timer_sync(&dev_priv->hangcheck_timer);
+	cancel_work_sync(&dev_priv->error_work);
+
 	if (dev->pdev->msi_enabled)
 		pci_disable_msi(dev->pdev);
 
@@ -1605,6 +1605,8 @@ int i915_driver_unload(struct drm_device *dev)
 			i915_free_hws(dev);
 	}
 
+	destroy_workqueue(dev_priv->wq);
+
 	pci_dev_put(dev_priv->bridge_dev);
 	kfree(dev->dev_private);
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 0314f7fd2505..02e2478a4f5a 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4373,7 +4373,7 @@ i915_gem_idle(struct drm_device *dev)
 	 * We need to replace this with a semaphore, or something.
 	 */
 	dev_priv->mm.suspended = 1;
-	del_timer(&dev_priv->hangcheck_timer);
+	del_timer_sync(&dev_priv->hangcheck_timer);
 
 	/* Cancel the retire work handler, wait for it to finish if running
 	 */
-- 
1.7.9.2

From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Fri, 20 Aug 2010 21:25:11 +0200
Subject: drm/i915: unload: don't leak error state

commit a8b4899e4658e53c0c8f4206af105e358e39ee93 upstream.

With kms, interrupts now get disabled in the modesetting cleanup. So
free the error state afterwards, it currently gets allocated in
the interrupt handler.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
---
 drivers/gpu/drm/i915/i915_dma.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 27071e87af53..5ba0397bb421 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1554,8 +1554,6 @@ int i915_driver_unload(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
-	i915_destroy_error_state(dev);
-
 	io_mapping_free(dev_priv->mm.gtt_mapping);
 	if (dev_priv->mm.gtt_mtrr >= 0) {
 		mtrr_del(dev_priv->mm.gtt_mtrr, dev->agp->base,
@@ -1577,8 +1575,10 @@ int i915_driver_unload(struct drm_device *dev)
 		vga_client_register(dev->pdev, NULL, NULL, NULL);
 	}
 
+	/* Free error state after interrupts are fully disabled. */
 	del_timer_sync(&dev_priv->hangcheck_timer);
 	cancel_work_sync(&dev_priv->error_work);
+	i915_destroy_error_state(dev);
 
 	if (dev->pdev->msi_enabled)
 		pci_disable_msi(dev->pdev);
-- 
1.7.9.2

From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 1 Oct 2010 13:23:27 +0100
Subject: drm/i915: Only print 'generating error event' if we actually are

commit 2fa772f34042cd4ddfb4ffaf5c24f0ce8c1025e9 upstream.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
---
 drivers/gpu/drm/i915/i915_irq.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 2dc066550785..12c91996af04 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -359,7 +359,6 @@ static void i915_error_work_func(struct work_struct *work)
 	char *reset_event[] = { "RESET=1", NULL };
 	char *reset_done_event[] = { "ERROR=0", NULL };
 
-	DRM_DEBUG_DRIVER("generating error event\n");
 	kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, error_event);
 
 	if (atomic_read(&dev_priv->mm.wedged)) {
@@ -522,6 +521,8 @@ static void i915_capture_error_state(struct drm_device *dev)
 		return;
 	}
 
+	DRM_DEBUG_DRIVER("generating error event\n");
+
 	error->seqno = i915_get_gem_seqno(dev);
 	error->eir = I915_READ(EIR);
 	error->pgtbl_er = I915_READ(PGTBL_ER);
-- 
1.7.9.2

From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 1 Feb 2011 14:15:55 +0000
Subject: drm/i915: Include 'i915_error_state' hint for when the GPU catches fire

commit b6f7833b9712c0c01f94cc3a518dc62b07bd610b upstream.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
---
 drivers/gpu/drm/i915/i915_irq.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 12c91996af04..c910c8b99b04 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -521,7 +521,8 @@ static void i915_capture_error_state(struct drm_device *dev)
 		return;
 	}
 
-	DRM_DEBUG_DRIVER("generating error event\n");
+	DRM_INFO("capturing error event; look for more information in /debug/dri/%d/i915_error_state\n",
+		 dev->primary->index);
 
 	error->seqno = i915_get_gem_seqno(dev);
 	error->eir = I915_READ(EIR);
-- 
1.7.9.2


Reply to: