[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#536860: linux-image-2.6.26-2-sparc64: Kernel unaligned access at TPC[48bf10] __delayacct_add_tsk+0x48/0x15c



Hi,

Jonathan Nieder wrote:

> I wonder how this should be dealt with wrt squeeze.

For your amusement, here's an ugly proof-of-concept patch (against
2.6.32.y) that just does unaligned reads from struct taskstats.  The
only redeeming feature is that it doesn't break ABI.  I'd be curious
to hear whether it works and the effect on performance.

If it were to make sense to actually apply something like this, it
would only be on 64-bit arches that do not have efficient unaligned
access.
From: Jonathan Nieder <jrnieder@gmail.com>
Date: Mon, 15 Aug 2011 00:32:02 -0500
Subject: taskstats: use unaligned accesses to work around alignment issues

As upstream commit 4be2c95d1f77 explains, the taskstats struct is
internally aligned to 8-byte boundaries but because it is preceded by
two NLA headers (4 bytes each) and a pid (4 bytes) ends up being
misaligned when returned from mk_reply(), resulting in unaligned
accesses when it is passed around and populated after that.

A few options for fixing that:

 - go back to populating a taskstats struct on the stack or heap and
   copying it back.  Since the structure weighs in at 328 bytes, that
   is somewhat costly

 - change mk_reply to start the reply at an 8-byte boundary.  This is
   what is done upstream on the affected platforms, but unfortunately
   that breaks compatibility with iotop versions before 0.4.2.

 - use unaligned stores and loads on the taskstats struct returned
   from mk_reply.  That is what this patch does.

Intended for testing only, not for upstream application or application
to any public tree for that matter.  This patch would not have any
benefit unless on a 64-bit arch without efficient unaligned accesses.

Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
---
 kernel/delayacct.c |   54 ++++++++++++++++++++++++++++++++-------------------
 kernel/taskstats.c |   11 ++++++---
 kernel/tsacct.c    |   54 ++++++++++++++++++++++++++++-----------------------
 3 files changed, 71 insertions(+), 48 deletions(-)

diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ead9b610aa71..69f7e8fe7b53 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -19,6 +19,7 @@
 #include <linux/time.h>
 #include <linux/sysctl.h>
 #include <linux/delayacct.h>
+#include <asm/unaligned.h>
 
 int delayacct_on __read_mostly = 1;	/* Delay accounting turned on/off */
 struct kmem_cache *delayacct_cache;
@@ -97,6 +98,18 @@ void __delayacct_blkio_end(void)
 			&current->delays->blkio_count);
 }
 
+static void update_stat(s64 *stat, s64 updated)
+{
+	put_unaligned(updated < get_unaligned(stat) ? 0 : updated,
+			stat);
+}
+
+static void update_stat_unsigned(u64 *stat, u64 updated)
+{
+	put_unaligned(updated < get_unaligned(stat) ? 0 : updated,
+			stat);
+}
+
 int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 {
 	s64 tmp;
@@ -111,16 +124,15 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	if (!tsk->delays)
 		goto done;
 
-	tmp = (s64)d->cpu_run_real_total;
+	tmp = get_unaligned((s64 *) &d->cpu_run_real_total);
 	cputime_to_timespec(tsk->utime + tsk->stime, &ts);
 	tmp += timespec_to_ns(&ts);
-	d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
+	update_stat((s64 *) &d->cpu_run_real_total, tmp);
 
-	tmp = (s64)d->cpu_scaled_run_real_total;
+	tmp = get_unaligned((s64 *) &d->cpu_scaled_run_real_total);
 	cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts);
 	tmp += timespec_to_ns(&ts);
-	d->cpu_scaled_run_real_total =
-		(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
+	update_stat((s64 *) &d->cpu_scaled_run_real_total, tmp);
 
 	/*
 	 * No locking available for sched_info (and too expensive to add one)
@@ -130,27 +142,29 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	t2 = tsk->sched_info.run_delay;
 	t3 = tsk->se.sum_exec_runtime;
 
-	d->cpu_count += t1;
+	put_unaligned(get_unaligned(&d->cpu_count) + t1, &d->cpu_count);
 
-	tmp = (s64)d->cpu_delay_total + t2;
-	d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
+	tmp = get_unaligned((s64 *) &d->cpu_delay_total) + t2;
+	update_stat((s64 *) &d->cpu_delay_total, tmp);
 
-	tmp = (s64)d->cpu_run_virtual_total + t3;
-	d->cpu_run_virtual_total =
-		(tmp < (s64)d->cpu_run_virtual_total) ?	0 : tmp;
+	tmp = get_unaligned((s64 *) &d->cpu_run_virtual_total) + t3;
+	update_stat((s64 *) &d->cpu_run_virtual_total, tmp);
 
 	/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
 
 	spin_lock_irqsave(&tsk->delays->lock, flags);
-	tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
-	d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
-	tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
-	d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
-	tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
-	d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
-	d->blkio_count += tsk->delays->blkio_count;
-	d->swapin_count += tsk->delays->swapin_count;
-	d->freepages_count += tsk->delays->freepages_count;
+	tmp = get_unaligned(&d->blkio_delay_total) + tsk->delays->blkio_delay;
+	update_stat_unsigned(&d->blkio_delay_total, tmp);
+	tmp = get_unaligned(&d->swapin_delay_total) + tsk->delays->swapin_delay;
+	update_stat_unsigned(&d->swapin_delay_total, tmp);
+	tmp = get_unaligned(&d->freepages_delay_total) + tsk->delays->freepages_delay;
+	update_stat_unsigned(&d->freepages_delay_total, tmp);
+	put_unaligned(get_unaligned(&d->blkio_count) + tsk->delays->blkio_count,
+			&d->blkio_count);
+	put_unaligned(get_unaligned(&d->swapin_count) + tsk->delays->swapin_count,
+			&d->swapin_count);
+	put_unaligned(get_unaligned(&d->freepages_count) + tsk->delays->freepages_count,
+			&d->freepages_count);
 	spin_unlock_irqrestore(&tsk->delays->lock, flags);
 
 done:
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b080920409c6..79e78205728e 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -27,6 +27,7 @@
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <net/genetlink.h>
+#include <asm/unaligned.h>
 #include <asm/atomic.h>
 
 /*
@@ -204,8 +205,8 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
 
 	/* fill in basic acct fields */
 	stats->version = TASKSTATS_VERSION;
-	stats->nvcsw = tsk->nvcsw;
-	stats->nivcsw = tsk->nivcsw;
+	put_unaligned(tsk->nvcsw, &stats->nvcsw);
+	put_unaligned(tsk->nivcsw, &stats->nivcsw);
 	bacct_add_tsk(stats, tsk);
 
 	/* fill in extended acct fields */
@@ -252,8 +253,10 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
 		 */
 		delayacct_add_tsk(stats, tsk);
 
-		stats->nvcsw += tsk->nvcsw;
-		stats->nivcsw += tsk->nivcsw;
+		put_unaligned(get_unaligned(&stats->nvcsw) + tsk->nvcsw,
+				&stats->nvcsw);
+		put_unaligned(get_unaligned(&stats->nivcsw) + tsk->nivcsw,
+				&stats->nivcsw);
 	} while_each_thread(first, tsk);
 
 	unlock_task_sighand(first, &flags);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 00d59d048edf..a66e7c7e4275 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -21,6 +21,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/acct.h>
 #include <linux/jiffies.h>
+#include <asm/unaligned.h>
 
 /*
  * fill in basic accounting fields
@@ -39,7 +40,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 	/* rebase elapsed time to usec (should never be negative) */
 	ac_etime = timespec_to_ns(&ts);
 	do_div(ac_etime, NSEC_PER_USEC);
-	stats->ac_etime = ac_etime;
+	put_unaligned(ac_etime, &stats->ac_etime);
 	stats->ac_btime = get_seconds() - ts.tv_sec;
 	if (thread_group_leader(tsk)) {
 		stats->ac_exitcode = tsk->exit_code;
@@ -57,19 +58,21 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 	stats->ac_pid	 = tsk->pid;
 	rcu_read_lock();
 	tcred = __task_cred(tsk);
-	stats->ac_uid	 = tcred->uid;
+	put_unaligned(tcred->uid, &stats->ac_uid);
 	stats->ac_gid	 = tcred->gid;
 	stats->ac_ppid	 = pid_alive(tsk) ?
 				rcu_dereference(tsk->real_parent)->tgid : 0;
 	rcu_read_unlock();
-	stats->ac_utime	 = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
-	stats->ac_stime	 = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
-	stats->ac_utimescaled =
-		cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC;
-	stats->ac_stimescaled =
-		cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
-	stats->ac_minflt = tsk->min_flt;
-	stats->ac_majflt = tsk->maj_flt;
+	put_unaligned(cputime_to_msecs(tsk->utime) * USEC_PER_MSEC,
+			&stats->ac_utime);
+	put_unaligned(cputime_to_msecs(tsk->stime) * USEC_PER_MSEC,
+			&stats->ac_stime);
+	put_unaligned(cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC,
+			&stats->ac_utimescaled);
+	put_unaligned(cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC,
+			&stats->ac_stimescaled);
+	put_unaligned(tsk->min_flt, &stats->ac_minflt);
+	put_unaligned(tsk->maj_flt, &stats->ac_majflt);
 
 	strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
 }
@@ -87,27 +90,30 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 	struct mm_struct *mm;
 
 	/* convert pages-usec to Mbyte-usec */
-	stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
-	stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
+	put_unaligned(p->acct_rss_mem1 * PAGE_SIZE / MB, &stats->coremem);
+	put_unaligned(p->acct_vm_mem1 * PAGE_SIZE / MB, &stats->virtmem);
 	mm = get_task_mm(p);
 	if (mm) {
 		/* adjust to KB unit */
-		stats->hiwater_rss   = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB;
-		stats->hiwater_vm    = get_mm_hiwater_vm(mm)  * PAGE_SIZE / KB;
+		put_unaligned(get_mm_hiwater_rss(mm) * PAGE_SIZE / KB,
+				&stats->hiwater_rss);
+		put_unaligned(get_mm_hiwater_vm(mm)  * PAGE_SIZE / KB,
+				&stats->hiwater_vm);
 		mmput(mm);
 	}
-	stats->read_char	= p->ioac.rchar;
-	stats->write_char	= p->ioac.wchar;
-	stats->read_syscalls	= p->ioac.syscr;
-	stats->write_syscalls	= p->ioac.syscw;
+	put_unaligned(p->ioac.rchar, &stats->read_char);
+	put_unaligned(p->ioac.wchar, &stats->write_char);
+	put_unaligned(p->ioac.syscr, &stats->read_syscalls);
+	put_unaligned(p->ioac.syscw, &stats->write_syscalls);
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-	stats->read_bytes	= p->ioac.read_bytes;
-	stats->write_bytes	= p->ioac.write_bytes;
-	stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes;
+	put_unaligned(p->ioac.read_bytes, &stats->read_bytes);
+	put_unaligned(p->ioac.write_bytes, &stats->write_bytes);
+	put_unaligned(p->ioac.cancelled_write_bytes,
+				&stats->cancelled_write_bytes);
 #else
-	stats->read_bytes	= 0;
-	stats->write_bytes	= 0;
-	stats->cancelled_write_bytes = 0;
+	put_unaligned(0, &stats->read_bytes);
+	put_unaligned(0, &stats->write_bytes);
+	put_unaligned(0, &stats->cancelled_write_bytes);
 #endif
 }
 #undef KB
-- 
1.7.6


Reply to: