[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#695182: linux-image-3.2.0-4-686-pae: Write couple of 1GB files for OOM crash



Seems to me that the bug is in function
  bdi_position_ratio()
within file
  mm/page-writeback.c
The internal variable is declared as
  long long pos_ratio;
and its calculation overflows. Maybe changing the declaration to
u64 would help. But also, pos_ratio is returned without any bounds
check, even though the function's return type is unsigned long.

I do not yet understand what bdi_position_ratio() is meant to do, so
cannot yet offer patches.

---

What I did:

I added many lines like
  BUG_ON(pos_ratio<0);
to the kernel sources. Running that kernel and creating my files with
  n=0; while [ $n -lt 99 ]; do dd bs=1M count=1024 if=/dev/zero of=x$n; (( n = $n + 1 )); done &
I got, after about 15 files had been created:
/bin/bash: line 1:  2755 Segmentation fault      dd bs=1M count=1024 if=/dev/zero of=x$n
Message from syslogd@zeno at Sat Dec 15 19:46:37 2012 ...
zeno kernel: ------------[ cut here ]------------
zeno kernel: invalid opcode: 0000 [#1] SMP 
...
and in the logs:

Dec 15 19:46:37 zeno kernel: ------------[ cut here ]------------
Dec 15 19:46:37 zeno kernel: kernel BUG at mm/page-writeback.c:569!
Dec 15 19:46:37 zeno kernel: invalid opcode: 0000 [#1] SMP 
Dec 15 19:46:37 zeno kernel: Modules linked in: nfsd exportfs quota_v2 quota_tree fuse joydev usb_storage coretemp crc32c_intel aesni_intel sg cryptd sr_mod aes_i586 aes_generic 8250_pnp evdev i2c_i801 8250 serial_core processor thermal_sys button
Dec 15 19:46:37 zeno kernel: 
Dec 15 19:46:37 zeno kernel: Pid: 2755, comm: dd Not tainted 3.2.32-pk06.08-i386t02 #1 Supermicro X9DR3-F/X9DR3-F
Dec 15 19:46:37 zeno kernel: EIP: 0060:[<c107bf30>] EFLAGS: 00010282 CPU: 0
Dec 15 19:46:37 zeno kernel: EIP is at bdi_position_ratio.isra.16+0x220/0x230
Dec 15 19:46:37 zeno kernel: EAX: fffaadbc EBX: 00000524 ECX: fffaadbc EDX: 760dae6b
Dec 15 19:46:37 zeno kernel: ESI: 00000524 EDI: ea673c18 EBP: d6235d2c ESP: d6235d00
Dec 15 19:46:37 zeno kernel:  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Dec 15 19:46:37 zeno kernel: Process dd (pid: 2755, ti=d6234000 task=d607bb10 task.ti=d6234000)
Dec 15 19:46:37 zeno kernel: Stack:
Dec 15 19:46:37 zeno kernel:  d6235d1c 000280cd 33b036ce 00000098 0000047f 760db26b fffaadbc 0000007a
Dec 15 19:46:37 zeno kernel:  00000004 00000546 d5de809c d6235db0 c107c963 000004f1 00000523 00000546
Dec 15 19:46:37 zeno kernel:  00000000 00140669 00000007 00000000 d5de80bc 00032afc 00000000 d5e83800
Dec 15 19:46:37 zeno kernel: Call Trace:
Dec 15 19:46:37 zeno kernel:  [<c107c963>] balance_dirty_pages_ratelimited_nr+0x253/0x520
Dec 15 19:46:37 zeno kernel:  [<c10747cf>] generic_file_buffered_write+0x16f/0x210
Dec 15 19:46:37 zeno kernel:  [<c1075f7d>] __generic_file_aio_write+0x24d/0x4b0
Dec 15 19:46:37 zeno kernel:  [<c1076240>] generic_file_aio_write+0x60/0xc0
Dec 15 19:46:37 zeno kernel:  [<c10a2fa7>] do_sync_write+0xb7/0xf0
Dec 15 19:46:37 zeno kernel:  [<c1036455>] ? irq_exit+0x55/0x60
Dec 15 19:46:37 zeno kernel:  [<c10a2ef0>] ? wait_on_retry_sync_kiocb+0x50/0x50
Dec 15 19:46:37 zeno kernel:  [<c10a3aa7>] vfs_write+0x87/0x170
Dec 15 19:46:37 zeno kernel:  [<c10a2ef0>] ? wait_on_retry_sync_kiocb+0x50/0x50
Dec 15 19:46:37 zeno kernel:  [<c10a3da8>] sys_write+0x38/0x70
Dec 15 19:46:37 zeno kernel:  [<c160fd14>] sysenter_do_call+0x12/0x26
Dec 15 19:46:37 zeno kernel: Code: 55 ff ff ff 0f 0b 90 8d 74 26 00 0f a4 cb 03 c1 e1 03 e9 74 ff ff ff 8d 74 26 00 89 d0 31 d2 f7 75 10 89 c6 e9 59 ff ff ff 0f 0b <0f> 0b 0f 0b 0f 0b 31 c0 e9 5d ff ff ff 8d 76 00 55 89 e5 83 ec 
Dec 15 19:46:37 zeno kernel: EIP: [<c107bf30>] bdi_position_ratio.isra.16+0x220/0x230 SS:ESP 0068:d6235d00
Dec 15 19:46:37 zeno kernel: ---[ end trace c9c79e2ba8a36130 ]---

Relevant part of file  mm/page-writeback.c :

   525	static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
   526						unsigned long thresh,
   527						unsigned long bg_thresh,
   528						unsigned long dirty,
   529						unsigned long bdi_thresh,
   530						unsigned long bdi_dirty)
   531	{
   532		unsigned long write_bw = bdi->avg_write_bandwidth;
   533		unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
   534		unsigned long limit = hard_dirty_limit(thresh);
   535		unsigned long x_intercept;
   536		unsigned long setpoint;		/* dirty pages' target balance point */
   537		unsigned long bdi_setpoint;
   538		unsigned long span;
   539		long long pos_ratio;		/* signed fixed-point factor (1.0 == 1 << RATELIMIT_CALC_SHIFT) for scaling up/down the rate limit; this is the value returned */
   540		long x;	/* signed scratch value holding the fixed-point ratios computed below */
   541	
   542		if (unlikely(dirty >= limit))
   543			return 0;	/* at or beyond the hard limit: ratio is 0, matching f(limit) = 0 below */
   544	
   545		/*
   546		 * global setpoint
   547		 *
   548		 *                           setpoint - dirty 3
   549		 *        f(dirty) := 1.0 + (----------------)
   550		 *                           limit - setpoint
   551		 *
   552		 * it's a 3rd order polynomial that subjects to
   553		 *
   554		 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
   555		 * (2) f(setpoint) = 1.0 => the balance point
   556		 * (3) f(limit)    = 0   => the hard limit
   557		 * (4) df/dx      <= 0	 => negative feedback control
   558		 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
   559		 *     => fast response on large errors; small oscillation near setpoint
   560		 */
   561		setpoint = (freerun + limit) / 2;
   562		x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,	/* NOTE(review): setpoint and dirty are both unsigned long, so this difference wraps rather than going negative when dirty > setpoint; confirm what div_s64() receives where unsigned long is 32 bits */
   563			    limit - setpoint + 1);
   564	BUG_ON(x<0);	/* debug check added by the reporter; not part of the stock source */
   565		pos_ratio = x;
   566		pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;	/* x^2, rescaled to fixed point */
   567		pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;	/* x^3, rescaled to fixed point */
   568		pos_ratio += 1 << RATELIMIT_CALC_SHIFT;	/* the "1.0 +" term of f(dirty) */
   569	BUG_ON(pos_ratio<0);	/* reporter's debug check; the logged "kernel BUG at mm/page-writeback.c:569" is this line */
   570	
   571		/*
   572		 * We have computed basic pos_ratio above based on global situation. If
   573		 * the bdi is over/under its share of dirty pages, we want to scale
   574		 * pos_ratio further down/up. That is done by the following mechanism.
   575		 */
   576	
   577		/*
   578		 * bdi setpoint
   579		 *
   580		 *        f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
   581		 *
   582		 *                        x_intercept - bdi_dirty
   583		 *                     := --------------------------
   584		 *                        x_intercept - bdi_setpoint
   585		 *
   586		 * The main bdi control line is a linear function that subjects to
   587		 *
   588		 * (1) f(bdi_setpoint) = 1.0
   589		 * (2) k = - 1 / (8 * write_bw)  (in single bdi case)
   590		 *     or equally: x_intercept = bdi_setpoint + 8 * write_bw
   591		 *
   592		 * For single bdi case, the dirty pages are observed to fluctuate
   593		 * regularly within range
   594		 *        [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
   595		 * for various filesystems, where (2) can yield in a reasonable 12.5%
   596		 * fluctuation range for pos_ratio.
   597		 *
   598		 * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
   599		 * own size, so move the slope over accordingly and choose a slope that
   600		 * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
   601		 */
   602		if (unlikely(bdi_thresh > thresh))
   603			bdi_thresh = thresh;
   604		/*
   605		 * It's very possible that bdi_thresh is close to 0 not because the
   606		 * device is slow, but that it has remained inactive for long time.
   607		 * Honour such devices a reasonable good (hopefully IO efficient)
   608		 * threshold, so that the occasional writes won't be blocked and active
   609		 * writes can rampup the threshold quickly.
   610		 */
   611		bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
   612		/*
   613		 * scale global setpoint to bdi's:
   614		 *	bdi_setpoint = setpoint * bdi_thresh / thresh
   615		 */
   616		x = div_u64((u64)bdi_thresh << 16, thresh + 1);	/* bdi_thresh / thresh as a 16-bit fixed-point fraction; x is reused here */
   617	BUG_ON(x<0);	/* debug check added by the reporter */
   618		bdi_setpoint = setpoint * (u64)x >> 16;
   619		/*
   620		 * Use span=(8*write_bw) in single bdi case as indicated by
   621		 * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
   622		 *
   623		 *        bdi_thresh                    thresh - bdi_thresh
   624		 * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
   625		 *          thresh                            thresh
   626		 */
   627		span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
   628		x_intercept = bdi_setpoint + span;
   629	
   630		if (bdi_dirty < x_intercept - span / 4) {
   631			pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty),	/* scale by the linear bdi control line f(bdi_dirty) described above */
   632					    x_intercept - bdi_setpoint + 1);
   633		} else
   634			pos_ratio /= 4;	/* bdi_dirty is within span/4 of x_intercept or beyond it: use a flat 1/4 factor */
   635	BUG_ON(pos_ratio<0);	/* debug check added by the reporter */
   636	
   637		/*
   638		 * bdi reserve area, safeguard against dirty pool underrun and disk idle
   639		 * It may push the desired control point of global dirty pages higher
   640		 * than setpoint.
   641		 */
   642		x_intercept = bdi_thresh / 2;
   643		if (bdi_dirty < x_intercept) {
   644			if (bdi_dirty > x_intercept / 8)
   645				pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty);
   646			else
   647				pos_ratio *= 8;	/* boost capped at 8x once bdi_dirty <= x_intercept / 8 */
   648		}
   649	BUG_ON(pos_ratio<0);	/* debug check added by the reporter */
   650	
   651		return pos_ratio;	/* NOTE(review): long long implicitly converted to the unsigned long return type, with no range check */
   652	}

Cheers, Paul

Paul Szabo   psz@maths.usyd.edu.au   http://www.maths.usyd.edu.au/u/psz/
School of Mathematics and Statistics   University of Sydney    Australia


Reply to: