Bug#695182: linux-image-3.2.0-4-686-pae: Write couple of 1GB files for OOM crash
Dear Ben,
> Please read Documentation/SubmittingPatches, use scripts/checkpatch.pl
> and try to provide a patch that is suitable for upstream inclusion.
> Also, your name belongs in the patch header, not in the code.
I changed the proposed patch accordingly, scripts/checkpatch.pl produces
just a few warnings. I had my patch in use for a while now, so I believe
it is suitably tested.
Please let me know if I need to do anything else.
Cheers, Paul
Paul Szabo psz@maths.usyd.edu.au http://www.maths.usyd.edu.au/u/psz/
School of Mathematics and Statistics University of Sydney Australia
Avoid OOM when filesystem caches fill lowmem and are not reclaimed,
doing drop_caches at that point. The issue is easily reproducible on
machines with over 32GB RAM. The patch correctly protects against OOM.
The added call to drop_caches has been observed to trigger "needlessly"
but on quite rare occasions only.
Also included are several minor fixes:
- Comment about highmem_is_dirtyable, which seems to be used only to
calculate limits and thresholds, not used in any decisions.
- In determine_dirtyable_memory() subtract min_free_kbytes from
returned value. I believe this is "right", that min_free_kbytes is
not intended for dirty pages.
- In bdi_position_ratio() get difference (setpoint-dirty) right even
when it is negative, which happens often. Normally these numbers are
"small" and even with left-shift I never observed a 32-bit overflow.
I believe it should be possible to rewrite the whole function using
32-bit ints; maybe it is not worth the effort to make it "efficient";
seeing how this function was always wrong and we survived, maybe it
should simply be removed.
- Comment in bdi_max_pause() that it seems to always return a too-small
value, maybe it should simply return a fixed value.
- Comment in balance_dirty_pages() about a test marked unlikely() but
which I observe to be quite common.
- Comment in __alloc_pages_slowpath() about did_some_progress being
set twice, but only checked after the second setting, so the first
setting is lost and wasted.
- Comment in zone_reclaimable() that maybe should return true with
non-zero NR_SLAB_RECLAIMABLE.
- Comment about all_unreclaimable which may be set wrongly.
- Comments in global_reclaimable_pages() and zone_reclaimable_pages()
about maybe adding or including NR_SLAB_RECLAIMABLE.
Paul Szabo psz@maths.usyd.edu.au http://www.maths.usyd.edu.au/u/psz/
School of Mathematics and Statistics University of Sydney Australia
Reported-by: Paul Szabo <psz@maths.usyd.edu.au>
Reference: http://bugs.debian.org/695182
Signed-off-by: Paul Szabo <psz@maths.usyd.edu.au>
--- fs/drop_caches.c.old 2012-10-17 13:50:15.000000000 +1100
+++ fs/drop_caches.c 2013-01-04 21:52:47.000000000 +1100
@@ -65,3 +65,10 @@ int drop_caches_sysctl_handler(ctl_table
}
return 0;
}
+
+/* Easy call: do "echo 3 > /proc/sys/vm/drop_caches" */
+void easy_drop_caches(void)
+{
+ iterate_supers(drop_pagecache_sb, NULL);
+ drop_slab();
+}
--- mm/page-writeback.c.old 2012-10-17 13:50:15.000000000 +1100
+++ mm/page-writeback.c 2013-01-06 21:54:59.000000000 +1100
@@ -39,7 +39,8 @@
/*
* Sleep at most 200ms at a time in balance_dirty_pages().
*/
-#define MAX_PAUSE max(HZ/5, 1)
+/* Might as well be max(HZ/5,4) to ensure max_pause/4>0 always */
+#define MAX_PAUSE max(HZ/5, 4)
/*
* Estimate write bandwidth at 200ms intervals.
@@ -343,11 +344,26 @@ static unsigned long highmem_dirtyable_m
unsigned long determine_dirtyable_memory(void)
{
unsigned long x;
+ int y = 0;
+ extern int min_free_kbytes;
x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
+ /*
+ * Seems that highmem_is_dirtyable is only used here, in the
+ * calculation of limits and threshholds of dirtiness, not in deciding
+ * where to put dirty things. Is that so? Is that as should be?
+ * What is the recommended setting of highmem_is_dirtyable?
+ */
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
+ /* Subtract min_free_kbytes */
+ if (min_free_kbytes > 0)
+ y = min_free_kbytes >> (PAGE_SHIFT - 10);
+ if (x > y)
+ x -= y;
+ else
+ x = 0;
return x + 1; /* Ensure that we never return 0 */
}
@@ -541,6 +557,9 @@ static unsigned long bdi_position_ratio(
if (unlikely(dirty >= limit))
return 0;
+ /* Never seen this happen, just sanity-check paranoia */
+ if (unlikely(freerun >= limit))
+ return 16 << RATELIMIT_CALC_SHIFT;
/*
* global setpoint
@@ -559,7 +578,7 @@ static unsigned long bdi_position_ratio(
* => fast response on large errors; small oscillation near setpoint
*/
setpoint = (freerun + limit) / 2;
- x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
+ x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
limit - setpoint + 1);
pos_ratio = x;
pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
@@ -995,6 +1014,13 @@ static unsigned long bdi_max_pause(struc
* The pause time will be settled within range (max_pause/4, max_pause).
* Apply a minimal value of 4 to get a non-zero max_pause/4.
*/
+ /*
+ * On large machine it seems we always return 4,
+ * on smaller desktop machine mostly return 5 (rarely 9 or 14).
+ * Are those too small? Should we return something fixed e.g.
+ return (HZ/10);
+ * instead of this wasted/useless calculation?
+ */
return clamp_val(t, 4, MAX_PAUSE);
}
@@ -1109,6 +1135,11 @@ static void balance_dirty_pages(struct a
}
pause = HZ * pages_dirtied / task_ratelimit;
if (unlikely(pause <= 0)) {
+ /*
+ * Not unlikely: often we get zero.
+ * Seems we always get 0 on large machine.
+ * Should not do a pause of 1 here?
+ */
trace_balance_dirty_pages(bdi,
dirty_thresh,
background_thresh,
--- mm/page_alloc.c.old 2012-10-17 13:50:15.000000000 +1100
+++ mm/page_alloc.c 2013-01-05 17:21:41.000000000 +1100
@@ -2207,6 +2207,10 @@ rebalance:
* If we failed to make any progress reclaiming, then we are
* running out of options and have to consider going OOM
*/
+ /*
+ * We had did_some_progress set twice, but is only checked here
+ * so the first setting was lost. Is that as should be?
+ */
if (!did_some_progress) {
if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
if (oom_killer_disabled)
--- mm/vmscan.c.old 2012-10-17 13:50:15.000000000 +1100
+++ mm/vmscan.c 2013-01-06 09:50:49.000000000 +1100
@@ -213,6 +213,8 @@ static inline int do_shrinker_shrink(str
/*
* Call the shrink functions to age shrinkable caches
*
+ * These comments seem to be about filesystem caches, though slabs may be
+ * used elsewhere also.
* Here we assume it costs one seek to replace a lru page and that it also
* takes a seek to recreate a cache object. With this in mind we age equal
* percentages of the lru and ageable caches. This should balance the seeks
@@ -2244,6 +2246,9 @@ static bool shrink_zones(int priority, s
static bool zone_reclaimable(struct zone *zone)
{
+ /* Should we return true with NR_SLAB_RECLAIMABLE ? */
+ /*if (zone_page_state(zone,NR_SLAB_RECLAIMABLE)>0) return true; */
+ /* Wonder about the "correctness" of that *6 factor. */
return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
}
@@ -2726,9 +2731,87 @@ loop_again:
nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
+ if (unlikely(
+ i == 1 &&
+ nr_slab < 10 &&
+ (reclaim_state->reclaimed_slab) < 10 &&
+ zone_page_state(zone, NR_SLAB_RECLAIMABLE) > 10 &&
+ !zone_watermark_ok_safe(zone, order,
+ high_wmark_pages(zone), end_zone, 0))) {
+ /*
+ * We are stressed (desperate), better
+ * to drop file caches than to suffer
+ * an OOM episode (where you need to
+ * press the reset button): see
+ * http://bugs.debian.org/695182
+ *
+ * Do something like
+ * echo 3 > /proc/sys/vm/drop_caches
+ * See in fs/drop_caches.c :
+ void easy_drop_caches(void)
+ {
+ iterate_supers(drop_pagecache_sb, NULL);
+ drop_slab();
+ }
+ *
+ * All slabs (other than filesystem) may
+ * already be cleared, or may be cleared
+ * now: is that fair or efficient?
+ *
+ * I noticed this issue on machines with
+ * over 32GB RAM, but do not see
+ * anything specific to large memory in
+ * the code. I wonder if the issue might
+ * affect machines with smaller memory
+ * or with 64-bit build.
+ *
+ * Need to drop_pagecache, drop_slab
+ * alone is ineffective. This is
+ * probably why our shrink_slab above
+ * did not succeed.
+ *
+ * Should we drop_slab(), or do loops of
+ * shrink_slab and keep counts as above?
+ * Seems that reclaimed_slab is kept
+ * automatically.
+ *
+ * I wonder why are these slabs or
+ * caches in lowmem, should not they
+ * have been allocated in highmem
+ * initially, instead?
+ */
+ extern void easy_drop_caches(void);
+ reclaim_state->reclaimed_slab = 0;
+ printk(KERN_WARNING "drop_caches with zone=%d nr_slab=%d reclaimed_slab=%ld RECLAIMABLE=%ld FREE=%ld\n",
+ i, nr_slab, reclaim_state->reclaimed_slab,
+ zone_page_state(zone, NR_SLAB_RECLAIMABLE),
+ zone_page_state(zone, NR_FREE_PAGES));
+ easy_drop_caches();
+ printk(KERN_WARNING "after drop_caches reclaimed_slab=%ld RECLAIMABLE=%ld FREE=%ld\n",
+ reclaim_state->reclaimed_slab,
+ zone_page_state(zone, NR_SLAB_RECLAIMABLE),
+ zone_page_state(zone, NR_FREE_PAGES));
+ if (reclaim_state->reclaimed_slab > 0) {
+ sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+ if (nr_slab == 0)
+ nr_slab = 1;
+ }
+ }
if (nr_slab == 0 && !zone_reclaimable(zone))
zone->all_unreclaimable = 1;
+ /*
+ * Beware of all_unreclaimable. We set it when
+ * - shrink_slab() returns 0, which may happen
+ * because of temporary failure or because of
+ * some internal restrictions, and
+ * - zone_reclaimable() returns false, which
+ * may happen though NR_SLAB_RECLAIMABLE is
+ * non-zero
+ * so it may be set "wrong" or prematurely.
+ * And then we do not unset all_unreclaimable
+ * until some page is freed (in page_alloc.c).
+ */
}
/*
@@ -3055,6 +3138,7 @@ unsigned long global_reclaimable_pages(v
{
int nr;
+ /* Should we add/include global_page_state(NR_SLAB_RECLAIMABLE) ? */
nr = global_page_state(NR_ACTIVE_FILE) +
global_page_state(NR_INACTIVE_FILE);
@@ -3069,6 +3153,7 @@ unsigned long zone_reclaimable_pages(str
{
int nr;
+ /* Should we add/include zone_page_state(zone,NR_SLAB_RECLAIMABLE) ? */
nr = zone_page_state(zone, NR_ACTIVE_FILE) +
zone_page_state(zone, NR_INACTIVE_FILE);
Reply to: