
Bug#695182: linux-image-3.2.0-4-686-pae: Write couple of 1GB files for OOM crash



tags 695182 - moreinfo
thanks

Dear Ben,

I suggest the following patch, which seems to solve the problem.
There are two attachments: minimal.patch, just to show the simplicity
of the change, and complete.patch, with comments and enhancements.

Cheers, Paul

Paul Szabo   psz@maths.usyd.edu.au   http://www.maths.usyd.edu.au/u/psz/
School of Mathematics and Statistics   University of Sydney    Australia
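
As a stopgap on an affected machine, the same cache-dropping idea can be
approximated from userspace by periodically writing 3 to
/proc/sys/vm/drop_caches (the interface the comments in complete.patch
refer to). A minimal sketch, needing root, with an arbitrary 60-second
interval; a cron job doing the same would serve equally well:

/*
 * Sketch only: userspace approximation of "echo 3 > /proc/sys/vm/drop_caches",
 * run in a loop. The interval is arbitrary.
 */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	for (;;) {
		FILE *f = fopen("/proc/sys/vm/drop_caches", "w");

		if (f) {
			fputs("3\n", f);	/* 1 = pagecache, 2 = slab, 3 = both */
			fclose(f);
		} else {
			perror("/proc/sys/vm/drop_caches");
		}
		sleep(60);
	}
	return 0;
}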
--- fs/drop_caches.c.old	2012-10-17 13:50:15.000000000 +1100
+++ fs/drop_caches.c	2013-01-01 09:23:57.000000000 +1100
@@ -58,10 +58,16 @@
 	if (ret)
 		return ret;
 	if (write) {
 		if (sysctl_drop_caches & 1)
 			iterate_supers(drop_pagecache_sb, NULL);
 		if (sysctl_drop_caches & 2)
 			drop_slab();
 	}
 	return 0;
 }
+
+void PSz_drop_caches(void)
+{
+	iterate_supers(drop_pagecache_sb, NULL);
+	drop_slab();
+}
--- mm/vmscan.c.old	2012-10-17 13:50:15.000000000 +1100
+++ mm/vmscan.c	2013-01-01 22:58:51.000000000 +1100
@@ -2719,20 +2719,25 @@
 				KSWAPD_ZONE_BALANCE_GAP_RATIO);
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone) + balance_gap,
 					end_zone, 0)) {
 				shrink_zone(priority, zone, &sc);
 
 				reclaim_state->reclaimed_slab = 0;
 				nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
 				sc.nr_reclaimed += reclaim_state->reclaimed_slab;
 				total_scanned += sc.nr_scanned;
+if (i==1 && nr_slab<10 && (reclaim_state->reclaimed_slab)<10 && zone_page_state(zone,NR_SLAB_RECLAIMABLE)>10)
+{
+extern void PSz_drop_caches(void);
+  PSz_drop_caches();
+}
 
 				if (nr_slab == 0 && !zone_reclaimable(zone))
 					zone->all_unreclaimable = 1;
 			}
 
 			/*
 			 * If we've done a decent amount of scanning and
 			 * the reclaim ratio is low, start doing writepage
 			 * even in laptop mode
 			 */
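
The guard added above is the whole heuristic of minimal.patch. Restated as a
sketch with the tests spelled out (reading i == 1 as ZONE_NORMAL assumes the
usual 32-bit PAE zone layout of DMA / Normal / HighMem; the helper name and
the thresholds of 10 are just the ad-hoc values from the patch):

/* Sketch only: the trigger condition from the hunk above, with comments. */
#include <linux/mmzone.h>	/* struct zone, NR_SLAB_RECLAIMABLE */
#include <linux/vmstat.h>	/* zone_page_state() */
#include <linux/swap.h>		/* struct reclaim_state */

static inline bool lowmem_slab_reclaim_stuck(int i, unsigned long nr_slab,
					     struct reclaim_state *rs,
					     struct zone *zone)
{
	return i == 1 &&			/* the Normal (lowmem) zone         */
	       nr_slab < 10 &&			/* shrink_slab() reported little... */
	       rs->reclaimed_slab < 10 &&	/* ...and freed almost no pages...  */
	       zone_page_state(zone, NR_SLAB_RECLAIMABLE) > 10;
						/* ...yet reclaimable slab remains  */
}

The commented version of the same hook, plus the other tweaks, is in
complete.patch below.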
--- fs/drop_caches.c.old	2012-10-17 13:50:15.000000000 +1100
+++ fs/drop_caches.c	2013-01-01 09:23:57.000000000 +1100
@@ -58,10 +58,16 @@
 	if (ret)
 		return ret;
 	if (write) {
 		if (sysctl_drop_caches & 1)
 			iterate_supers(drop_pagecache_sb, NULL);
 		if (sysctl_drop_caches & 2)
 			drop_slab();
 	}
 	return 0;
 }
+
+void PSz_drop_caches(void)
+{
+	iterate_supers(drop_pagecache_sb, NULL);
+	drop_slab();
+}
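
One small note on the helper added above: the call site in mm/vmscan.c, later
in this patch, re-declares it with an extern inside the function body. If the
hook survives review, a declaration in a shared header would be the more
conventional route; a sketch only, with the choice of header being an
assumption:

/* Sketch only: e.g. in include/linux/mm.h (header choice is arbitrary). */
extern void PSz_drop_caches(void);	/* defined in fs/drop_caches.c */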
--- mm/page-writeback.c.old	2012-10-17 13:50:15.000000000 +1100
+++ mm/page-writeback.c	2013-01-01 23:01:52.000000000 +1100
@@ -32,21 +32,22 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
 #include <trace/events/writeback.h>
 
 /*
  * Sleep at most 200ms at a time in balance_dirty_pages().
  */
-#define MAX_PAUSE		max(HZ/5, 1)
+/* PSz: Might as well be max(HZ/5,4) to ensure max_pause/4>0 always */
+#define MAX_PAUSE		max(HZ/5, 4)
 
 /*
  * Estimate write bandwidth at 200ms intervals.
  */
 #define BANDWIDTH_INTERVAL	max(HZ/5, 1)
 
 #define RATELIMIT_CALC_SHIFT	10
 
 /*
  * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
@@ -339,22 +340,40 @@
  *
  * Returns the numebr of pages that can currently be freed and used
  * by the kernel for direct mappings.
  */
 unsigned long determine_dirtyable_memory(void)
 {
 	unsigned long x;
 
 	x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
 
+/*
+ * PSz: Seems that highmem_is_dirtyable is only used here, in the
+ * calculation of limits and thresholds of dirtiness, not in deciding
+ * where to put dirty things. Is that so? Is that as it should be?
+ * What is the recommended setting of highmem_is_dirtyable?
+ */
 	if (!vm_highmem_is_dirtyable)
 		x -= highmem_dirtyable_memory(x);
+/* PSz: Should we not subtract min_free_kbytes? */
+{
+extern int min_free_kbytes;
+int y = 0;
+/* printk("PSz: determine_dirtyable_memory was %ld pages, now subtract min_free_kbytes=%d\n",x,min_free_kbytes); */
+if (min_free_kbytes > 0)
+  y = min_free_kbytes >> (PAGE_SHIFT - 10);
+if (x > y)
+  x -= y;
+else
+  x = 0;
+}
 
 	return x + 1;	/* Ensure that we never return 0 */
 }
 
 static unsigned long dirty_freerun_ceiling(unsigned long thresh,
 					   unsigned long bg_thresh)
 {
 	return (thresh + bg_thresh) / 2;
 }
 
@@ -534,39 +553,43 @@
 	unsigned long limit = hard_dirty_limit(thresh);
 	unsigned long x_intercept;
 	unsigned long setpoint;		/* dirty pages' target balance point */
 	unsigned long bdi_setpoint;
 	unsigned long span;
 	long long pos_ratio;		/* for scaling up/down the rate limit */
 	long x;
 
 	if (unlikely(dirty >= limit))
 		return 0;
+	if (unlikely(freerun >= limit))
+/* PSz: Never seen this happen, just sanity-check paranoia */
+		return (16 << RATELIMIT_CALC_SHIFT);
 
 	/*
 	 * global setpoint
 	 *
 	 *                           setpoint - dirty 3
 	 *        f(dirty) := 1.0 + (----------------)
 	 *                           limit - setpoint
 	 *
 	 * it's a 3rd order polynomial that subjects to
 	 *
 	 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
 	 * (2) f(setpoint) = 1.0 => the balance point
 	 * (3) f(limit)    = 0   => the hard limit
 	 * (4) df/dx      <= 0	 => negative feedback control
 	 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
 	 *     => fast response on large errors; small oscillation near setpoint
 	 */
 	setpoint = (freerun + limit) / 2;
-	x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
+/* PSz: Get that difference right */
+	x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
 		    limit - setpoint + 1);
 	pos_ratio = x;
 	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
 	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
 	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
 
 	/*
 	 * We have computed basic pos_ratio above based on global situation. If
 	 * the bdi is over/under its share of dirty pages, we want to scale
 	 * pos_ratio further down/up. That is done by the following mechanism.
@@ -988,20 +1011,27 @@
 	 * idle.
 	 *
 	 * 8 serves as the safety ratio.
 	 */
 	t = min(t, bdi_dirty * HZ / (8 * bw + 1));
 
 	/*
 	 * The pause time will be settled within range (max_pause/4, max_pause).
 	 * Apply a minimal value of 4 to get a non-zero max_pause/4.
 	 */
+/*
+ * PSz: On a large machine it seems we always return 4,
+ * on a smaller desktop machine we mostly return 5 (rarely 9 or 14).
+ * Are those too small? Should we return something fixed e.g.
+return (HZ/10);
+ * instead of this wasted/useless calculation?
+ */
 	return clamp_val(t, 4, MAX_PAUSE);
 }
 
 /*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data.  It looks at the number of dirty pages in the machine and will force
  * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
  * If we're over `background_thresh' then the writeback threads are woken to
  * perform some writeout.
  */
@@ -1017,22 +1047,24 @@
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
 	long pause = 0;
 	long uninitialized_var(max_pause);
 	bool dirty_exceeded = false;
 	unsigned long task_ratelimit;
 	unsigned long uninitialized_var(dirty_ratelimit);
 	unsigned long pos_ratio;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	unsigned long start_time = jiffies;
+int PSzloop = 0;
 
 	for (;;) {
+PSzloop++;
 		/*
 		 * Unstable writes are a feature of certain networked
 		 * filesystems (i.e. NFS) in which data may have been
 		 * written to the server's write cache, but has not yet
 		 * been flushed to permanent storage.
 		 */
 		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
 					global_page_state(NR_UNSTABLE_NFS);
 		nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
 
@@ -1102,20 +1134,25 @@
 					       background_thresh, nr_dirty,
 					       bdi_thresh, bdi_dirty);
 		task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
 							RATELIMIT_CALC_SHIFT;
 		if (unlikely(task_ratelimit == 0)) {
 			pause = max_pause;
 			goto pause;
 		}
 		pause = HZ * pages_dirtied / task_ratelimit;
 		if (unlikely(pause <= 0)) {
+/*
+ * PSz: Not unlikely: often we get zero.
+ * Seems we always get 0 on a large machine.
+ * Should we not do a pause of 1 here?
+ */
 			trace_balance_dirty_pages(bdi,
 						  dirty_thresh,
 						  background_thresh,
 						  nr_dirty,
 						  bdi_thresh,
 						  bdi_dirty,
 						  dirty_ratelimit,
 						  task_ratelimit,
 						  pages_dirtied,
 						  pause,
@@ -1156,20 +1193,21 @@
 		 * pipe going: the flusher cleans 1 page => the task dirties 1
 		 * more page. However bdi_dirty has accounting errors.  So use
 		 * the larger and more IO friendly bdi_stat_error.
 		 */
 		if (bdi_dirty <= bdi_stat_error(bdi))
 			break;
 
 		if (fatal_signal_pending(current))
 			break;
 	}
+if (PSzloop>2) printk("PSz: Beware infinite: loop=%d in balance_dirty_pages (pid=%d comm=%s)\n",PSzloop,current->pid,current->comm);
 
 	if (!dirty_exceeded && bdi->dirty_exceeded)
 		bdi->dirty_exceeded = 0;
 
 	current->nr_dirtied = 0;
 	if (pause == 0) { /* in freerun area */
 		current->nr_dirtied_pause =
 				dirty_poll_interval(nr_dirty, dirty_thresh);
 	} else if (pause <= max_pause / 4 &&
 		   pages_dirtied >= current->nr_dirtied_pause) {
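
On the (s64) casts in bdi_position_ratio() above: setpoint and dirty are
unsigned longs, so once dirty overshoots setpoint the plain subtraction wraps
to a huge positive value instead of going negative, and on a 32-bit build the
cubic loses its negative-feedback branch (a 64-bit build happens to get away
with it, because the wrapped 64-bit value reinterprets as the intended
negative number). A standalone demonstration of the arithmetic, with uint32_t
standing in for the i686 kernel's 32-bit unsigned long and purely illustrative
values:

/*
 * Sketch only: why x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT, ...)
 * misbehaves on 32-bit when dirty > setpoint, and why the (s64) casts fix it.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define RATELIMIT_CALC_SHIFT 10

int main(void)
{
	uint32_t setpoint = 100000, dirty = 110000, limit = 120000;

	/* Unpatched: unsigned subtraction wraps, then zero-extends to 64 bits. */
	int64_t x_old = (int64_t)((setpoint - dirty) << RATELIMIT_CALC_SHIFT)
			/ (limit - setpoint + 1);

	/* Patched: subtract as signed 64-bit first. */
	int64_t x_new = (((int64_t)setpoint - (int64_t)dirty) << RATELIMIT_CALC_SHIFT)
			/ (limit - setpoint + 1);

	printf("unpatched x = %" PRId64 "  (large positive: feedback inverted)\n", x_old);
	printf("patched   x = %" PRId64 "  (negative, as intended)\n", x_new);
	return 0;
}

(For scale on the other page-writeback.c change: min_free_kbytes >> (PAGE_SHIFT - 10)
is just the kilobytes-to-pages conversion, a shift by 2 on i686 where PAGE_SHIFT
is 12, so e.g. min_free_kbytes = 65536 is subtracted as 16384 pages.)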
--- mm/page_alloc.c.old	2012-10-17 13:50:15.000000000 +1100
+++ mm/page_alloc.c	2013-01-01 23:02:23.000000000 +1100
@@ -1865,20 +1865,21 @@
 		/*
 		 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
 		 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
 		 * The caller should handle page allocation failure by itself if
 		 * it specifies __GFP_THISNODE.
 		 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
 		 */
 		if (gfp_mask & __GFP_THISNODE)
 			goto out;
 	}
+printk("PSz: About to OOM with order=%x PAGE_ALLOC_COSTLY_ORDER=%x high_zoneidx=%x ZONE_NORMAL=%x gfp_mask=%x __GFP_THISNODE=%x __GFP_NORETRY=%x\n",order,PAGE_ALLOC_COSTLY_ORDER,high_zoneidx,ZONE_NORMAL,gfp_mask,__GFP_THISNODE,__GFP_NORETRY);
 	/* Exhausted what can be done so it's blamo time */
 	out_of_memory(zonelist, gfp_mask, order, nodemask);
 
 out:
 	clear_zonelist_oom(zonelist, gfp_mask);
 	return page;
 }
 
 #ifdef CONFIG_COMPACTION
 /* Try memory compaction for high-order allocations before reclaim */
@@ -2091,20 +2092,21 @@
 	nodemask_t *nodemask, struct zone *preferred_zone,
 	int migratetype)
 {
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
 	struct page *page = NULL;
 	int alloc_flags;
 	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
 	bool sync_migration = false;
 	bool deferred_compaction = false;
+int PSzloop = 0;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
 	 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
 	 * be using allocators in order of preference for an area that is
 	 * too large.
 	 */
 	if (order >= MAX_ORDER) {
 		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
 		return NULL;
@@ -2200,20 +2202,24 @@
 					nodemask,
 					alloc_flags, preferred_zone,
 					migratetype, &did_some_progress);
 	if (page)
 		goto got_pg;
 
 	/*
 	 * If we failed to make any progress reclaiming, then we are
 	 * running out of options and have to consider going OOM
 	 */
+/*
+ * PSz: We had did_some_progress set twice, but it is only checked here,
+ * so the first setting was lost. Is that as it should be?
+ */
 	if (!did_some_progress) {
 		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
 			if (oom_killer_disabled)
 				goto nopage;
 			page = __alloc_pages_may_oom(gfp_mask, order,
 					zonelist, high_zoneidx,
 					nodemask, preferred_zone,
 					migratetype);
 			if (page)
 				goto got_pg;
@@ -2229,29 +2235,33 @@
 					goto nopage;
 				/*
 				 * The oom killer is not called for lowmem
 				 * allocations to prevent needlessly killing
 				 * innocent tasks.
 				 */
 				if (high_zoneidx < ZONE_NORMAL)
 					goto nopage;
 			}
 
+PSzloop++;
+printk("PSz: Beware infinite: loop restart=%d in alloc_pages_slowpath for pid=%d comm=%s\n",PSzloop,current->pid,current->comm);
 			goto restart;
 		}
 	}
 
 	/* Check if we should retry the allocation */
 	pages_reclaimed += did_some_progress;
 	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
 		/* Wait for some write requests to complete then retry */
 		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
+PSzloop++;
+printk("PSz: Beware infinite: loop rebalance=%d in alloc_pages_slowpath for pid=%d comm=%s did_some_progress=%ld pages_reclaimed=%ld\n",PSzloop,current->pid,current->comm,did_some_progress,pages_reclaimed);
 		goto rebalance;
 	} else {
 		/*
 		 * High-order allocations do not necessarily loop after
 		 * direct reclaim and reclaim/compaction depends on compaction
 		 * being called after reclaim so call directly if necessary
 		 */
 		page = __alloc_pages_direct_compact(gfp_mask, order,
 					zonelist, high_zoneidx,
 					nodemask,
@@ -2278,20 +2288,21 @@
  */
 struct page *
 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 			struct zonelist *zonelist, nodemask_t *nodemask)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	struct zone *preferred_zone;
 	struct page *page = NULL;
 	int migratetype = allocflags_to_migratetype(gfp_mask);
 	unsigned int cpuset_mems_cookie;
+int PSzloop = 0;
 
 	gfp_mask &= gfp_allowed_mask;
 
 	lockdep_trace_alloc(gfp_mask);
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
 	if (should_fail_alloc_page(gfp_mask, order))
 		return NULL;
 
@@ -2325,21 +2336,25 @@
 	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
 
 out:
 	/*
 	 * When updating a task's mems_allowed, it is possible to race with
 	 * parallel threads in such a way that an allocation can fail while
 	 * the mask is being updated. If a page allocation is about to fail,
 	 * check if the cpuset changed during allocation and if so, retry.
 	 */
 	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+{
+PSzloop++;
 		goto retry_cpuset;
+}
+if (PSzloop>2) printk("PSz: Beware infinite: loop=%d in alloc_pages_nodemask (pid=%d comm=%s)\n",PSzloop,current->pid,current->comm);
 
 	return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);
 
 /*
  * Common helper functions.
  */
 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
 {
@@ -4767,20 +4782,21 @@
  */
 static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
 {
 	int i, nid;
 	unsigned long usable_startpfn;
 	unsigned long kernelcore_node, kernelcore_remaining;
 	/* save the state before borrow the nodemask */
 	nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
 	unsigned long totalpages = early_calculate_totalpages();
 	int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
+int PSzloop = 0;
 
 	/*
 	 * If movablecore was specified, calculate what size of
 	 * kernelcore that corresponds so that memory usable for
 	 * any allocation type is evenly spread. If both kernelcore
 	 * and movablecore are specified, then the value of kernelcore
 	 * will be used for required_kernelcore if it's greater than
 	 * what movablecore would have allowed.
 	 */
 	if (required_movablecore) {
@@ -4885,21 +4901,25 @@
 	}
 
 	/*
 	 * If there is still required_kernelcore, we do another pass with one
 	 * less node in the count. This will push zone_movable_pfn[nid] further
 	 * along on the nodes that still have memory until kernelcore is
 	 * satisified
 	 */
 	usable_nodes--;
 	if (usable_nodes && required_kernelcore > usable_nodes)
+{
+PSzloop++;
 		goto restart;
+}
+if (PSzloop>2) printk("PSz: Beware infinite: loop=%d in find_zone_movable_pfns_for_nodes (pid=%d comm=%s)\n",PSzloop,current->pid,current->comm);
 
 	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
 	for (nid = 0; nid < MAX_NUMNODES; nid++)
 		zone_movable_pfn[nid] =
 			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
 
 out:
 	/* restore the node_state */
 	node_states[N_HIGH_MEMORY] = saved_node_state;
 }
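
The page_alloc.c instrumentation above prints on every restart/rebalance pass;
if one of those loops really does spin, the messages themselves become a
problem. 3.2 already has printk_ratelimited(), so a gentler variant of the
same counter idea is possible (a sketch only, not part of the proposed patch;
the helper name and placement are assumptions):

/*
 * Sketch only: same "Beware infinite" counters as above, but rate-limited so
 * a loop that really does spin cannot flood the log.
 */
#include <linux/printk.h>
#include <linux/sched.h>	/* current */

static void PSz_warn_loop(const char *where, int count)
{
	if (count > 2)
		printk_ratelimited(KERN_WARNING
			"PSz: Beware infinite: loop=%d in %s (pid=%d comm=%s)\n",
			count, where, current->pid, current->comm);
}

The existing counters would then simply call, for example,
PSz_warn_loop("alloc_pages_slowpath", PSzloop).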
--- mm/vmscan.c.old	2012-10-17 13:50:15.000000000 +1100
+++ mm/vmscan.c	2013-01-01 23:03:31.000000000 +1100
@@ -206,20 +206,21 @@
 				     unsigned long nr_to_scan)
 {
 	sc->nr_to_scan = nr_to_scan;
 	return (*shrinker->shrink)(shrinker, sc);
 }
 
 #define SHRINK_BATCH 128
 /*
  * Call the shrink functions to age shrinkable caches
  *
+PSz: These comments seem to be about filesystem caches, though slabs may be used elsewhere also.
  * Here we assume it costs one seek to replace a lru page and that it also
  * takes a seek to recreate a cache object.  With this in mind we age equal
  * percentages of the lru and ageable caches.  This should balance the seeks
  * generated by these structures.
  *
  * If the vm encountered mapped pages on the LRU it increase the pressure on
  * slab to avoid swapping.
  *
  * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
  *
@@ -2237,20 +2238,23 @@
 		}
 
 		shrink_zone(priority, zone, sc);
 	}
 
 	return aborted_reclaim;
 }
 
 static bool zone_reclaimable(struct zone *zone)
 {
+/* PSz: Should we return true with non-zero zone_page_state(zone,NR_SLAB_RECLAIMABLE) ? */
+/*if (zone_page_state(zone,NR_SLAB_RECLAIMABLE)>0) return true; */
+/* PSz: Wonder about the "correctness" of that *6 factor. */
 	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
 }
 
 /* All zones in zonelist are unreclaimable? */
 static bool all_unreclaimable(struct zonelist *zonelist,
 		struct scan_control *sc)
 {
 	struct zoneref *z;
 	struct zone *zone;
 
@@ -2719,23 +2723,74 @@
 				KSWAPD_ZONE_BALANCE_GAP_RATIO);
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone) + balance_gap,
 					end_zone, 0)) {
 				shrink_zone(priority, zone, &sc);
 
 				reclaim_state->reclaimed_slab = 0;
 				nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
 				sc.nr_reclaimed += reclaim_state->reclaimed_slab;
 				total_scanned += sc.nr_scanned;
+/*
+ * PSz: Do something like "echo 3 > /proc/sys/vm/drop_caches".
+ * See in fs/drop_caches.c :
+void PSz_drop_caches(void)
+{
+	iterate_supers(drop_pagecache_sb, NULL);
+	drop_slab();
+}
+ *
+ * We are stressed (desperate), better to drop file caches than to
+ * suffer an OOM episode (where you need to press the reset button).
+ *
+ * All slabs (other than filesystem) may already be cleared, or may
+ * be cleared now: is that fair or efficient?
+ *
+ * I noticed this issue on machines with over 32GB RAM, and do not
+ * see anything specific to large-memory in the code. I wonder if
+ * it would be possible to provoke the problem on machines with
+ * smaller memory or with a 64-bit build.
+ *
+ * We need to drop_pagecache also; drop_slab alone is ineffective,
+ * which is probably why our shrink_slab above did not succeed.
+ *
+ * Should we drop_slab(), or do loops of shrink_slab and keep
+ * counts as above? Seems that reclaimed_slab is kept automatically.
+ *
+ * I wonder why these slabs or caches are in lowmem; should they not
+ * have been allocated in highmem initially, instead?
+ */
+if (i==1 && nr_slab<10 && (reclaim_state->reclaimed_slab)<10 && zone_page_state(zone,NR_SLAB_RECLAIMABLE)>10)
+{
+extern void PSz_drop_caches(void);
+  reclaim_state->reclaimed_slab = 0;
+  printk("PSz: drop_caches with zone=%d nr_slab=%d reclaimed_slab=%ld RECLAIMABLE=%ld FREE=%ld\n",i,nr_slab,reclaim_state->reclaimed_slab,zone_page_state(zone,NR_SLAB_RECLAIMABLE),zone_page_state(zone,NR_FREE_PAGES));
+  PSz_drop_caches();
+  printk("PSz: after drop_caches reclaimed_slab=%ld RECLAIMABLE=%ld FREE=%ld\n",reclaim_state->reclaimed_slab,zone_page_state(zone,NR_SLAB_RECLAIMABLE),zone_page_state(zone,NR_FREE_PAGES));
+  if (reclaim_state->reclaimed_slab > 0)
+  {
+    sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+    if (nr_slab == 0) nr_slab = 1;
+  }
+}
 
 				if (nr_slab == 0 && !zone_reclaimable(zone))
 					zone->all_unreclaimable = 1;
+/*
+ * PSz: Beware of all_unreclaimable. We set it when
+ *  - shrink_slab() returns 0, which may happen because of temporary failure
+ *    or because of some internal restrictions, and
+ *  - zone_reclaimable() returns false, which may happen though
+ *    zone_page_state(zone,NR_SLAB_RECLAIMABLE) is non-zero
+ * so it may be set "wrong" or prematurely. And then we do not unset
+ * all_unreclaimable until some page is freed (in page_alloc.c).
+ */
 			}
 
 			/*
 			 * If we've done a decent amount of scanning and
 			 * the reclaim ratio is low, start doing writepage
 			 * even in laptop mode
 			 */
 			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
 			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
 				sc.may_writepage = 1;
@@ -2815,20 +2870,21 @@
 		 * infinite loop.
 		 *
 		 * Instead, recheck all watermarks at order-0 as they
 		 * are the most important. If watermarks are ok, kswapd will go
 		 * back to sleep. High-order users can still perform direct
 		 * reclaim if they wish.
 		 */
 		if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
 			order = sc.order = 0;
 
+printk("PSz: loop_again in balance_pgdat (pid=%d comm=%s)\n",current->pid,current->comm);
 		goto loop_again;
 	}
 
 	/*
 	 * If kswapd was reclaiming at a higher order, it has the option of
 	 * sleeping without all zones being balanced. Before it does, it must
 	 * ensure that the watermarks for order-0 on *all* zones are met and
 	 * that the congestion flags are cleared. The congestion flag must
 	 * be cleared as kswapd is the only mechanism that clears the flag
 	 * and it is potentially going to sleep here.
@@ -3048,34 +3104,36 @@
  * The reclaimable count would be mostly accurate.
  * The less reclaimable pages may be
  * - mlocked pages, which will be moved to unevictable list when encountered
  * - mapped pages, which may require several travels to be reclaimed
  * - dirty pages, which is not "instantly" reclaimable
  */
 unsigned long global_reclaimable_pages(void)
 {
 	int nr;
 
+/* PSz: Should we add or include global_page_state(NR_SLAB_RECLAIMABLE) ? */
 	nr = global_page_state(NR_ACTIVE_FILE) +
 	     global_page_state(NR_INACTIVE_FILE);
 
 	if (nr_swap_pages > 0)
 		nr += global_page_state(NR_ACTIVE_ANON) +
 		      global_page_state(NR_INACTIVE_ANON);
 
 	return nr;
 }
 
 unsigned long zone_reclaimable_pages(struct zone *zone)
 {
 	int nr;
 
+/* PSz: Should we add or include zone_page_state(zone,NR_SLAB_RECLAIMABLE) ? */
 	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
 	     zone_page_state(zone, NR_INACTIVE_FILE);
 
 	if (nr_swap_pages > 0)
 		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
 		      zone_page_state(zone, NR_INACTIVE_ANON);
 
 	return nr;
 }
 
