[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: Bug#912411: Cgroup memory subsystem memory leak



reassign 912411 src:linux 4.9.110-3+deb9u2~deb8u1
thanks

On Wed, Oct 31, 2018 at 05:21:39PM +0800, 段熊春 wrote:
> Package: linux-image-4.9.0-0.bpo.7-amd64
> Version: 4.9.110-3+deb9u2~deb8u1
> 
> Package: systemd
> Version: 230-7~bpo8+2
> 
> hi guys:
> We suspect that we have found a memory leak in the cgroup memory subsystem; in one particular case it leaks about 1 GByte per hour.
> The bug can be reproduced 100% of the time on mainline kernel 4.19. (We also tried Debian's latest 4.14 and 4.9 kernels, with the same result.)
> 
> This is what we have observed (Debian 9 Stretch, with mainline kernel 4.19, kconfig attached) and how to reproduce it:
> On a system with cgroups enabled, run a demo service that simulates an "ill" program: it is broken and exits immediately after startup:
> 
> service code
> #include "stdio.h"
> #include "stdlib.h"
> int main()
> {
>  void * p = malloc(10240);
>  return 1;
> }
> Compile the above code and put the binary as /usr/bin/test 
> systemd service
> [Service]
> ExecStart=/usr/bin/test
> Restart=always
> RestartSec=2s
> MemoryLimit=1G
> StartLimitInterval=0
> [Install]
> WantedBy=default.target
> Enable and start the above service with the tool systemctl.
> 
> Some additional information:
> With strace attached to systemd before starting the service: systemd calls mkdir under /sys/fs/cgroup/memory for that service (/usr/bin/test). After the service stops, rmdir removes the corresponding entry under /sys/fs/cgroup/memory.
> With kprobes hooked on cgroup_mkdir and cgroup_rmdir: the numbers of calls to cgroup_mkdir and cgroup_rmdir are equal.
> With kprobe hook to (1)mem_cgroup_css_alloc (2)mem_cgroup_css_free (3)mem_cgroup_css_released (4)mem_cgroup_css_offline:
> the numbers of calls to mem_cgroup_css_alloc and mem_cgroup_css_offline are equal (call this number A)
> the numbers of calls to mem_cgroup_css_free and mem_cgroup_css_released are equal (call this number B)
> A > B
> With jprobe: we collected the addresses of some memcgs. Inspecting the live kernel with the crash tool: the flag bits of the refcnt member in memcg->css have changed to __PERCPU_REF_ATOMIC_DEAD, and memcg->css.refcnt.count keeps the same value as memcg->memory.count. After 24 hours, we observed that the data structure was still in use and that both counts were still 1.
> We wrote a kernel module that calls css_put() on a memcg whose counter is 1; nothing happened except that the struct was freed.
> We suspect the issue may be caused by incorrect calls to try_charge and cancel_charge. This is only a guess, though.
> Following is some inspection code we used as described above:
> kprobe code
> #include <linux/kernel.h>
> #include <linux/module.h>
> #include <linux/kprobes.h>
>  
>  
> static struct kprobe mmalloc = {
>     .symbol_name    = "mem_cgroup_css_alloc",
> };
>  
> static struct kprobe mmrealse = {
>     .symbol_name    = "mem_cgroup_css_free",
> };
> static struct kprobe mmmkdir = {
>     .symbol_name    = "mem_cgroup_css_released",
> };
> static struct kprobe mmrmdir = {
>     .symbol_name    = "mem_cgroup_css_offline",
> };
> atomic_t alloc;
> atomic_t realse;
> atomic_t cmkdir;
> atomic_t crmdir;
>  
>  
> static int handler_alloc_pre(struct kprobe *p, struct pt_regs *regs)
> {
>     atomic_inc(&alloc);
>     printk(KERN_INFO "alloc release %d offline %d alloc %d free %d\n",atomic_read(&cmkdir),atomic_read(&crmdir),atomic_read(&alloc),atomic_read(&realse));
>     return 0;
> }
> static int handler_realse_pre(struct kprobe *p,struct pt_regs *regs)
> {
>    atomic_inc(&realse);
>     printk(KERN_INFO "free release %d offline %d alloc %d free %d\n",atomic_read(&cmkdir),atomic_read(&crmdir),atomic_read(&alloc),atomic_read(&realse));
>     return 0;
> }
> static int handler_mkdir_pre(struct kprobe *p,struct pt_regs *regs)
> {
>    atomic_inc(&cmkdir);
>     printk(KERN_INFO "release release %d offline %d alloc %d free %d\n",atomic_read(&cmkdir),atomic_read(&crmdir),atomic_read(&alloc),atomic_read(&realse));
>     return 0;
> }
> static int handler_rmdir_pre(struct kprobe *p,struct pt_regs *regs)
> {
>    atomic_inc(&crmdir);
>     printk(KERN_INFO "offline release %d offline %d alloc %d free %d\n",atomic_read(&cmkdir),atomic_read(&crmdir),atomic_read(&alloc),atomic_read(&realse));
>     return 0;
> }
>  
>  
> static void handler_post(struct kprobe *p, struct pt_regs *regs,
>                 unsigned long flags)
> {
> }
>  
>  
> static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
> {
>     return 0;
> }
>  
> static int __init kprobe_init(void)
> {
>     int ret;
>     mmalloc.pre_handler = handler_alloc_pre;
>     mmalloc.post_handler = handler_post;
>     mmalloc.fault_handler = handler_fault;
>  
>     mmrealse.pre_handler = handler_realse_pre;
>     mmrealse.post_handler = handler_post;
>     mmrealse.fault_handler = handler_fault;
>  
>     mmmkdir.pre_handler = handler_mkdir_pre;
>     mmmkdir.post_handler = handler_post;
>     mmmkdir.fault_handler = handler_fault;
>  
>     mmrmdir.pre_handler = handler_rmdir_pre;
>     mmrmdir.post_handler = handler_post;
>     mmrmdir.fault_handler = handler_fault;
>  
>     atomic_set(&alloc,0);
>     atomic_set(&realse,0);
>     atomic_set(&cmkdir,0);
>     atomic_set(&crmdir,0);
>  
>     ret = register_kprobe(&mmalloc);
>     if (ret < 0) {
>         printk(KERN_INFO "register_kprobe failed, returned %d\n", ret);
>         return ret;
>     }
>     ret = register_kprobe(&mmrealse);
>     if (ret < 0) {
>         printk(KERN_INFO "register_kprobe failed, returned %d\n", ret);
>         return ret;
>     }
>     ret = register_kprobe(&mmmkdir);
>     if (ret < 0) {
>         printk(KERN_INFO "register_kprobe failed, returned %d\n", ret);
>         return ret;
>     }
>     ret = register_kprobe(&mmrmdir);
>     if (ret < 0) {
>         printk(KERN_INFO "register_kprobe failed, returned %d\n", ret);
>         return ret;
>     }
>     printk(KERN_INFO "Planted kprobe at %p\n", mmalloc.addr);
>     printk(KERN_INFO "Planted kprobe at %p\n", mmrealse.addr);
>     printk(KERN_INFO "Planted kprobe at %p\n", mmmkdir.addr);
>     printk(KERN_INFO "Planted kprobe at %p\n", mmrmdir.addr);
>     return 0;
> }
>  
> static void __exit kprobe_exit(void)
> {
>     unregister_kprobe(&mmalloc);
>     unregister_kprobe(&mmrealse);
>     unregister_kprobe(&mmmkdir);
>     unregister_kprobe(&mmrmdir);
>     printk(KERN_INFO "kprobe at %p unregistered\n", mmalloc.addr);
>     printk(KERN_INFO "kprobe at %p unregistered\n", mmrealse.addr);
>     printk(KERN_INFO "kprobe at %p unregistered\n", mmmkdir.addr);
>     printk(KERN_INFO "kprobe at %p unregistered\n", mmrmdir.addr);
> }
>  
> module_init(kprobe_init)
> module_exit(kprobe_exit)
> MODULE_LICENSE("GPL");
> jprobe code
> #include <linux/kernel.h>
> #include <linux/module.h>
> #include <linux/kprobes.h>
> #include <linux/cgroup-defs.h>
>  
> static void test(struct cgroup_subsys_state *css){
>     printk(KERN_INFO"memcg address %p refcnt %p !\n",css,(void *)css->refcnt.percpu_count_ptr);
>     jprobe_return();
>     return;
> }
> static struct jprobe my_jprobe = {
>     .entry          = test,
>     .kp = {
>         .symbol_name    = "mem_cgroup_css_offline",
>     },
> };
>  
> static int __init jprobe_init(void)
> {
>     int ret;
>  
>     ret = register_jprobe(&my_jprobe);
>     if (ret < 0) {
>         printk(KERN_INFO "register_jprobe failed, returned %d\n", ret);
>         return -1;
>     }
>     printk(KERN_INFO "Planted jprobe at %p, handler addr %p\n",
>            my_jprobe.kp.addr, my_jprobe.entry);
>     return 0;
> }
>  
> static void __exit jprobe_exit(void)
> {
>     unregister_jprobe(&my_jprobe);
>     printk(KERN_INFO "jprobe at %p unregistered\n", my_jprobe.kp.addr);
> }
>  
> module_init(jprobe_init)
> module_exit(jprobe_exit)
> MODULE_LICENSE("GPL");
> release kmod
> #include <linux/module.h>
> #include <linux/cgroup-defs.h>
> #include <linux/memcontrol.h>
> #include <linux/cgroup.h>
>  
> int mymsr_init (void)
> {
>     struct mem_cgroup *memcg_ptr=(void *)0xffff8c1986ff1000;
>     struct cgroup_subsys_state * css_ptr = &memcg_ptr->css;
>     css_put(css_ptr);
>     return 0;
> }
>  
> void mymsr_exit(void)
> {
> }
>  
> MODULE_AUTHOR("xuyun.xy");
> MODULE_LICENSE("GPL");
> module_init(mymsr_init);
> module_exit(mymsr_exit);
> crash information
> crash> struct mem_cgroup 0xffff8c1c43b86400
> struct mem_cgroup {
> css = {
> cgroup = 0xffff8c1c8a879000,
> ss = 0xffffffffac12aa40,
> refcnt = {
> count = {
> counter = 1
> },
> percpu_count_ptr = 67753193126051,
> release = 0xffffffffab112030,
> confirm_switch = 0x0,
> force_atomic = false,
> rcu = {
> next = 0xffff8c1c8a879038,
> func = 0xffffffffab37fe70
> }
> },
> sibling = {
> next = 0xffff8c1c8ab0f448,
> prev = 0xffff8c18584fac48
> },
> children = {
> next = 0xffff8c1c43b86458,
> prev = 0xffff8c1c43b86458
> },
> id = 34535,
> flags = 16,
> serial_nr = 314540,
> online_cnt = {
> counter = 0
> },
> callback_head = {
> next = 0x0,
> func = 0x0
> },
> destroy_work = {
> data = {
> counter = 960
> },
> entry = {
> next = 0xffff8c1c43b86498,
> prev = 0xffff8c1c43b86498
> },
> func = 0xffffffffab1141c0
> },
> parent = 0xffff8c1106f48800
> },
> id = {
> id = 0,
> ref = {
> counter = 0
> }
> },
> memory = {
> count = {
> counter = 1
> },
> limit = 262144,
> parent = 0xffff8c1106f488c0,
> watermark = 8045,
> 
> 
> crash> struct mem_cgroup 0xffff8c1986ff1000
> struct mem_cgroup {
> css = {
> cgroup = 0xffff8c196f533400,
> ss = 0xffffffffac12aa40,
> refcnt = {
> count = {
> counter = 1
> },
> percpu_count_ptr = 67756691197419,
> release = 0xffffffffab112030,
> confirm_switch = 0x0,
> force_atomic = false,
> rcu = {
> next = 0xffff8c196f533438,
> func = 0xffffffffab37fe70
> }
> },
> sibling = {
> next = 0xffff8c197a9fdc48,
> prev = 0xffff8c196ebbf048
> },
> children = {
> next = 0xffff8c1986ff1058,
> prev = 0xffff8c1986ff1058
> },
> id = 25717,
> flags = 16,
> serial_nr = 201081,
> online_cnt = {
> counter = 0
> },
> callback_head = {
> next = 0x0,
> func = 0x0
> },
> destroy_work = {
> data = {
> counter = 2432
> },
> entry = {
> next = 0xffff8c1986ff1098,
> prev = 0xffff8c1986ff1098
> },
> func = 0xffffffffab1141c0
> },
> parent = 0xffff8c1106f48800
> },
> id = {
> id = 0,
> ref = {
> counter = 0
> }
> },
> memory = {
> count = {
> counter = 1
> },
> limit = 262144,
> parent = 0xffff8c1106f488c0,
> watermark = 6067,
> failcnt = 0
> },
> swap = {
> count = {
> counter = 0
> },
> limit = 2251799813685247,
> parent = 0xffff8c1106f488e8,
> watermark = 0,
> failcnt = 0
> },
> memsw = {
> count = {
> counter = 1
> },
> limit = 2251799813685247,
> parent = 0xffff8c1106f48910,
> watermark = 6067,
> failcnt = 0
> },
> kmem = {
> count = {
> counter = 0
> },
> limit = 2251799813685247,
> parent = 0xffff8c1106f48938,
> watermark = 574,
> failcnt = 0
> },
> tcpmem = {
> count = {
> counter = 0
> },
> limit = 2251799813685247,
> parent = 0xffff8c1106f48960,
> watermark = 0,
> failcnt = 0
> },
> low = 0,
> high = 2251799813685247,
> high_work = {
> data = {
> counter = 68719476704
> },
> entry = {
> next = 0xffff8c1986ff11a0,
> prev = 0xffff8c1986ff11a0
> },
> func = 0xffffffffab217610
> },
> soft_limit = 2251799813685247,
> vmpressure = {
> scanned = 0,
> reclaimed = 0,
> tree_scanned = 0,
> tree_reclaimed = 0,
> sr_lock = {
> {
> rlock = {
> raw_lock = {
> val = {
> counter = 0
> }
> }
> }
> }
> },
> events = {
> next = 0xffff8c1986ff11e8,
> prev = 0xffff8c1986ff11e8
> },
> events_lock = {
> owner = {
> counter = 0
> },
> wait_lock = {
> {
> rlock = {
> raw_lock = {
> val = {
> counter = 0
> }
> }
> }
> }
> },
> osq = {
> tail = {
> counter = 0
> }
> },
> wait_list = {
> next = 0xffff8c1986ff1208,
> prev = 0xffff8c1986ff1208
> }
> },
> work = {
> data = {
> counter = 68719476704
> },
> entry = {
> next = 0xffff8c1986ff1220,
> prev = 0xffff8c1986ff1220
> },
> func = 0xffffffffab21e610
> }
> },
> use_hierarchy = true,
> oom_lock = false,
> under_oom = 0,
> swappiness = 0,
> oom_kill_disable = 0,
> events_file = {
> kn = 0x0
> },
> thresholds_lock = {
> owner = {
> counter = 0
> },
> wait_lock = {
> {
> rlock = {
> raw_lock = {
> val = {
> counter = 0
> }
> }
> }
> }
> },
> osq = {
> tail = {
> counter = 0
> }
> },
> wait_list = {
> next = 0xffff8c1986ff1260,
> prev = 0xffff8c1986ff1260
> }
> },
> thresholds = {
> primary = 0x0,
> spare = 0x0
> },
> memsw_thresholds = {
> primary = 0x0,
> spare = 0x0
> },
> oom_notify = {
> next = 0xffff8c1986ff1290,
> prev = 0xffff8c1986ff1290
> },
> move_charge_at_immigrate = 0,
> moving_account = {
> counter = 0
> },
> move_lock = {
> {
> rlock = {
> raw_lock = {
> val = {
> counter = 0
> }
> }
> }
> }
> },
> move_lock_task = 0x0,
> move_lock_flags = 0,
> stat = 0x3d9fd3426ea0,
> socket_pressure = 4333518749,
> tcpmem_active = false,
> tcpmem_pressure = 0,
> kmemcg_id = 8,
> kmem_state = KMEM_ALLOCATED,
> kmem_caches = {
> next = 0xffff8c1976f7cba0,
> prev = 0xffff8c0dac0401a0
> },
> last_scanned_node = 64,
> scan_nodes = {
> bits = {0}
> },
> numainfo_events = {
> counter = 53
> },
> numainfo_updating = {
> counter = 0
> },
> cgwb_list = {
> next = 0x0,
> prev = 0xffff8c1986ff1308
> },
> cgwb_domain = {
> lock = {
> {
> rlock = {
> raw_lock = {
> val = {
> counter = 0
> }
> }
> }
> }
> },
> completions = {
> events = {
> lock = {
> raw_lock = {
> val = {
> counter = 0
> }
> }
> },
> count = 1,
> list = {
> next = 0xffff8c196ebbf330,
> prev = 0xffff8c197a9fdf30
> },
> counters = 0x3d9fd5814ce8
> },
> period = 0,
> sequence = {
> sequence = 0
> }
> },
> period_timer = {
> entry = {
> next = 0x0,
> pprev = 0x0
> },
> expires = 0,
> function = 0xffffffffab1a3a10,
> data = 18446616639999775512,
> flags = 524326
> },
> period_time = 0,
> dirty_limit_tstamp = 4333518749,
> dirty_limit = 0
> },
> event_list = {
> next = 0xffff8c1986ff1398,
> prev = 0xffff8c1986ff1398
> },
> event_list_lock = {
> {
> rlock = {
> raw_lock = {
> val = {
> counter = 0
> }
> }
> }
> }
> },
> nodeinfo = 0xffff8c1986ff13b0
> }
> 
> 
> bytedance.net <http://bytedance.net/>
> 段熊春
> duanxiongchun@bytedance.com


Reply to: