
Bug#426029: serious performance regression in glibc-2.5 malloc



On Fri, May 25, 2007 at 09:38:24PM +0400, Alexey Mazurov wrote:
> Package: libc6
> Version: 2.5-8
> Severity: important
> Tags: patch fixed-upstream

  FWIW we plan to push glibc 2.6 "soon".

> 
> The malloc implementation in glibc-2.5 has a serious performance
> problem when there is a large number of free chunks. The bug was
> reported upstream at
> http://sources.redhat.com/bugzilla/show_bug.cgi?id=4349
> and is fixed in current CVS and in the glibc-2.6 release.
> The bug does not occur in Debian sarge's glibc-2.3.2.
> 
> Attached are a patch that fixes the bug, extracted from current glibc
> CVS, and a small test program (it takes the number of seconds to run
> the test as its argument).
> 
> To demonstrate how bad it is, look at the results of the test program
> on a Xeon 5160 with 32 GB of RAM:
> stock libc:
> 72 mallocs per second
> patched libc:
> 2251354 mallocs per second

> --- glibc-2.5/malloc/malloc.c	2006-09-07 20:06:02.000000000 +0400
> +++ glibc-2.6/malloc/malloc.c	2007-05-22 21:21:24.000000000 +0400
> @@ -1,5 +1,5 @@
>  /* Malloc implementation for multiple threads without lock contention.
> -   Copyright (C) 1996-2002,2003,2004,2005,2006 Free Software Foundation, Inc.
> +   Copyright (C) 1996-2006, 2007 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>     Contributed by Wolfram Gloger <wg@malloc.de>
>     and Doug Lea <dl@cs.oswego.edu>, 2001.
> @@ -27,10 +27,6 @@
>    based on:
>    VERSION 2.7.0 Sun Mar 11 14:14:06 2001  Doug Lea  (dl at gee)
>  
> -   Note: There may be an updated version of this malloc obtainable at
> -           http://www.malloc.de/malloc/ptmalloc2.tar.gz
> -         Check before installing!
> -
>  * Quickstart
>  
>    In order to compile this implementation, a Makefile is provided with
> @@ -1781,6 +1777,10 @@
>  
>    struct malloc_chunk* fd;         /* double links -- used only if free. */
>    struct malloc_chunk* bk;
> +
> +  /* Only used for large blocks: pointer to next larger size.  */
> +  struct malloc_chunk* fd_nextsize; /* double links -- used only if free. */
> +  struct malloc_chunk* bk_nextsize;
>  };
>  
>  
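
  The two new pointers are the heart of the fix: they thread a second,
  size-ordered list through each large bin, linking only the first
  chunk of every distinct size.  The old code kept a large bin as one
  flat sorted list, so filing a freed chunk could mean stepping past
  every other free chunk first; with the extra list, insertion and
  best-fit search hop from size to size instead.  A minimal standalone
  sketch of the idea (toy names and a simplified non-circular,
  singly-linked layout; glibc's lists are circular and doubly linked):

    #include <stdio.h>
    #include <stddef.h>

    struct chunk {
      size_t size;
      struct chunk *fd;           /* every free chunk, sorted descending */
      struct chunk *fd_nextsize;  /* next smaller distinct size, or NULL */
    };

    int main(void) {
      struct chunk c[5] = { { 0 } };
      struct chunk *p;
      int i;

      /* one 1024-byte chunk, three 768s, one 512 */
      c[0].size = 1024;
      c[1].size = c[2].size = c[3].size = 768;
      c[4].size = 512;
      for (i = 0; i < 4; i++)
        c[i].fd = &c[i + 1];
      /* only the first chunk of each size is on the nextsize list */
      c[0].fd_nextsize = &c[1];
      c[1].fd_nextsize = &c[4];

      printf("flat list:     ");
      for (p = &c[0]; p != NULL; p = p->fd)
        printf("%lu ", (unsigned long) p->size);
      printf("\nnextsize list: ");
      for (p = &c[0]; p != NULL; p = p->fd_nextsize)
        printf("%lu ", (unsigned long) p->size);
      printf("\n");
      return 0;
    }
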
> @@ -1881,7 +1881,7 @@
>  #define mem2chunk(mem) ((mchunkptr)((char*)(mem) - 2*SIZE_SZ))
>  
>  /* The smallest possible chunk */
> -#define MIN_CHUNK_SIZE        (sizeof(struct malloc_chunk))
> +#define MIN_CHUNK_SIZE        (offsetof(struct malloc_chunk, fd_nextsize))
>  
>  /* The smallest size we can malloc is an aligned minimal chunk */
>  
> @@ -2081,6 +2081,24 @@
>    else {                                                               \
>      FD->bk = BK;                                                       \
>      BK->fd = FD;                                                       \
> +    if (!in_smallbin_range (P->size)				       \
> +	&& __builtin_expect (P->fd_nextsize != NULL, 0)) {	       \
> +      assert (P->fd_nextsize->bk_nextsize == P);		       \
> +      assert (P->bk_nextsize->fd_nextsize == P);		       \
> +      if (FD->fd_nextsize == NULL) {				       \
> +	if (P->fd_nextsize == P)				       \
> +	  FD->fd_nextsize = FD->bk_nextsize = FD;		       \
> +	else {							       \
> +	  FD->fd_nextsize = P->fd_nextsize;			       \
> +	  FD->bk_nextsize = P->bk_nextsize;			       \
> +	  P->fd_nextsize->bk_nextsize = FD;			       \
> +	  P->bk_nextsize->fd_nextsize = FD;			       \
> +	}							       \
> +      }	else {							       \
> +	P->fd_nextsize->bk_nextsize = P->bk_nextsize;		       \
> +	P->bk_nextsize->fd_nextsize = P->fd_nextsize;		       \
> +      }								       \
> +    }								       \
>    }                                                                    \
>  }
>  
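
  The new branch in unlink() is what keeps that second list consistent
  when an arbitrary chunk leaves a bin.  Written out as a function over
  the same toy struct (a sketch of the macro's logic, not glibc code),
  the three cases are easier to see:

    #include <assert.h>
    #include <stddef.h>

    struct chunk {
      size_t size;
      struct chunk *fd, *bk;                   /* flat bin list */
      struct chunk *fd_nextsize, *bk_nextsize; /* per-size skip list */
    };

    /* p is being removed; fd is p's successor on the flat list. */
    static void unlink_nextsize(struct chunk *p, struct chunk *fd) {
      if (p->fd_nextsize == NULL)
        return;                    /* case 1: p was never threaded */
      if (fd->fd_nextsize == NULL) {
        /* case 2: fd is an unthreaded chunk of the same size --
           promote it to p's place on the nextsize list */
        if (p->fd_nextsize == p)
          fd->fd_nextsize = fd->bk_nextsize = fd;  /* only size in bin */
        else {
          fd->fd_nextsize = p->fd_nextsize;
          fd->bk_nextsize = p->bk_nextsize;
          p->fd_nextsize->bk_nextsize = fd;
          p->bk_nextsize->fd_nextsize = fd;
        }
      } else {
        /* case 3: p was the only chunk of its size -- splice it out */
        p->fd_nextsize->bk_nextsize = p->bk_nextsize;
        p->bk_nextsize->fd_nextsize = p->fd_nextsize;
      }
    }

    int main(void) {
      struct chunk a = { 0 }, b = { 0 };
      a.size = b.size = 768;
      a.fd = &b; b.bk = &a;
      a.fd_nextsize = a.bk_nextsize = &a;  /* a is its size's entry */
      unlink_nextsize(&a, &b);             /* remove a, keep b */
      assert(b.fd_nextsize == &b && b.bk_nextsize == &b);
      return 0;
    }
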
> @@ -2107,22 +2125,37 @@
>  
>  #define NBINS             128
>  #define NSMALLBINS         64
> -#define SMALLBIN_WIDTH      8
> -#define MIN_LARGE_SIZE    512
> +#define SMALLBIN_WIDTH    MALLOC_ALIGNMENT
> +#define MIN_LARGE_SIZE    (NSMALLBINS * SMALLBIN_WIDTH)
>  
>  #define in_smallbin_range(sz)  \
>    ((unsigned long)(sz) < (unsigned long)MIN_LARGE_SIZE)
>  
> -#define smallbin_index(sz)     (((unsigned)(sz)) >> 3)
> +#define smallbin_index(sz) \
> +  (SMALLBIN_WIDTH == 16 ? (((unsigned)(sz)) >> 4) : (((unsigned)(sz)) >> 3))
>  
> -#define largebin_index(sz)                                                   \
> -(((((unsigned long)(sz)) >>  6) <= 32)?  56 + (((unsigned long)(sz)) >>  6): \
> +#define largebin_index_32(sz)                                                \
> +(((((unsigned long)(sz)) >>  6) <= 38)?  56 + (((unsigned long)(sz)) >>  6): \
>   ((((unsigned long)(sz)) >>  9) <= 20)?  91 + (((unsigned long)(sz)) >>  9): \
>   ((((unsigned long)(sz)) >> 12) <= 10)? 110 + (((unsigned long)(sz)) >> 12): \
>   ((((unsigned long)(sz)) >> 15) <=  4)? 119 + (((unsigned long)(sz)) >> 15): \
>   ((((unsigned long)(sz)) >> 18) <=  2)? 124 + (((unsigned long)(sz)) >> 18): \
>                                          126)
>  
> +// XXX It remains to be seen whether it is good to keep the widths of
> +// XXX the buckets the same or whether it should be scaled by a factor
> +// XXX of two as well.
> +#define largebin_index_64(sz)                                                \
> +(((((unsigned long)(sz)) >>  6) <= 48)?  48 + (((unsigned long)(sz)) >>  6): \
> + ((((unsigned long)(sz)) >>  9) <= 20)?  91 + (((unsigned long)(sz)) >>  9): \
> + ((((unsigned long)(sz)) >> 12) <= 10)? 110 + (((unsigned long)(sz)) >> 12): \
> + ((((unsigned long)(sz)) >> 15) <=  4)? 119 + (((unsigned long)(sz)) >> 15): \
> + ((((unsigned long)(sz)) >> 18) <=  2)? 124 + (((unsigned long)(sz)) >> 18): \
> +                                        126)
> +
> +#define largebin_index(sz) \
> +  (SIZE_SZ == 8 ? largebin_index_64 (sz) : largebin_index_32 (sz))
> +
>  #define bin_index(sz) \
>   ((in_smallbin_range(sz)) ? smallbin_index(sz) : largebin_index(sz))
>  
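
  The bin geometry change follows from SIZE_SZ: on 64-bit targets
  MALLOC_ALIGNMENT is 16, so the small bins become 16 bytes wide and
  MIN_LARGE_SIZE grows to 1024, while 32-bit targets keep the old
  8/512 layout.  The index formulas are easy to sanity-check with a
  little table generator (the 32-bit variants copied from the patch;
  the 64-bit one only changes the first row to 48 + (sz >> 6)):

    #include <stdio.h>

    #define MIN_LARGE_SIZE 512            /* 64 small bins * 8 bytes */
    #define in_smallbin_range(sz) \
      ((unsigned long)(sz) < (unsigned long)MIN_LARGE_SIZE)
    #define smallbin_index(sz) (((unsigned)(sz)) >> 3)
    #define largebin_index_32(sz)                                               \
    (((((unsigned long)(sz)) >>  6) <= 38)?  56 + (((unsigned long)(sz)) >>  6):\
     ((((unsigned long)(sz)) >>  9) <= 20)?  91 + (((unsigned long)(sz)) >>  9):\
     ((((unsigned long)(sz)) >> 12) <= 10)? 110 + (((unsigned long)(sz)) >> 12):\
     ((((unsigned long)(sz)) >> 15) <=  4)? 119 + (((unsigned long)(sz)) >> 15):\
     ((((unsigned long)(sz)) >> 18) <=  2)? 124 + (((unsigned long)(sz)) >> 18):\
                                            126)
    #define bin_index(sz) \
      (in_smallbin_range(sz) ? smallbin_index(sz) : largebin_index_32(sz))

    int main(void) {
      static const unsigned long sizes[] =
        { 16, 64, 504, 512, 1024, 4096, 65536, 1UL << 20 };
      unsigned i;
      for (i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
        printf("size %8lu -> bin %3d\n", sizes[i], (int) bin_index(sizes[i]));
      return 0;
    }
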
> @@ -2536,7 +2569,7 @@
>  #if HAVE_MMAP
>      /* address is outside main heap  */
>      if (contiguous(av) && av->top != initial_top(av)) {
> -      assert(((char*)p) < min_address || ((char*)p) > max_address);
> +      assert(((char*)p) < min_address || ((char*)p) >= max_address);
>      }
>      /* chunk is page-aligned */
>      assert(((p->prev_size + sz) & (mp_.pagesize-1)) == 0);
> @@ -2741,8 +2774,19 @@
>    for (i = 0; i < NFASTBINS; ++i) {
>      p = av->fastbins[i];
>  
> +    /* The following test can only be performed for the main arena.
> +       While mallopt calls malloc_consolidate to get rid of all fast
> +       bins (especially those larger than the new maximum) this does
> +       only happen for the main arena.  Trying to do this for any
> +       other arena would mean those arenas have to be locked and
> +       malloc_consolidate be called for them.  This is excessive.  And
> +       even if this is acceptable to somebody it still cannot solve
> +       the problem completely since if the arena is locked a
> +       concurrent malloc call might create a new arena which then
> +       could use the newly invalid fast bins.  */
> +
>      /* all bins past max_fast are empty */
> -    if (i > max_fast_bin)
> +    if (av == &main_arena && i > max_fast_bin)
>        assert(p == 0);
>  
>      while (p != 0) {
> @@ -2786,7 +2830,31 @@
>          /* lists are sorted */
>          assert(p->bk == b ||
>                 (unsigned long)chunksize(p->bk) >= (unsigned long)chunksize(p));
> -      }
> +
> +	if (!in_smallbin_range(size))
> +	  {
> +	    if (p->fd_nextsize != NULL)
> +	      {
> +		if (p->fd_nextsize == p)
> +		  assert (p->bk_nextsize == p);
> +		else
> +		  {
> +		    if (p->fd_nextsize == first (b))
> +		      assert (chunksize (p) < chunksize (p->fd_nextsize));
> +		    else
> +		      assert (chunksize (p) > chunksize (p->fd_nextsize));
> +
> +		    if (p == first (b))
> +		      assert (chunksize (p) > chunksize (p->bk_nextsize));
> +		    else
> +		      assert (chunksize (p) < chunksize (p->bk_nextsize));
> +		  }
> +	      }
> +	    else
> +	      assert (p->bk_nextsize == NULL);
> +	  }
> +      } else if (!in_smallbin_range(size))
> +	assert (p->fd_nextsize == NULL && p->bk_nextsize == NULL);
>        /* chunk is followed by a legal chain of inuse chunks */
>        for (q = next_chunk(p);
>             (q != av->top && inuse(q) &&
> @@ -2805,7 +2873,6 @@
>    assert(total <= (unsigned long)(mp_.max_total_mem));
>    assert(mp_.n_mmaps >= 0);
>  #endif
> -  assert(mp_.n_mmaps <= mp_.n_mmaps_max);
>    assert(mp_.n_mmaps <= mp_.max_n_mmaps);
>  
>    assert((unsigned long)(av->system_mem) <=
> @@ -2885,7 +2952,13 @@
>        is one SIZE_SZ unit larger than for normal chunks, because there
>        is no following chunk whose prev_size field could be used.
>      */
> +#if 1
> +    /* See the front_misalign handling below, for glibc there is no
> +       need for further alignments.  */
> +    size = (nb + SIZE_SZ + pagemask) & ~pagemask;
> +#else
>      size = (nb + SIZE_SZ + MALLOC_ALIGN_MASK + pagemask) & ~pagemask;
> +#endif
>      tried_mmap = true;
>  
>      /* Don't try if size wraps around 0 */
> @@ -2903,6 +2976,12 @@
>            address argument for later munmap in free() and realloc().
>          */
>  
> +#if 1
> +	/* For glibc, chunk2mem increases the address by 2*SIZE_SZ and
> +	   MALLOC_ALIGN_MASK is 2*SIZE_SZ-1.  Each mmap'ed area is page
> +	   aligned and therefore definitely MALLOC_ALIGN_MASK-aligned.  */
> +        assert (((INTERNAL_SIZE_T)chunk2mem(mm) & MALLOC_ALIGN_MASK) == 0);
> +#else
>          front_misalign = (INTERNAL_SIZE_T)chunk2mem(mm) & MALLOC_ALIGN_MASK;
>          if (front_misalign > 0) {
>            correction = MALLOC_ALIGNMENT - front_misalign;
> @@ -2910,10 +2989,12 @@
>            p->prev_size = correction;
>            set_head(p, (size - correction) |IS_MMAPPED);
>          }
> -        else {
> -          p = (mchunkptr)mm;
> -          set_head(p, size|IS_MMAPPED);
> -        }
> +        else
> +#endif
> +	  {
> +	    p = (mchunkptr)mm;
> +	    set_head(p, size|IS_MMAPPED);
> +	  }
>  
>          /* update statistics */
>  
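
  The #if 1 block trades the old runtime fixup for an assert: inside
  glibc MALLOC_ALIGNMENT is 2*SIZE_SZ, chunk2mem() advances by exactly
  2*SIZE_SZ, and mmap returns page-aligned memory, so the user pointer
  is always aligned and the extra MALLOC_ALIGN_MASK padding in the
  size computation was pure waste.  The arithmetic in isolation (an
  illustration, not glibc code):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const uintptr_t size_sz = sizeof(size_t);      /* SIZE_SZ */
      const uintptr_t align_mask = 2 * size_sz - 1;  /* MALLOC_ALIGN_MASK */
      const uintptr_t pagesize = 4096;
      uintptr_t mm;

      /* any page-aligned address is 2*SIZE_SZ-aligned because
         2*SIZE_SZ divides the page size... */
      for (mm = 0; mm < 16 * pagesize; mm += pagesize) {
        uintptr_t mem = mm + 2 * size_sz;            /* chunk2mem(mm) */
        /* ...so the user pointer needs no correction */
        assert((mem & align_mask) == 0);
      }
      puts("page-aligned mmap => chunk2mem is always aligned");
      return 0;
    }
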
> @@ -4097,7 +4178,6 @@
>    for(;;) {
>  
>      int iters = 0;
> -    bool any_larger = false;
>      while ( (victim = unsorted_chunks(av)->bk) != unsorted_chunks(av)) {
>        bck = victim->bk;
>        if (__builtin_expect (victim->size <= 2 * SIZE_SZ, 0)
> @@ -4125,6 +4205,11 @@
>          unsorted_chunks(av)->bk = unsorted_chunks(av)->fd = remainder;
>          av->last_remainder = remainder;
>          remainder->bk = remainder->fd = unsorted_chunks(av);
> +	if (!in_smallbin_range(remainder_size))
> +	  {
> +	    remainder->fd_nextsize = NULL;
> +	    remainder->bk_nextsize = NULL;
> +	  }
>  
>          set_head(victim, nb | PREV_INUSE |
>  		 (av != &main_arena ? NON_MAIN_ARENA : 0));
> @@ -4173,19 +4258,36 @@
>            size |= PREV_INUSE;
>            /* if smaller than smallest, bypass loop below */
>  	  assert((bck->bk->size & NON_MAIN_ARENA) == 0);
> -          if ((unsigned long)(size) <= (unsigned long)(bck->bk->size)) {
> +	  if ((unsigned long)(size) < (unsigned long)(bck->bk->size)) {
>              fwd = bck;
>              bck = bck->bk;
> +
> +	    victim->fd_nextsize = fwd->fd;
> +	    victim->bk_nextsize = fwd->fd->bk_nextsize;
> +	    fwd->fd->bk_nextsize = victim->bk_nextsize->fd_nextsize = victim;
>            }
>            else {
>  	    assert((fwd->size & NON_MAIN_ARENA) == 0);
> -            while ((unsigned long)(size) < (unsigned long)(fwd->size)) {
> -              fwd = fwd->fd;
> -	      assert((fwd->size & NON_MAIN_ARENA) == 0);
> -	    }
> -            bck = fwd->bk;
> +	    while ((unsigned long) size < fwd->size)
> +	      {
> +		fwd = fwd->fd_nextsize;
> +		assert((fwd->size & NON_MAIN_ARENA) == 0);
> +	      }
> +
> +	    if ((unsigned long) size == (unsigned long) fwd->size)
> +	      /* Always insert in the second position.  */
> +	      fwd = fwd->fd;
> +	    else
> +	      {
> +		victim->fd_nextsize = fwd;
> +		victim->bk_nextsize = fwd->bk_nextsize;
> +		fwd->bk_nextsize = victim;
> +		victim->bk_nextsize->fd_nextsize = victim;
> +	      }
> +	    bck = fwd->bk;
>            }
> -        }
> +	} else
> +	  victim->fd_nextsize = victim->bk_nextsize = victim;
>        }
>  
>        mark_bin(av, victim_index);
> @@ -4194,8 +4296,6 @@
>        fwd->bk = victim;
>        bck->fd = victim;
>  
> -      if (size >= nb + MINSIZE)
> -	any_larger = true;
>  #define MAX_ITERS	10000
>        if (++iters >= MAX_ITERS)
>  	break;
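
  This hunk is one half of the actual fix: when the unsorted-chunks
  loop files a large chunk into its bin, it now advances by
  fd_nextsize (one step per distinct size) instead of fd (one step per
  chunk), and a chunk whose size is already present goes in *second*
  position so none of the nextsize links have to change.  The same
  logic over the toy struct (a sketch: glibc's bins are circular with
  the bin header as sentinel, and the smaller-than-smallest case is
  handled by the bypass branch before the loop):

    #include <stdio.h>
    #include <stddef.h>

    struct chunk {
      size_t size;
      struct chunk *fd;           /* flat list, descending size order */
      struct chunk *fd_nextsize;  /* next smaller distinct size, or NULL */
    };

    /* Return the chunk before which a new chunk of size sz belongs. */
    static struct chunk *find_pos(struct chunk *largest, size_t sz) {
      struct chunk *fwd = largest;
      while (fwd->fd_nextsize != NULL && sz < fwd->size)
        fwd = fwd->fd_nextsize;   /* hop over whole runs of one size */
      if (sz == fwd->size)
        return fwd->fd;           /* equal size: second position, the
                                     threaded first-of-size stays put */
      return fwd;
    }

    int main(void) {
      struct chunk c[5] = { { 0 } };
      int i;
      c[0].size = 1024;
      c[1].size = c[2].size = c[3].size = 768;
      c[4].size = 512;
      for (i = 0; i < 4; i++)
        c[i].fd = &c[i + 1];
      c[0].fd_nextsize = &c[1];
      c[1].fd_nextsize = &c[4];

      printf("new 768 inserts at index %d (second of the 768 run)\n",
             (int)(find_pos(c, 768) - c));
      printf("new 600 inserts at index %d (before the 512 chunk)\n",
             (int)(find_pos(c, 600) - c));
      return 0;
    }
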
> @@ -4203,21 +4303,25 @@
>  
>      /*
>        If a large request, scan through the chunks of current bin in
> -      sorted order to find smallest that fits.  This is the only step
> -      where an unbounded number of chunks might be scanned without doing
> -      anything useful with them. However the lists tend to be short.
> +      sorted order to find smallest that fits.  Use the skip list for this.
>      */
>  
>      if (!in_smallbin_range(nb)) {
>        bin = bin_at(av, idx);
>  
>        /* skip scan if empty or largest chunk is too small */
> -      if ((victim = last(bin)) != bin &&
> -          (unsigned long)(first(bin)->size) >= (unsigned long)(nb)) {
> +      if ((victim = first(bin)) != bin &&
> +          (unsigned long)(victim->size) >= (unsigned long)(nb)) {
>  
> +	victim = victim->bk_nextsize;
>          while (((unsigned long)(size = chunksize(victim)) <
>                  (unsigned long)(nb)))
> -          victim = victim->bk;
> +          victim = victim->bk_nextsize;
> +
> +	/* Avoid removing the first entry for a size so that the skip
> +	   list does not have to be rerouted.  */
> +	if (victim != last(bin) && victim->size == victim->fd->size)
> +	  victim = victim->fd;
>  
>          remainder_size = size - nb;
>          unlink(victim, bck, fwd);
> @@ -4239,6 +4343,11 @@
>  	  remainder->fd = fwd;
>  	  bck->fd = remainder;
>  	  fwd->bk = remainder;
> +	  if (!in_smallbin_range(remainder_size))
> +	    {
> +	      remainder->fd_nextsize = NULL;
> +	      remainder->bk_nextsize = NULL;
> +	    }
>            set_head(victim, nb | PREV_INUSE |
>  		   (av != &main_arena ? NON_MAIN_ARENA : 0));
>            set_head(remainder, remainder_size | PREV_INUSE);
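
  And this hunk is the other half: best-fit now starts from the
  smallest size and walks bk_nextsize upward until a chunk fits, again
  one step per distinct size, and then prefers the second chunk of an
  equal-size run so the skip list needs no rerouting.  As a sketch
  against the toy layout (glibc really starts at
  first(bin)->bk_nextsize of the circular list, having already checked
  that the largest chunk is big enough):

    #include <stdio.h>
    #include <stddef.h>

    struct chunk {
      size_t size;
      struct chunk *fd;            /* next chunk, descending sizes */
      struct chunk *bk_nextsize;   /* next larger distinct size, or NULL */
    };

    static struct chunk *best_fit(struct chunk *smallest, size_t nb) {
      struct chunk *victim = smallest;
      while (victim != NULL && victim->size < nb)
        victim = victim->bk_nextsize;        /* one hop per size */
      if (victim != NULL && victim->fd != NULL
          && victim->fd->size == victim->size)
        victim = victim->fd;   /* take the second of the run so the
                                  threaded first entry stays in place */
      return victim;           /* NULL: nothing in this bin fits */
    }

    int main(void) {
      struct chunk c[5] = { { 0 } };
      int i;
      c[0].size = 1024;
      c[1].size = c[2].size = c[3].size = 768;
      c[4].size = 512;
      for (i = 0; i < 4; i++)
        c[i].fd = &c[i + 1];
      c[4].bk_nextsize = &c[1];    /* 512 -> 768 -> 1024, ascending */
      c[1].bk_nextsize = &c[0];

      printf("request 600 -> chunk %d (size %lu, second of its run)\n",
             (int)(best_fit(&c[4], 600) - c),
             (unsigned long) best_fit(&c[4], 600)->size);
      printf("request 900 -> chunk %d (size %lu)\n",
             (int)(best_fit(&c[4], 900) - c),
             (unsigned long) best_fit(&c[4], 900)->size);
      return 0;
    }
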
> @@ -4308,9 +4417,7 @@
>          remainder_size = size - nb;
>  
>          /* unlink */
> -        bck = victim->bk;
> -        bin->bk = bck;
> -        bck->fd = bin;
> +        unlink(victim, bck, fwd);
>  
>          /* Exhaust */
>          if (remainder_size < MINSIZE) {
> @@ -4335,7 +4442,11 @@
>            /* advertise as last remainder */
>            if (in_smallbin_range(nb))
>              av->last_remainder = remainder;
> -
> +	  if (!in_smallbin_range(remainder_size))
> +	    {
> +	      remainder->fd_nextsize = NULL;
> +	      remainder->bk_nextsize = NULL;
> +	    }
>            set_head(victim, nb | PREV_INUSE |
>  		   (av != &main_arena ? NON_MAIN_ARENA : 0));
>            set_head(remainder, remainder_size | PREV_INUSE);
> @@ -4558,8 +4669,13 @@
>  
>        bck = unsorted_chunks(av);
>        fwd = bck->fd;
> -      p->bk = bck;
>        p->fd = fwd;
> +      p->bk = bck;
> +      if (!in_smallbin_range(size))
> +	{
> +	  p->fd_nextsize = NULL;
> +	  p->bk_nextsize = NULL;
> +	}
>        bck->fd = p;
>        fwd->bk = p;
>  
> @@ -4684,7 +4800,15 @@
>        reused anyway.
>      */
>  
> +#if 0
> +    /* It is wrong to limit the fast bins to search using get_max_fast
> +       because, except for the main arena, all the others might have
> +       blocks in the high fast bins.  It's not worth it anyway, just
> +       search all bins all the time.  */
>      maxfb = &(av->fastbins[fastbin_index(get_max_fast ())]);
> +#else
> +    maxfb = &(av->fastbins[NFASTBINS - 1]);
> +#endif
>      fb = &(av->fastbins[0]);
>      do {
>        if ( (p = *fb) != 0) {
> @@ -4719,6 +4843,11 @@
>              unsorted_bin->fd = p;
>              first_unsorted->bk = p;
>  
> +            if (!in_smallbin_range (size)) {
> +	      p->fd_nextsize = NULL;
> +	      p->bk_nextsize = NULL;
> +	    }
> +
>              set_head(p, size | PREV_INUSE);
>              p->bk = unsorted_bin;
>              p->fd = first_unsorted;
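
  One convention shows up in four hunks above (free(),
  malloc_consolidate() and both remainder paths): whenever a large
  chunk lands on the unsorted list, its nextsize pointers are set to
  NULL, which is precisely the "not threaded yet" state the new
  unlink() and sorting code test for.  In miniature (my names, just
  the convention):

    #include <assert.h>
    #include <stddef.h>

    struct chunk { struct chunk *fd_nextsize, *bk_nextsize; };

    /* every path that files a large chunk into the unsorted list
       clears both pointers; unlink() reads fd_nextsize == NULL as
       "skip the skip-list bookkeeping" */
    static void enter_unsorted(struct chunk *p) {
      p->fd_nextsize = p->bk_nextsize = NULL;
    }

    int main(void) {
      struct chunk c;
      c.fd_nextsize = c.bk_nextsize = &c;   /* stale links from a bin */
      enter_unsorted(&c);
      assert(c.fd_nextsize == NULL && c.bk_nextsize == NULL);
      return 0;
    }
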

> /* _GNU_SOURCE is needed for drand48_r() and struct drand48_data */
> #define _GNU_SOURCE
> #include <stdlib.h>
> #include <sys/time.h>
> #include <stdio.h>
> #include <signal.h>
> 
> 
> #define NUM_CH (16 * 1024 * 1024)
> /* renamed from SIZE_MAX, which is a standard macro in <stdint.h> */
> #define MAX_SZ (1024)
> 
> void *p[NUM_CH];
> 
> struct drand48_data data;
> 
> unsigned int rndm(unsigned int max) {
> 	double res;
> 	drand48_r(&data, &res);
> 	return res * max;
> }
> 
> /* written from the signal handler, so it must be volatile sig_atomic_t */
> volatile sig_atomic_t work = 1;
> 
> void alarm_handler(int i __attribute__((unused))) {
> 	work = 0;
> }
> 
> int main(int argc, char *argv[]) {
> 	unsigned long i = 0;
> 	struct timeval t0, t1, tdiff;
> 	struct itimerval itimer;
> 	int t;
> 
> 	if(argc < 2 || (t = atoi(argv[1])) <= 0) {
> 		fprintf(stderr, "Usage: %s <time>\n", argv[0]);
> 		return 1;
> 	}
> 
> 	/* seed explicitly; glibc also accepts the zero-initialized state */
> 	srand48_r(0x1234, &data);
> 
> 	fputs("Preparing: ", stdout); fflush(stdout);
> 
> 	gettimeofday(&t0, NULL);
> 
> 	for(i = 0; i < NUM_CH; ++i) {
> 		p[i] = malloc(rndm(MAX_SZ));
> 	}
> 
> 	for(i = 0; i < NUM_CH; i += 2) {
> 		free(p[i]); p[i] = NULL;
> 	}
> 
> 	gettimeofday(&t1, NULL);
> 	timersub(&t1, &t0, &tdiff);
> 
> 	fprintf(stdout, "%ld.%03ld\n", (long) tdiff.tv_sec, (long) tdiff.tv_usec / 1000);
> 
> 	fprintf(stdout, "Testing:"); fflush(stdout);
> 
> 	itimer.it_interval.tv_sec = 0;
> 	itimer.it_interval.tv_usec = 0;
> 
> 	itimer.it_value.tv_sec = t;
> 	itimer.it_value.tv_usec = 0;
> 
> 	signal(SIGALRM, &alarm_handler);
> 	setitimer(ITIMER_REAL, &itimer, NULL);
> 
> 	gettimeofday(&t0, NULL);
> 
> 	for(i = 0; i < NUM_CH && work; i += 2) {
> 		p[i] = malloc(rndm(MAX_SZ));
> 	}
> 
> 	gettimeofday(&t1, NULL);
> 	timersub(&t1, &t0, &tdiff);
> 
> 	/* use fractional seconds (avoids division by zero and the
> 	   "%u" vs unsigned long format mismatch of the original) */
> 	fprintf(stdout, " %.0f mallocs per second\n",
> 		(i / 2) / (tdiff.tv_sec + tdiff.tv_usec / 1e6 + 1e-9));
> 	return 0;
> }
> 
> #if 0	/* alternative mixed alloc/free workload, left disabled */
> 		void **ptr = p + rndm(NUM_CH);
> 		if(*ptr) { free(*ptr); *ptr = NULL; }
> 		else { *ptr = malloc(rndm(MAX_SZ)); }
> #endif
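
  For anyone who wants to reproduce the numbers: note that the program
  needs _GNU_SOURCE for drand48_r (added above), and that with 8-byte
  pointers the p[] array alone is 128 MiB while the 16M chunks of
  average size ~512 bytes add up to roughly 8 GiB, so use a machine
  with plenty of RAM.  Something like (file name is mine):

    gcc -O2 -o malloc-bench malloc-bench.c
    ./malloc-bench 30

  The interesting figure is the final mallocs-per-second line, with
  the stock and the patched libc6.
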


-- 
·O·  Pierre Habouzit
··O                                                madcoder@debian.org
OOO                                                http://www.madism.org
