
Bug#426029: serious performance regression in glibc-2.5 malloc



Package: libc6
Version: 2.5-8
Severity: important
Tags: patch fixed-upstream

The malloc implementation in glibc-2.5 has a serious performance
problem when a large number of free chunks accumulate: inserting a
chunk into a size-sorted (large) bin walks the list chunk by chunk,
so with millions of free chunks every malloc degenerates to O(n).
The bug was reported upstream as
http://sources.redhat.com/bugzilla/show_bug.cgi?id=4349
and is fixed in current CVS and in the glibc-2.6 release.
It does not occur in the Debian Sarge glibc-2.3.2.

Attached are a patch that fixes this bug, extracted from current
glibc CVS, and a small test program (it takes the number of seconds
to run the test as its argument).

To demonstrate how bad the regression is, here are the results of the
test program on a Xeon 5160 with 32 GB of RAM:
stock libc:
72 mallocs per second
patched libc:
2251354 mallocs per second
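
The cause is easiest to see in miniature.  The patch gives every free
chunk in a large bin two extra pointers (fd_nextsize/bk_nextsize)
that link together only the first chunk of each distinct size, so a
walk over the bin costs one step per distinct size instead of one
step per chunk.  The toy model below (a sketch only, not the glibc
data structure; all names are invented) shows the difference for a
best-fit scan over a bin holding many equal-sized chunks; the patched
insertion loop (fwd = fwd->fd_nextsize) relies on exactly the same
skipping:

#include <stdio.h>
#include <stdlib.h>

struct node {
	size_t size;              /* chunk size */
	struct node *next;        /* every node, sizes descending */
	struct node *nextsize;    /* head of next smaller run (run heads only) */
};

/* Naive best fit: visits every node with size >= want. */
static struct node *fit_naive(struct node *h, size_t want, long *hops)
{
	struct node *best = NULL;
	for (struct node *c = h; c && c->size >= want; c = c->next) {
		best = c;
		++*hops;
	}
	return best;
}

/* Skip-list best fit: visits one node per distinct size. */
static struct node *fit_skip(struct node *h, size_t want, long *hops)
{
	struct node *best = NULL;
	for (struct node *c = h; c && c->size >= want; c = c->nextsize) {
		best = c;
		++*hops;
	}
	return best;
}

int main(void)
{
	enum { RUNS = 32, PER_RUN = 1000 };
	struct node *head = NULL, *tail = NULL, *prev_run = NULL;

	/* Build 32 distinct sizes (1024 down to 528 in steps of 16),
	   1000 chunks of each, sorted descending as in a large bin.  */
	for (int r = 0; r < RUNS; r++) {
		for (int k = 0; k < PER_RUN; k++) {
			struct node *n = calloc(1, sizeof *n);
			if (!n) return 1;
			n->size = 1024 - 16 * (size_t)r;
			if (tail) tail->next = n; else head = n;
			tail = n;
			if (k == 0) {	/* run head: hook into the size list */
				if (prev_run) prev_run->nextsize = n;
				prev_run = n;
			}
		}
	}

	long naive = 0, skip = 0;
	struct node *a = fit_naive(head, 600, &naive);
	struct node *b = fit_skip(head, 600, &skip);
	printf("best fit: %zu bytes; %ld hops naive, %ld via nextsize\n",
	       a->size, naive, skip);
	return a->size != b->size;
}

On this input the naive scan makes 27000 hops (27 qualifying sizes
times 1000 chunks each) where the nextsize walk makes 27; the same
kind of blow-up, on a much larger heap, is what separates the stock
and patched numbers above.
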
--- glibc-2.5/malloc/malloc.c	2006-09-07 20:06:02.000000000 +0400
+++ glibc-2.6/malloc/malloc.c	2007-05-22 21:21:24.000000000 +0400
@@ -1,5 +1,5 @@
 /* Malloc implementation for multiple threads without lock contention.
-   Copyright (C) 1996-2002,2003,2004,2005,2006 Free Software Foundation, Inc.
+   Copyright (C) 1996-2006, 2007 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Wolfram Gloger <wg@malloc.de>
    and Doug Lea <dl@cs.oswego.edu>, 2001.
@@ -27,10 +27,6 @@
   based on:
   VERSION 2.7.0 Sun Mar 11 14:14:06 2001  Doug Lea  (dl at gee)
 
-   Note: There may be an updated version of this malloc obtainable at
-           http://www.malloc.de/malloc/ptmalloc2.tar.gz
-         Check before installing!
-
 * Quickstart
 
   In order to compile this implementation, a Makefile is provided with
@@ -1781,6 +1777,10 @@
 
   struct malloc_chunk* fd;         /* double links -- used only if free. */
   struct malloc_chunk* bk;
+
+  /* Only used for large blocks: pointer to next larger size.  */
+  struct malloc_chunk* fd_nextsize; /* double links -- used only if free. */
+  struct malloc_chunk* bk_nextsize;
 };
 
 
@@ -1881,7 +1881,7 @@
 #define mem2chunk(mem) ((mchunkptr)((char*)(mem) - 2*SIZE_SZ))
 
 /* The smallest possible chunk */
-#define MIN_CHUNK_SIZE        (sizeof(struct malloc_chunk))
+#define MIN_CHUNK_SIZE        (offsetof(struct malloc_chunk, fd_nextsize))
 
 /* The smallest size we can malloc is an aligned minimal chunk */
 
@@ -2081,6 +2081,24 @@
   else {                                                               \
     FD->bk = BK;                                                       \
     BK->fd = FD;                                                       \
+    if (!in_smallbin_range (P->size)				       \
+	&& __builtin_expect (P->fd_nextsize != NULL, 0)) {	       \
+      assert (P->fd_nextsize->bk_nextsize == P);		       \
+      assert (P->bk_nextsize->fd_nextsize == P);		       \
+      if (FD->fd_nextsize == NULL) {				       \
+	if (P->fd_nextsize == P)				       \
+	  FD->fd_nextsize = FD->bk_nextsize = FD;		       \
+	else {							       \
+	  FD->fd_nextsize = P->fd_nextsize;			       \
+	  FD->bk_nextsize = P->bk_nextsize;			       \
+	  P->fd_nextsize->bk_nextsize = FD;			       \
+	  P->bk_nextsize->fd_nextsize = FD;			       \
+	}							       \
+      }	else {							       \
+	P->fd_nextsize->bk_nextsize = P->bk_nextsize;		       \
+	P->bk_nextsize->fd_nextsize = P->fd_nextsize;		       \
+      }								       \
+    }								       \
   }                                                                    \
 }
 
@@ -2107,22 +2125,37 @@
 
 #define NBINS             128
 #define NSMALLBINS         64
-#define SMALLBIN_WIDTH      8
-#define MIN_LARGE_SIZE    512
+#define SMALLBIN_WIDTH    MALLOC_ALIGNMENT
+#define MIN_LARGE_SIZE    (NSMALLBINS * SMALLBIN_WIDTH)
 
 #define in_smallbin_range(sz)  \
   ((unsigned long)(sz) < (unsigned long)MIN_LARGE_SIZE)
 
-#define smallbin_index(sz)     (((unsigned)(sz)) >> 3)
+#define smallbin_index(sz) \
+  (SMALLBIN_WIDTH == 16 ? (((unsigned)(sz)) >> 4) : (((unsigned)(sz)) >> 3))
 
-#define largebin_index(sz)                                                   \
-(((((unsigned long)(sz)) >>  6) <= 32)?  56 + (((unsigned long)(sz)) >>  6): \
+#define largebin_index_32(sz)                                                \
+(((((unsigned long)(sz)) >>  6) <= 38)?  56 + (((unsigned long)(sz)) >>  6): \
  ((((unsigned long)(sz)) >>  9) <= 20)?  91 + (((unsigned long)(sz)) >>  9): \
  ((((unsigned long)(sz)) >> 12) <= 10)? 110 + (((unsigned long)(sz)) >> 12): \
  ((((unsigned long)(sz)) >> 15) <=  4)? 119 + (((unsigned long)(sz)) >> 15): \
  ((((unsigned long)(sz)) >> 18) <=  2)? 124 + (((unsigned long)(sz)) >> 18): \
                                         126)
 
+// XXX It remains to be seen whether it is good to keep the widths of
+// XXX the buckets the same or whether it should be scaled by a factor
+// XXX of two as well.
+#define largebin_index_64(sz)                                                \
+(((((unsigned long)(sz)) >>  6) <= 48)?  48 + (((unsigned long)(sz)) >>  6): \
+ ((((unsigned long)(sz)) >>  9) <= 20)?  91 + (((unsigned long)(sz)) >>  9): \
+ ((((unsigned long)(sz)) >> 12) <= 10)? 110 + (((unsigned long)(sz)) >> 12): \
+ ((((unsigned long)(sz)) >> 15) <=  4)? 119 + (((unsigned long)(sz)) >> 15): \
+ ((((unsigned long)(sz)) >> 18) <=  2)? 124 + (((unsigned long)(sz)) >> 18): \
+                                        126)
+
+#define largebin_index(sz) \
+  (SIZE_SZ == 8 ? largebin_index_64 (sz) : largebin_index_32 (sz))
+
 #define bin_index(sz) \
  ((in_smallbin_range(sz)) ? smallbin_index(sz) : largebin_index(sz))
 
@@ -2536,7 +2569,7 @@
 #if HAVE_MMAP
     /* address is outside main heap  */
     if (contiguous(av) && av->top != initial_top(av)) {
-      assert(((char*)p) < min_address || ((char*)p) > max_address);
+      assert(((char*)p) < min_address || ((char*)p) >= max_address);
     }
     /* chunk is page-aligned */
     assert(((p->prev_size + sz) & (mp_.pagesize-1)) == 0);
@@ -2741,8 +2774,19 @@
   for (i = 0; i < NFASTBINS; ++i) {
     p = av->fastbins[i];
 
+    /* The following test can only be performed for the main arena.
+       While mallopt calls malloc_consolidate to get rid of all fast
+       bins (especially those larger than the new maximum) this does
+       only happen for the main arena.  Trying to do this for any
+       other arena would mean those arenas have to be locked and
+       malloc_consolidate be called for them.  This is excessive.  And
+       even if this is acceptable to somebody it still cannot solve
+       the problem completely since if the arena is locked a
+       concurrent malloc call might create a new arena which then
+       could use the newly invalid fast bins.  */
+
     /* all bins past max_fast are empty */
-    if (i > max_fast_bin)
+    if (av == &main_arena && i > max_fast_bin)
       assert(p == 0);
 
     while (p != 0) {
@@ -2786,7 +2830,31 @@
         /* lists are sorted */
         assert(p->bk == b ||
                (unsigned long)chunksize(p->bk) >= (unsigned long)chunksize(p));
-      }
+
+	if (!in_smallbin_range(size))
+	  {
+	    if (p->fd_nextsize != NULL)
+	      {
+		if (p->fd_nextsize == p)
+		  assert (p->bk_nextsize == p);
+		else
+		  {
+		    if (p->fd_nextsize == first (b))
+		      assert (chunksize (p) < chunksize (p->fd_nextsize));
+		    else
+		      assert (chunksize (p) > chunksize (p->fd_nextsize));
+
+		    if (p == first (b))
+		      assert (chunksize (p) > chunksize (p->bk_nextsize));
+		    else
+		      assert (chunksize (p) < chunksize (p->bk_nextsize));
+		  }
+	      }
+	    else
+	      assert (p->bk_nextsize == NULL);
+	  }
+      } else if (!in_smallbin_range(size))
+	assert (p->fd_nextsize == NULL && p->bk_nextsize == NULL);
       /* chunk is followed by a legal chain of inuse chunks */
       for (q = next_chunk(p);
            (q != av->top && inuse(q) &&
@@ -2805,7 +2873,6 @@
   assert(total <= (unsigned long)(mp_.max_total_mem));
   assert(mp_.n_mmaps >= 0);
 #endif
-  assert(mp_.n_mmaps <= mp_.n_mmaps_max);
   assert(mp_.n_mmaps <= mp_.max_n_mmaps);
 
   assert((unsigned long)(av->system_mem) <=
@@ -2885,7 +2952,13 @@
       is one SIZE_SZ unit larger than for normal chunks, because there
       is no following chunk whose prev_size field could be used.
     */
+#if 1
+    /* See the front_misalign handling below, for glibc there is no
+       need for further alignments.  */
+    size = (nb + SIZE_SZ + pagemask) & ~pagemask;
+#else
     size = (nb + SIZE_SZ + MALLOC_ALIGN_MASK + pagemask) & ~pagemask;
+#endif
     tried_mmap = true;
 
     /* Don't try if size wraps around 0 */
@@ -2903,6 +2976,12 @@
           address argument for later munmap in free() and realloc().
         */
 
+#if 1
+	/* For glibc, chunk2mem increases the address by 2*SIZE_SZ and
+	   MALLOC_ALIGN_MASK is 2*SIZE_SZ-1.  Each mmap'ed area is page
+	   aligned and therefore definitely MALLOC_ALIGN_MASK-aligned.  */
+        assert (((INTERNAL_SIZE_T)chunk2mem(mm) & MALLOC_ALIGN_MASK) == 0);
+#else
         front_misalign = (INTERNAL_SIZE_T)chunk2mem(mm) & MALLOC_ALIGN_MASK;
         if (front_misalign > 0) {
           correction = MALLOC_ALIGNMENT - front_misalign;
@@ -2910,10 +2989,12 @@
           p->prev_size = correction;
           set_head(p, (size - correction) |IS_MMAPPED);
         }
-        else {
-          p = (mchunkptr)mm;
-          set_head(p, size|IS_MMAPPED);
-        }
+        else
+#endif
+	  {
+	    p = (mchunkptr)mm;
+	    set_head(p, size|IS_MMAPPED);
+	  }
 
         /* update statistics */
 
@@ -4097,7 +4178,6 @@
   for(;;) {
 
     int iters = 0;
-    bool any_larger = false;
     while ( (victim = unsorted_chunks(av)->bk) != unsorted_chunks(av)) {
       bck = victim->bk;
       if (__builtin_expect (victim->size <= 2 * SIZE_SZ, 0)
@@ -4125,6 +4205,11 @@
         unsorted_chunks(av)->bk = unsorted_chunks(av)->fd = remainder;
         av->last_remainder = remainder;
         remainder->bk = remainder->fd = unsorted_chunks(av);
+	if (!in_smallbin_range(remainder_size))
+	  {
+	    remainder->fd_nextsize = NULL;
+	    remainder->bk_nextsize = NULL;
+	  }
 
         set_head(victim, nb | PREV_INUSE |
 		 (av != &main_arena ? NON_MAIN_ARENA : 0));
@@ -4173,19 +4258,36 @@
           size |= PREV_INUSE;
           /* if smaller than smallest, bypass loop below */
 	  assert((bck->bk->size & NON_MAIN_ARENA) == 0);
-          if ((unsigned long)(size) <= (unsigned long)(bck->bk->size)) {
+	  if ((unsigned long)(size) < (unsigned long)(bck->bk->size)) {
             fwd = bck;
             bck = bck->bk;
+
+	    victim->fd_nextsize = fwd->fd;
+	    victim->bk_nextsize = fwd->fd->bk_nextsize;
+	    fwd->fd->bk_nextsize = victim->bk_nextsize->fd_nextsize = victim;
           }
           else {
 	    assert((fwd->size & NON_MAIN_ARENA) == 0);
-            while ((unsigned long)(size) < (unsigned long)(fwd->size)) {
-              fwd = fwd->fd;
-	      assert((fwd->size & NON_MAIN_ARENA) == 0);
-	    }
-            bck = fwd->bk;
+	    while ((unsigned long) size < fwd->size)
+	      {
+		fwd = fwd->fd_nextsize;
+		assert((fwd->size & NON_MAIN_ARENA) == 0);
+	      }
+
+	    if ((unsigned long) size == (unsigned long) fwd->size)
+	      /* Always insert in the second position.  */
+	      fwd = fwd->fd;
+	    else
+	      {
+		victim->fd_nextsize = fwd;
+		victim->bk_nextsize = fwd->bk_nextsize;
+		fwd->bk_nextsize = victim;
+		victim->bk_nextsize->fd_nextsize = victim;
+	      }
+	    bck = fwd->bk;
           }
-        }
+	} else
+	  victim->fd_nextsize = victim->bk_nextsize = victim;
       }
 
       mark_bin(av, victim_index);
@@ -4194,8 +4296,6 @@
       fwd->bk = victim;
       bck->fd = victim;
 
-      if (size >= nb + MINSIZE)
-	any_larger = true;
 #define MAX_ITERS	10000
       if (++iters >= MAX_ITERS)
 	break;
@@ -4203,21 +4303,25 @@
 
     /*
       If a large request, scan through the chunks of current bin in
-      sorted order to find smallest that fits.  This is the only step
-      where an unbounded number of chunks might be scanned without doing
-      anything useful with them. However the lists tend to be short.
+      sorted order to find smallest that fits.  Use the skip list for this.
     */
 
     if (!in_smallbin_range(nb)) {
       bin = bin_at(av, idx);
 
       /* skip scan if empty or largest chunk is too small */
-      if ((victim = last(bin)) != bin &&
-          (unsigned long)(first(bin)->size) >= (unsigned long)(nb)) {
+      if ((victim = first(bin)) != bin &&
+          (unsigned long)(victim->size) >= (unsigned long)(nb)) {
 
+	victim = victim->bk_nextsize;
         while (((unsigned long)(size = chunksize(victim)) <
                 (unsigned long)(nb)))
-          victim = victim->bk;
+          victim = victim->bk_nextsize;
+
+	/* Avoid removing the first entry for a size so that the skip
+	   list does not have to be rerouted.  */
+	if (victim != last(bin) && victim->size == victim->fd->size)
+	  victim = victim->fd;
 
         remainder_size = size - nb;
         unlink(victim, bck, fwd);
@@ -4239,6 +4343,11 @@
 	  remainder->fd = fwd;
 	  bck->fd = remainder;
 	  fwd->bk = remainder;
+	  if (!in_smallbin_range(remainder_size))
+	    {
+	      remainder->fd_nextsize = NULL;
+	      remainder->bk_nextsize = NULL;
+	    }
           set_head(victim, nb | PREV_INUSE |
 		   (av != &main_arena ? NON_MAIN_ARENA : 0));
           set_head(remainder, remainder_size | PREV_INUSE);
@@ -4308,9 +4417,7 @@
         remainder_size = size - nb;
 
         /* unlink */
-        bck = victim->bk;
-        bin->bk = bck;
-        bck->fd = bin;
+        unlink(victim, bck, fwd);
 
         /* Exhaust */
         if (remainder_size < MINSIZE) {
@@ -4335,7 +4442,11 @@
           /* advertise as last remainder */
           if (in_smallbin_range(nb))
             av->last_remainder = remainder;
-
+	  if (!in_smallbin_range(remainder_size))
+	    {
+	      remainder->fd_nextsize = NULL;
+	      remainder->bk_nextsize = NULL;
+	    }
           set_head(victim, nb | PREV_INUSE |
 		   (av != &main_arena ? NON_MAIN_ARENA : 0));
           set_head(remainder, remainder_size | PREV_INUSE);
@@ -4558,8 +4669,13 @@
 
       bck = unsorted_chunks(av);
       fwd = bck->fd;
-      p->bk = bck;
       p->fd = fwd;
+      p->bk = bck;
+      if (!in_smallbin_range(size))
+	{
+	  p->fd_nextsize = NULL;
+	  p->bk_nextsize = NULL;
+	}
       bck->fd = p;
       fwd->bk = p;
 
@@ -4684,7 +4800,15 @@
       reused anyway.
     */
 
+#if 0
+    /* It is wrong to limit the fast bins to search using get_max_fast
+       because, except for the main arena, all the others might have
+       blocks in the high fast bins.  It's not worth it anyway, just
+       search all bins all the time.  */
     maxfb = &(av->fastbins[fastbin_index(get_max_fast ())]);
+#else
+    maxfb = &(av->fastbins[NFASTBINS - 1]);
+#endif
     fb = &(av->fastbins[0]);
     do {
       if ( (p = *fb) != 0) {
@@ -4719,6 +4843,11 @@
             unsorted_bin->fd = p;
             first_unsorted->bk = p;
 
+            if (!in_smallbin_range (size)) {
+	      p->fd_nextsize = NULL;
+	      p->bk_nextsize = NULL;
+	    }
+
             set_head(p, size | PREV_INUSE);
             p->bk = unsorted_bin;
             p->fd = first_unsorted;
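
/* Test program from the bug report: build a heap containing roughly 8
   million free chunks, then measure how many mallocs per second the
   allocator manages while refilling them.  Takes the number of seconds
   to run the timed phase as its only argument.  */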
#define _GNU_SOURCE		/* for drand48_r() and struct drand48_data */
#include <stdlib.h>
#include <sys/time.h>
#include <stdio.h>
#include <signal.h>


#define NUM_CH (16 * 1024 * 1024)
#define MAX_CHUNK (1024)	/* upper bound (exclusive) on allocation size */

void *p[NUM_CH];

struct drand48_data data;

/* Uniform random integer in [0, max). */
unsigned int rndm(unsigned int max) {
	double res;
	drand48_r(&data, &res);
	return res * max;
}

int work = 1;

void alarm_handler(int i __attribute__((unused))) {
	work = 0;
}

int main(int argc, char *argv[]) {
	unsigned long i = 0;
	struct timeval t0, t1, tdiff;
	struct itimerval itimer;
	int t;

	if(argc < 2 || (t = atoi(argv[1])) <= 0) {
		fprintf(stderr, "Usage: %s <time>\n", argv[0]);
		return 1;
	}

	fputs("Preparing: ", stdout); fflush(stdout);

	gettimeofday(&t0, NULL);

	/* Fill the heap with 16M chunks of random size, then free every
	   other one, leaving ~8 million free chunks parked in the bins.  */
	for(i = 0; i < NUM_CH; ++i) {
		p[i] = malloc(rndm(MAX_CHUNK));
	}

	for(i = 0; i < NUM_CH; i += 2) {
		free(p[i]); p[i] = NULL;
	}

	gettimeofday(&t1, NULL);
	timersub(&t1, &t0, &tdiff);

	fprintf(stdout, "%ld.%03lu\n", tdiff.tv_sec, tdiff.tv_usec / 1000);

	fprintf(stdout, "Testing:"); fflush(stdout);

	itimer.it_interval.tv_sec = 0;
	itimer.it_interval.tv_usec = 0;

	itimer.it_value.tv_sec = t;
	itimer.it_value.tv_usec = 0;

	signal(SIGALRM, &alarm_handler);
	setitimer(ITIMER_REAL, &itimer, NULL);

	gettimeofday(&t0, NULL);

	/* Refill the freed slots.  With the stock glibc-2.5 allocator each
	   of these mallocs rescans the big sorted bins, which is where the
	   slowdown shows up.  */
	for(i = 0; i < NUM_CH && work; i += 2) {
		p[i] = malloc(rndm(MAX_CHUNK));
	}

	gettimeofday(&t1, NULL);
	timersub(&t1, &t0, &tdiff);

	fprintf(stdout, " %u mallocs per second\n", i/2/tdiff.tv_sec);
	return 0;
}

/* Alternative inner loop for the timing test: randomly free or
   allocate a random slot instead of refilling every other one.  */
#if 0
		void **ptr = p + rndm(NUM_CH);
		if(*ptr) { free(*ptr); *ptr = NULL; }
		else { *ptr = malloc(rndm(MAX_CHUNK)); }
#endif
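
To reproduce (assuming the test program is saved as malloc-test.c):

  gcc -O2 -o malloc-test malloc-test.c
  ./malloc-test 10

Note that the preparation phase allocates on the order of 8 GB (16M
chunks averaging ~512 bytes), so a machine with plenty of RAM is
needed, as in the 32 GB configuration above.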
