[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#463354: Incremental patching of Packages is slow



Package: apt
Version: 0.7.10

apt-get applies incremental patches to the Packages files in an exceedingly slow
way.  This is caused by several factors, such as the calculation of three
cryptographic hashes (even though they are discarded most of the time), and
slowness of the ed script interpreter.

I have attached a patch to make the ed interpreter several times faster when
mmap is available.

-- 
Morten Hustveit
http://www.junoplay.com/
--- ./debian/apt-0.7.10/methods/rred.cc	2007-11-12 20:48:22.000000000 +0100
+++ rred-mortehu.cc	2008-01-31 03:29:38.469512044 +0100
@@ -5,6 +5,8 @@
 #include <apt-pkg/hashes.h>
 
 #include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/uio.h>
 #include <unistd.h>
 #include <utime.h>
 #include <stdio.h>
@@ -22,13 +24,25 @@
 
 const char *Prog;
 
+#define IOV_COUNT 1024 /* Don't really want IOV_MAX since it can be arbitrarily large */
+
+struct EdCommand
+{
+  size_t data_start;
+  size_t data_end;
+  size_t data_lines;
+  size_t first_line;
+  size_t last_line;
+  int type;
+};
+
 class RredMethod : public pkgAcqMethod
 {
    bool Debug;
    // the size of this doesn't really matter (except for performance)    
    const static int BUF_SIZE = 1024;
    // the ed commands
-   enum Mode {MODE_CHANGED, MODE_DELETED, MODE_ADDED};
+   enum Mode {MODE_CHANGED='c', MODE_DELETED='d', MODE_ADDED='a'};
    // return values
    enum State {ED_OK, ED_ORDERING, ED_PARSER, ED_FAILURE};
    // this applies a single hunk, it uses a tail recursion to 
@@ -37,6 +51,8 @@
       char *buffer, unsigned int bufsize, Hashes *hash);
    // apply a patch file
    int ed_file(FILE *ed_cmds, FILE *in_file, FILE *out_file, Hashes *hash);
+   // apply a patch file
+   int ed_mmap(const char* ed_cmds, off_t ed_cmds_size, const char* in_file, off_t in_file_size, int out_file, Hashes *hash);
    // the methods main method
    virtual bool Fetch(FetchItem *Itm);
    
@@ -186,6 +202,197 @@
    return ED_OK;
 }
 
+int RredMethod::ed_mmap(const char* ed_cmds, off_t ed_cmds_size, const char* in_file, off_t in_file_size, int out_file, Hashes *hash) {
+   EdCommand* commands = 0;
+   size_t command_count = 0;
+   size_t command_alloc = 0;
+
+   const char* begin;
+   const char* end;
+   const char* ed_end = ed_cmds + ed_cmds_size;
+
+   const char* input = in_file;
+   const char* input_end = in_file + in_file_size;
+
+   size_t i;
+
+   begin = ed_cmds;
+   end = begin;
+
+   /* 1. Parse entire script.  It is executed in reverse order, so we cather it
+    *    in the `commands' buffer first
+    */
+
+   for(;;) {
+      EdCommand cmd;
+      int ch;
+      const char* tmp;
+
+      while(begin != ed_end && *begin == '\n')
+         ++begin;
+      while(end != ed_end && *end != '\n')
+         ++end;
+      if(end == ed_end && begin == end)
+         break;
+
+      /* Determine command type, and abort if it isn't "a", "c", or "d" */
+
+      cmd.type = end[-1];
+
+      if(cmd.type != MODE_ADDED && cmd.type != MODE_CHANGED && cmd.type != MODE_DELETED) {
+         free(commands);
+         return ED_FAILURE;
+      }
+
+      /* Determine command range */
+
+      tmp = begin;
+
+      for(;;) {
+         /* atoll is safe despite lacking NUL-termination; we know there's an
+          * alphabetic character at end[-1]
+          */
+
+         if(tmp == end) {
+            cmd.first_line = atoll(begin);
+            cmd.last_line = cmd.first_line;
+            break;
+         }
+         if(*tmp == ',') {
+            cmd.first_line = atoll(begin);
+            cmd.last_line = atoll(tmp + 1);
+            break;
+         }
+         ++tmp;
+      }
+
+      /* Determine the size of the inserted text, so we don't have to scan this
+       * text again later.
+       */
+
+      begin = end + 1;
+      end = begin;
+      cmd.data_lines = 0;
+
+      if(cmd.type == MODE_ADDED || cmd.type == MODE_CHANGED) {
+         cmd.data_start = begin - ed_cmds;
+         while(end != ed_end) {
+            if(*end == '\n') {
+               if(end[-1] == '.' && end[-2] == '\n')
+                  break;
+
+               ++cmd.data_lines;
+            }
+
+            ++end;
+         }
+         cmd.data_end = end - ed_cmds - 1;
+
+         begin = end + 1;
+         end = begin;
+      }
+      if(command_count == command_alloc) {
+         command_alloc = (command_alloc + 64) * 3 / 2;
+         commands = (EdCommand*) realloc(commands, command_alloc * sizeof(EdCommand));
+      }
+
+      commands[command_count++] = cmd;
+   }
+
+   struct iovec* iov = new struct iovec[IOV_COUNT];
+   size_t iov_size = 0;
+
+   size_t amount, remaining;
+   size_t line = 1;
+   EdCommand* cmd;
+
+   /* 2. Execute script.  We gather writes in a `struct iov' array, and flush
+    *    using writev to minimize the number of system calls.  Data is read
+    *    directly from the memory mappings of the input file and the script.
+    */
+
+   for(i = command_count; i-- > 0; ) {
+      cmd = &commands[i];
+      if(cmd->type == MODE_ADDED)
+         amount = cmd->first_line + 1;
+      else
+         amount = cmd->first_line;
+
+      if(line < amount)
+      {
+         begin = input;
+         while(line != amount) {
+            input = (const char*) memchr(input, '\n', input_end - input);
+            if(!input)
+               break;
+            ++line;
+            ++input;
+         }
+
+         iov[iov_size].iov_base = (void*) begin;
+         iov[iov_size].iov_len = input - begin;
+         hash->Add((const unsigned char*) begin, input - begin);
+
+         if(++iov_size == IOV_COUNT) {
+            writev(out_file, iov, IOV_COUNT);
+            iov_size = 0;
+         }
+      }
+
+      if(cmd->type == MODE_DELETED || cmd->type == MODE_CHANGED) {
+         remaining = (cmd->last_line - cmd->first_line) + 1;
+         line += remaining;
+         while(remaining) {
+            input = (const char*) memchr(input, '\n', input_end - input);
+            if(!input)
+               break;
+
+            --remaining;
+            ++input;
+         }
+      }
+
+      if(cmd->type == MODE_CHANGED || cmd->type == MODE_ADDED) {
+         if(cmd->data_end != cmd->data_start) {
+            iov[iov_size].iov_base = (void*) (ed_cmds + cmd->data_start);
+            iov[iov_size].iov_len = cmd->data_end - cmd->data_start;
+            hash->Add((const unsigned char*) (ed_cmds + cmd->data_start),
+                      iov[iov_size].iov_len);
+
+            if(++iov_size == IOV_COUNT) {
+               writev(out_file, iov, IOV_COUNT);
+               iov_size = 0;
+            }
+         }
+      }
+   }
+
+   if(input != input_end) {
+      iov[iov_size].iov_base = (void*) input;
+      iov[iov_size].iov_len = input_end - input;
+      hash->Add((const unsigned char*) input, input_end - input);
+
+      ++iov_size;
+   }
+
+   if(iov_size) {
+      writev(out_file, iov, iov_size);
+      iov_size = 0;
+   }
+
+   for(i = 0; i < iov_size; i += IOV_COUNT) {
+      if(iov_size - i < IOV_COUNT)
+         writev(out_file, iov + i, iov_size - i);
+      else
+         writev(out_file, iov + i, IOV_COUNT);
+   }
+
+   delete [] iov;
+   free(commands);
+
+   return ED_OK;
+}
+
 
 bool RredMethod::Fetch(FetchItem *Itm)
 {
@@ -211,19 +418,49 @@
       return false;
    
    Hashes Hash;
-   FILE* fFrom = fdopen(From.Fd(), "r");
-   FILE* fPatch = fdopen(Patch.Fd(), "r");
-   FILE* fTo = fdopen(To.Fd(), "w");
-   // now do the actual patching
-   if (ed_file(fPatch, fFrom, fTo, &Hash) != ED_OK) {
-     _error->Errno("rred", _("Could not patch file"));  
-      return false;
+   off_t ed_cmds_size = lseek(Patch.Fd(), 0, SEEK_END);
+   off_t in_file_size = lseek(From.Fd(), 0, SEEK_END);
+   void* ed_cmds_map;
+   void* in_file_map;
+
+   ed_cmds_map = mmap(0, ed_cmds_size, PROT_READ, MAP_SHARED, Patch.Fd(), 0);
+   if(ed_cmds_map != MAP_FAILED) {
+      in_file_map = mmap(0, in_file_size, PROT_READ, MAP_SHARED, From.Fd(), 0);
+      if(in_file_map == MAP_FAILED) {
+         munmap(ed_cmds_map, ed_cmds_size);
+         ed_cmds_map = MAP_FAILED;
+      }
+   }
+
+   if(ed_cmds_map != MAP_FAILED) {
+      // now do the actual patching
+      if (ed_mmap((const char*) ed_cmds_map, ed_cmds_size,
+                  (const char*) in_file_map, in_file_size, To.Fd(), &Hash)) {
+         munmap(ed_cmds_map, ed_cmds_size);
+         munmap(in_file_map, in_file_size);
+        _error->Errno("rred", _("Could not patch file"));  
+         return false;
+      }
+      munmap(ed_cmds_map, ed_cmds_size);
+      munmap(in_file_map, in_file_size);
+   }
+   else {
+      lseek(Patch.Fd(), 0, SEEK_SET);
+      lseek(From.Fd(), 0, SEEK_SET);
+      FILE* fFrom = fdopen(From.Fd(), "r");
+      FILE* fPatch = fdopen(Patch.Fd(), "r");
+      FILE* fTo = fdopen(To.Fd(), "w");
+      // now do the actual patching
+      if (ed_file(fPatch, fFrom, fTo, &Hash) != ED_OK) {
+        _error->Errno("rred", _("Could not patch file"));  
+         return false;
+      }
+      // write out the result
+      fflush(fFrom);
+      fflush(fPatch);
+      fflush(fTo);
    }
 
-   // write out the result
-   fflush(fFrom);
-   fflush(fPatch);
-   fflush(fTo);
    From.Close();
    Patch.Close();
    To.Close();

Reply to: