[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

better network error handling



Hi,

I was looking into the problem that apt is not very good at knowing
when acquire-items can not be acquired because of e.g. network
failures. 

Apt will still try to get all of them. In the worst case that means
that each item times out after 120sec. For Releases.gpg, Release,
Packages.bz2, Packages.gz per componenent that quickly adds up :)
The installer people seem to be especially interessted in detecting
failures quickly without too much delay.

Attached is a patch that changes this behaviour. A method can return a
"FailReason" (e.g. host returned connection refused, ResolverFailure,
Timeout, currently only used in methods/connect.cc). The Acquire code
then will use this to set the queue into a permanent error state if
the queue is in QueueHost mode. This means that all other items trying
to be fetched from the queue are discarded without further trying. The
reasoning is that if a connection was refused once, it's unlikely that
it will work on the next connect a couple of seconds later.

I would like to get feedback on the patch because the aquire code is
one of the more fragile areas of libapt and I would really appreciate
feedback from people like Jason, Matt, Daniel etc about the approach
taken (it's also available as a baz branch in my archive as
apt--per-queue-failure--0).

Cheers, 
 Michael

-- 
Linux is not The Answer. Yes is the answer. Linux is The Question. - Neo
--- orig/apt-pkg/acquire-item.cc
+++ mod/apt-pkg/acquire-item.cc
@@ -400,16 +400,10 @@
    string Final = _config->FindDir("Dir::State::lists") + URItoFileName(RealURI);
    unlink(Final.c_str());
 
-   // if we get a timeout if fail
-   if(LookupTag(Message,"FailReason") == "Timeout" || 
-      LookupTag(Message,"FailReason") == "TmpResolveFailure") {
-      Item::Failed(Message,Cnf);
-      return;
-   }
-
    // queue a pkgAcqMetaIndex with no sigfile
-   new pkgAcqMetaIndex(Owner, MetaIndexURI, MetaIndexURIDesc, MetaIndexShortDesc,
-		       "", IndexTargets, MetaIndexParser);
+   new pkgAcqMetaIndex(Owner, MetaIndexURI, MetaIndexURIDesc, 
+		       MetaIndexShortDesc, "", IndexTargets, 
+		       MetaIndexParser);
 
    if (Cnf->LocalOnly == true || 
        StringToBool(LookupTag(Message,"Transient-Failure"),false) == false)
@@ -697,6 +691,10 @@
    // No Release file was present, or verification failed, so fall
    // back to queueing Packages files without verification
    QueueIndexes(false);
+
+   Status = StatDone;
+   Complete = false;
+   Dequeue();
 }
 
 									/*}}}*/


--- orig/apt-pkg/acquire-item.h
+++ mod/apt-pkg/acquire-item.h
@@ -48,7 +48,8 @@
    public:
 
    // State of the item
-   enum {StatIdle, StatFetching, StatDone, StatError, StatAuthError} Status;
+   enum {StatIdle, StatFetching, StatDone, StatError, StatAuthError, 
+	 StatPermError} Status;
    string ErrorText;
    unsigned long FileSize;
    unsigned long PartialSize;   


--- orig/apt-pkg/acquire-worker.cc
+++ mod/apt-pkg/acquire-worker.cc
@@ -306,6 +306,12 @@
 	    
 	    pkgAcquire::Item *Owner = Itm->Owner;
 	    pkgAcquire::ItemDesc Desc = *Itm;
+
+	    if(LookupTag(Message,"FailReason") == "Timeout" || 
+	       LookupTag(Message,"FailReason") == "TmpResolveFailure" ||
+	       LookupTag(Message,"FailReason") == "ConnectionRefused")
+	       Owner->Status = pkgAcquire::Item::StatPermError;
+
 	    OwnerQ->ItemDone(Itm);
 	    Owner->Failed(Message,Config);
 	    ItemDone();


--- orig/apt-pkg/acquire.cc
+++ mod/apt-pkg/acquire.cc
@@ -521,14 +521,15 @@
 // Queue::Queue - Constructor						/*{{{*/
 // ---------------------------------------------------------------------
 /* */
-pkgAcquire::Queue::Queue(string Name,pkgAcquire *Owner) : Name(Name), 
-            Owner(Owner)
+pkgAcquire::Queue::Queue(string Name,pkgAcquire *Owner) 
+   : Name(Name), Owner(Owner)
 {
    Items = 0;
    Next = 0;
    Workers = 0;
    MaxPipeDepth = 1;
    PipeDepth = 0;
+   PermanentError = false;
 }
 									/*}}}*/
 // Queue::~Queue - Destructor						/*{{{*/
@@ -572,7 +573,7 @@
 {
    if (Owner->Status == pkgAcquire::Item::StatFetching)
       return _error->Error("Tried to dequeue a fetching object");
-       
+
    bool Res = false;
    
    QItem **I = &Items;
@@ -580,6 +581,15 @@
    {
       if ((*I)->Owner == Owner)
       {
+	 if (Owner->Status == pkgAcquire::Item::StatPermError
+	    &&  this->Owner->QueueMode == QueueHost) 
+	 {
+	    if(_config->FindB("Debug::pkgAcquire",false))
+	       std::cout << "got a permanent error from the item" 
+			 << Owner->DestFile <<  std::endl;
+	    PermanentError = true;
+	 }
+
 	 QItem *Jnk= *I;
 	 *I = (*I)->Next;
 	 Owner->QueueCounter--;
@@ -686,14 +696,32 @@
    is enabled then it keeps the pipe full. */
 bool pkgAcquire::Queue::Cycle()
 {
+
    if (Items == 0 || Workers == 0)
       return true;
 
+   // something happend to the queue and it does not make sense
+   // to keep trying to get stuff from it, cancel the current fetches
+   if(PermanentError) {
+      if(_config->FindB("Debug::pkgAcquire",false))
+	 std::cout << "permanent error in queue: " << Name << std::endl;
+      QItem *I = Items;
+      for (; I != 0; I = I->Next)
+	 if (I->Owner->Status == pkgAcquire::Item::StatIdle) {
+	    if(_config->FindB("Debug::pkgAcquire",false))
+	       std::cout << "Killing (because of permanent error): " << I->Owner->DescURI() << std::endl;
+	    I->Owner->Failed("",this->Owner->GetConfig(URI(I->Owner->DescURI()).Access));
+	    ItemDone(I);
+	    
+	 }
+      return true;
+   }
+
    if (PipeDepth < 0)
       return _error->Error("Pipedepth failure");
-			   
-   // Look for a queable item
+
    QItem *I = Items;
+   // Look for a queable item
    while (PipeDepth < (signed)MaxPipeDepth)
    {
       for (; I != 0; I = I->Next)
@@ -703,7 +731,7 @@
       // Nothing to do, queue is idle.
       if (I == 0)
 	 return true;
-      
+
       I->Worker = Workers;
       I->Owner->Status = pkgAcquire::Item::StatFetching;
       PipeDepth++;


--- orig/apt-pkg/acquire.h
+++ mod/apt-pkg/acquire.h
@@ -169,6 +169,7 @@
    pkgAcquire *Owner;
    signed long PipeDepth;
    unsigned long MaxPipeDepth;
+   bool PermanentError;
    
    public:
    


--- orig/methods/connect.cc
+++ mod/methods/connect.cc
@@ -103,6 +103,8 @@
    if (Err != 0)
    {
       errno = Err;
+      if(errno == ECONNREFUSED)
+	 Owner->SetFailExtraMsg("\nFailReason: ConnectionRefused");
       return _error->Errno("connect",_("Could not connect to %s:%s (%s)."),Host.c_str(),
 			   Service,Name);
    }




Reply to: