[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: [rfc] apt-cache google



I've attached a slightly improved version of this patch.

  - It uses an improved term-scoring technique, which sets an
    occurrence's score to 10/sqrt(number of words in the section).  This
    way, searching for foo will score an occurrence of foo in the
    package "foo" higher than one in "foo-dev".

  - Second, it offers some spelling correction.  For each term, it
    adds the term minus the first letter (think KDE and GNOME naming
    conventions), the term minus any numeric suffix, and the term with
    both of these transformations applied.

  - Finally, it limits the number of results to 40 and has an option
    (--limit) to show more.  If there are more results, it complains
    on stderr.

Neal

=== modified file 'Makefile'
--- Makefile	2006-10-01 23:03:40 +0000
+++ Makefile	2009-04-11 17:26:12 +0000
@@ -17,7 +17,7 @@
 	$(MAKE) -C cmdline $@
 	$(MAKE) -C ftparchive $@
 	$(MAKE) -C dselect $@
-	$(MAKE) -C doc $@
+	# $(MAKE) -C doc $@
 	$(MAKE) -C po $@
 
 # Some very common aliases

=== modified file 'buildlib/environment.mak.in'
--- buildlib/environment.mak.in	2007-07-25 23:27:40 +0000
+++ buildlib/environment.mak.in	2009-04-11 20:21:20 +0000
@@ -7,7 +7,7 @@
 CC = @CC@
 CPPFLAGS+= @CPPFLAGS@ @DEFS@ -D_REENTRANT -Wall
 CXX = @CXX@
-CXXFLAGS+= @CXXFLAGS@
+CXXFLAGS+= -std=gnu++0x @CXXFLAGS@
 NUM_PROCS = @NUM_PROCS@
 GLIBC_VER = @GLIBC_VER@
 LIBSTDCPP_VER = @LIBSTDCPP_VER@

=== modified file 'cmdline/apt-cache.cc'
--- cmdline/apt-cache.cc	2008-11-08 19:11:14 +0000
+++ cmdline/apt-cache.cc	2009-04-12 21:42:59 +0000
@@ -28,6 +28,8 @@
 #include <apt-pkg/algorithms.h>
 #include <apt-pkg/sptr.h>
 
+#include <unordered_map>
+
 #include <config.h>
 #include <apti18n.h>
 
@@ -37,6 +39,8 @@
 #include <errno.h>
 #include <regex.h>
 #include <stdio.h>
+#include <math.h>
+#include <assert.h>
 
 #include <iomanip>
 									/*}}}*/
@@ -1401,6 +1405,351 @@
    return true;
 }
 									/*}}}*/
+
+// KSearch - Perform a keyword search					/*{{{*/
+// ---------------------------------------------------------------------
+
+/* Rank every installable package against the query keywords using a
+   TF/IDF vector-space model and cosine similarity, and print the best
+   matches, best first (up to --limit results, default 40).  Returns
+   false (with _error set) only on usage or stdout-write errors.  */
+bool KSearch(CommandLine &CmdL)
+{
+   pkgCache &Cache = *GCache;
+   unsigned NumKeywords = CmdL.FileSize() - 1;
+
+   /* Maximum number of results to print.  --limit=0 means unlimited;
+      the assignment below wraps to UINT_MAX, which the comparison in
+      the output loop treats as "no limit".  */
+   unsigned limit = _config->FindI("limit", 40);
+   if (limit == 0)
+     limit = -1;
+
+   pkgDepCache::Policy Plcy;
+
+   // Make sure there is at least one argument.
+   if (NumKeywords < 1)
+      return _error->Error(_("You must give at least one search keyword"));
+
+   // Create the text record parser.
+   pkgRecords Recs(Cache);
+
+   /* Document frequency: for each term, the number of documents
+      (packages) that contain it at least once.  Feeds the IDF part of
+      TF/IDF in tf_idf() below.  */
+   class frequency_hash: public unordered_map<string, int>
+   {
+   public:
+     void note_word (string word)
+     {
+       frequency_hash::iterator iter = this->find (word);
+       if (iter != this->end ())
+	 ++ iter->second;
+       else
+	 this->insert (make_pair (word, 1));
+     }
+   };
+   frequency_hash word_freq;
+
+   /* A sparse term -> weight vector describing one document (a
+      package, or the query itself).  */
+   class doc_vector: public unordered_map<string, float>
+   {
+     /* Sum of the squares of the term weights: the squared Euclidean
+	norm, cached by tf_idf() for use in similarity().  */
+     double sum_sq;
+
+   public:
+     pkgCache::PkgIterator P;
+     pkgCache::VerIterator V;
+
+     doc_vector () : sum_sq (0.0) {}
+
+     /* Accumulate RANK onto WORD's weight.  The document frequency in
+	FREQ_HASH (if given) is bumped only the first time WORD appears
+	in this document, so it counts documents, not occurrences --
+	otherwise log(docs/freq) in tf_idf() could go negative.  */
+     void add_word (string word, float rank, frequency_hash *freq_hash)
+     {
+       doc_vector::iterator iter = this->find (word);
+       if (iter != this->end ())
+	 iter->second += rank;
+       else
+	 {
+	   this->insert (make_pair (word, rank));
+	   if (freq_hash)
+	     freq_hash->note_word (word);
+	 }
+     }
+
+     /* Tokenize S and add each term with weight
+	RANK / sqrt(number of words in S), so that a hit in a short
+	field (e.g. the name "foo") outscores one in a longer field
+	("foo-dev").  Also adds bi-grams and spelling-correction
+	variants: the term minus a trailing digit run, the term minus
+	its first character, and both combined.  */
+     void add_string (string s, float rank, frequency_hash *freq_hash)
+     {
+       vector<string> words;
+       int word_count = 0;
+
+       string last;
+
+       string::size_type start;
+       string::size_type end = 0;
+       string::size_type len = s.length ();
+       while (end < len)
+	 {
+	   /* Skip separators.  Bounds-check BEFORE indexing, and cast
+	      to unsigned char: the ctype functions are undefined for
+	      negative char values.  */
+	   start = end;
+	   while (start < len && ! isalnum ((unsigned char) s[start]))
+	     ++ start;
+	   if (start == len)
+	     break;
+
+	   ++ word_count;
+
+	   end = start;
+	   while (end < len && isalnum ((unsigned char) s[end]))
+	     ++ end;
+
+	   string word = s.substr (start, end - start);
+	   for (string::size_type i = 0; i < word.length (); ++ i)
+	     word[i] = tolower ((unsigned char) word[i]);
+
+	   /* Ignore 0 and 1 character length strings.  */
+	   if (word.length () > 1)
+	     {
+	       words.push_back (word);
+
+	       if (last.length () > 1)
+		 /* Add bi-grams.  */
+		 {
+		   last.append (" ");
+		   last.append (word);
+
+		   words.push_back (last);
+		 }
+
+	       last = word;
+	     }
+	   else
+	     last.clear ();
+
+	   /* If the term is suffixed with digits, strip them and add
+	      that version.  Stop at the front of the word so an
+	      all-digit term cannot index below word[0].  */
+	   string::size_type digits = 0;
+	   while (digits < word.length ()
+		  && isdigit ((unsigned char) word[word.length () - 1 - digits]))
+	     ++ digits;
+
+	   if (digits > 0 && word.length () - digits > 2)
+	     words.push_back (word.substr (0, word.length () - digits));
+
+	   /* Add a term with the first character dropped.  This is
+	      very helpful for things like xemacs, gedit, konsole =>
+	      console, etc.  */
+	   if (word.length () > 3)
+	     words.push_back (word.substr (1));
+
+	   if (digits > 0 && word.length () - digits > 3)
+	     words.push_back (word.substr (1, word.length () - 1 - digits));
+	 }
+
+       /* No alphanumeric content: bail out rather than divide by
+	  zero below.  */
+       if (word_count == 0)
+	 return;
+
+       double r = rank / sqrt ((double) word_count);
+
+       for (vector<string>::iterator iter = words.begin ();
+	    iter != words.end ();
+	    ++ iter)
+	 add_word (*iter, r, freq_hash);
+     }
+
+     /* Convert raw term frequencies into TF/IDF weights and cache the
+	squared norm.  DOCS is the corpus size; WORD_FREQ maps each
+	term to the number of documents containing it.  */
+     void tf_idf (int docs, frequency_hash &word_freq)
+     {
+       sum_sq = 0.0;
+
+       for (doc_vector::iterator iter = this->begin ();
+	    iter != this->end ();
+	    ++ iter)
+	 {
+	   frequency_hash::iterator fi = word_freq.find (iter->first);
+	   if (fi == word_freq.end ())
+	     /* The word does not exist in the corpus.  This is likely
+		a query vector.  We can just ignore it.  */
+	     continue;
+
+	   assert (fi->second > 0);
+
+	   iter->second *= logf ((float) docs / (float) fi->second);
+
+	   /* Also calculate the sum of the squares.  */
+	   sum_sq += iter->second * iter->second;
+	 }
+     }
+
+     /* Compute how similar this document is to OTHER, using cosine
+	similarity.  Both vectors must already have been run through
+	tf_idf() so that sum_sq is valid.  Takes OTHER by reference:
+	copying a whole hash map per comparison was pure waste.  */
+     double similarity (const doc_vector &other) const
+     {
+       if (this->empty () || other.empty ())
+	 return 0;
+
+       /* Iterate the smaller vector and probe the larger: the
+	  intersection is the same either way, but this does fewer
+	  hash lookups.  (The previous version assigned v1/v2 the same
+	  values in both branches, making the size test a no-op.)  */
+       const doc_vector *v1 = this;
+       const doc_vector *v2 = &other;
+       if (v1->size () > v2->size ())
+	 {
+	   v1 = &other;
+	   v2 = this;
+	 }
+
+       double n = 0;
+       for (doc_vector::const_iterator iter = v1->begin ();
+	    iter != v1->end ();
+	    ++ iter)
+	 {
+	   doc_vector::const_iterator iter2 = v2->find (iter->first);
+	   if (iter2 == v2->end ())
+	     continue;
+
+	   n += iter->second * iter2->second;
+	 }
+
+       return n / sqrt (v1->sum_sq * v2->sum_sq);
+     }
+   };
+
+   int last_doc_id = Cache.HeaderP->PackageCount + 1;
+   /* Heap-allocated: a variable-length array is a GCC extension, and
+      PackageCount * sizeof(doc_vector) can easily blow the stack.  */
+   vector<doc_vector> doc_vectors (last_doc_id);
+   int docs = 0;
+
+   // Generate the document vectors.
+   for (pkgCache::PkgIterator P = Cache.PkgBegin(); P.end() == false; P++)
+     {
+       /* Get the candidate version.  If there is none, then the
+	  package is not installable and we skip it.  */
+       pkgCache::VerIterator V = Plcy.GetCandidateVer(P);
+       if (V.end())
+	 continue;
+
+       ++ docs;
+       doc_vector &doc = doc_vectors[P->ID];
+       doc.P = P;
+       doc.V = V;
+
+       doc.add_string (P.Name (), 10, &word_freq);
+
+       pkgCache::DescFileIterator D(Cache,V.DescriptionList().FileList());
+       pkgRecords::Parser &Q = Recs.Lookup(D);
+       doc.add_string (Q.ShortDesc (), 10, &word_freq);
+       doc.add_string (Q.LongDesc (), 10, &word_freq);
+     }
+
+   for (int i = 0; i < last_doc_id; ++ i)
+     doc_vectors[i].tf_idf (docs, word_freq);
+
+   /* Generate the query vector from all the keywords joined by
+      spaces, so bi-grams across keywords are found too.  */
+   doc_vector query;
+   string q;
+   for (unsigned i = 0; i != NumKeywords; ++ i)
+     {
+       if (! q.empty ())
+	 q.append (" ");
+       q.append (CmdL.FileList[i+1]);
+     }
+
+   query.add_string (q, 1, NULL);
+   query.tf_idf (docs, word_freq);
+
+   /* Compute the similarities, keeping only non-trivial matches.  */
+   vector<pair<double, int> > results;
+
+   for (int i = 0; i < last_doc_id; ++ i)
+     {
+       double similarity = query.similarity (doc_vectors[i]);
+       if (similarity > 0.01)
+	 results.push_back (make_pair (similarity, i));
+     }
+
+   /* Sort ascending, then walk backwards: best match first.  The cast
+      keeps an empty result set from wrapping to SIZE_MAX.  */
+   sort (results.begin (), results.end ());
+
+   int i;
+   for (i = (int) results.size () - 1;
+	i >= 0 && results.size () - i <= limit;
+	-- i)
+     {
+       doc_vector &d = doc_vectors[results[i].second];
+
+       pkgCache::DescFileIterator D(Cache,d.V.DescriptionList().FileList());
+       pkgRecords::Parser &Q = Recs.Lookup(D);
+
+       printf ("%f: %s: %s\n",
+	       results[i].first, d.P.Name (), Q.ShortDesc ().c_str ());
+     }
+
+   /* Entries 0 through i were suppressed by the limit.  */
+   if (i != -1)
+     cerr << i + 1
+	  << " results not shown.  Rerun with --limit=0 to see all." << endl;
+
+   if (ferror(stdout))
+       return _error->Error("Write to stdout failed");
+   return true;
+}
+									/*}}}*/
+
+									/*}}}*/
 // ShowPackage - Dump the package record to the screen			/*{{{*/
 // ---------------------------------------------------------------------
 /* */
@@ -1736,6 +2085,7 @@
       "   dumpavail - Print an available file to stdout\n"
       "   unmet - Show unmet dependencies\n"
       "   search - Search the package list for a regex pattern\n"
+      "   ksearch - Search the package list for keywords\n"
       "   show - Show a readable record for the package\n"
       "   depends - Show raw dependency information for a package\n"
       "   rdepends - Show reverse dependency information for a package\n"
@@ -1784,6 +2134,7 @@
       {'c',"config-file",0,CommandLine::ConfigFile},
       {'o',"option",0,CommandLine::ArbItem},
       {0,"installed","APT::Cache::Installed",0},
+      {'l',"limit","limit",CommandLine::HasArg},
       {0,0,0,0}};
    CommandLine::Dispatch CmdsA[] = {{"help",&ShowHelp},
                                     {"add",&DoAdd},
@@ -1796,6 +2147,7 @@
                                     {"dumpavail",&DumpAvail},
                                     {"unmet",&UnMet},
                                     {"search",&Search},
+                                    {"ksearch",&KSearch},
                                     {"depends",&Depends},
                                     {"rdepends",&RDepends},
                                     {"dotty",&Dotty},




Reply to: