[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

[rfc] apt-cache google



Greetings,

Attached to this message is a patch that adds standard keyword
searching to apt-cache.  It uses the well-known document-vector model.
Terms are weighted using TF-IDF and vectors are compared using cosine
similarity.  The vectors include both individual words as well as
bigrams.  After finding the set of relevant documents, the output is
sorted according to computed relevancy.

The following output shows top results when search for window manager
using this new search technique (apt-cache ksearch window manager).
The left column is the computed relevancy.

  0.437478: matchbox-window-manager: window manager for resource-limited systems
  0.350593: xfwm4: window manager of the Xfce project
  0.306863: afterstep: window manager with the NEXTSTEP look and feel
  0.291519: wm2: small, unconfigurable window manager
  0.290499: asclassic: X11 window manager, AfterStep Classic (forked from v1.1)
  0.278316: kwin: the KDE window manager
  0.256702: libmetacity0: library of lightweight GTK2 based Window Manager
  0.250537: metacity: A lightweight GTK2 based Window Manager
  0.249057: 9wm: emulation of the Plan 9 window manager 8-1/2
  0.244742: tinywm: tiny window manager
  0.239528: wmii2: lightweight tabbed and tiled X11 window manager, version 2
  0.237694: wmii: lightweight tabbed and tiled X11 window manager, version 3
  0.234376: fvwm: F(?) Virtual Window Manager
  0.233639: libafterstep1: shared libraries for the AfterStep window manager
  0.225442: libmetacity-dev: Development files of lightweight GTK2 based Window Manager
  0.223276: python-plwm: Pointless Window Manager - Python libraries for creating Window Managers
  0.217922: metacity-common: Shared files of lightweight GTK2 based Window Manager
  0.215831: sawfish: a window manager for X11
  0.211769: olvwm: OpenLook virtual window manager
  0.204527: olwm: Open Look Window Manager
  0.203103: choosewm: fake x-session-manager allowing the user to choose a wm
  0.202086: ratpoison: keyboard-only window manager
  0.195167: obconf: Preferences manager for Openbox window manager
  0.193327: wmaker: NeXTSTEP-like window manager for X
  0.192349: lwm: Lightweight Window Manager
  0.184504: bbdate: Date tool for the blackbox window manager
  0.183133: twm: Tab window manager
  0.182635: compiz-core: OpenGL window and compositing manager
  0.178773: windowlab: Small and simple Amiga-like window manager
  0.176553: openbox-themes: Themes for the Openbox window manager
  0.175592: amiwm: The Amiga look alike window manager
  0.172606: sapphire: A minimal but configurable X11R6 window manager
  0.170955: blackbox: Window manager for X
  0.169932: wm-icons: Themed icon set that is Window Manager agnostic.
  0.169613: icewm: wonderful Win95-OS/2-Motif-like window manager
  0.169526: uwm: The ultimate window manager for UDE
  0.167773: aewm: a minimalist window manager for X11
  0.167125: e16-data: e16 window manager support files
  0.166765: ion2: Keyboard-friendly window manager with tiled windows (v2)
  0.166580: libghc6-xmonad-dev: A lightweight X11 window manager

As you can see, the top results really are the most relevant.

Is there general interest in this type of extension?  If so, I will
clean this patch up and submit it properly.  Note: this is just very
basic search functionality.  I am considering adding more advanced
functionality, if there is interest.

Thanks,

Neal

=== modified file 'buildlib/environment.mak.in'
--- buildlib/environment.mak.in	2007-07-25 23:27:40 +0000
+++ buildlib/environment.mak.in	2009-04-11 20:21:20 +0000
@@ -7,7 +7,7 @@
 CC = @CC@
 CPPFLAGS+= @CPPFLAGS@ @DEFS@ -D_REENTRANT -Wall
 CXX = @CXX@
-CXXFLAGS+= @CXXFLAGS@
+CXXFLAGS+= -std=gnu++0x @CXXFLAGS@
 NUM_PROCS = @NUM_PROCS@
 GLIBC_VER = @GLIBC_VER@
 LIBSTDCPP_VER = @LIBSTDCPP_VER@

=== modified file 'cmdline/apt-cache.cc'
--- cmdline/apt-cache.cc	2008-11-08 19:11:14 +0000
+++ cmdline/apt-cache.cc	2009-04-12 01:55:53 +0000
@@ -28,6 +28,8 @@
 #include <apt-pkg/algorithms.h>
 #include <apt-pkg/sptr.h>
 
+#include <unordered_map>
+
 #include <config.h>
 #include <apti18n.h>
 
@@ -37,6 +39,8 @@
 #include <errno.h>
 #include <regex.h>
 #include <stdio.h>
+#include <math.h>
+#include <assert.h>
 
 #include <iomanip>
 									/*}}}*/
@@ -1401,6 +1405,263 @@
    return true;
 }
 									/*}}}*/
+
+// KSearch - Perform a keyword search					/*{{{*/
+// ---------------------------------------------------------------------
+
+bool KSearch(CommandLine &CmdL)
+{
+   pkgCache &Cache = *GCache;
+   bool ShowFull = _config->FindB("APT::Cache::ShowFull",false);
+   bool NamesOnly = _config->FindB("APT::Cache::NamesOnly",false);
+   unsigned NumKeywords = CmdL.FileSize() -1;
+   
+   pkgDepCache::Policy Plcy;
+   
+   // Make sure there is at least one argument
+   if (NumKeywords < 1)
+      return _error->Error(_("You must give exactly one pattern"));
+   
+   // Create the text record parser
+   pkgRecords Recs(Cache);
+
+   class frequency_hash: public unordered_map<string, int>
+   {
+   public:
+     void note_word (string word)
+     {
+       frequency_hash::iterator iter = this->find (word);
+       if (iter != this->end ())
+	 iter->second ++;
+       else
+	 this->insert (make_pair (word, 1));
+     }
+   };
+   frequency_hash word_freq;
+
+   class doc_vector: public unordered_map<string, float>
+   {
+     /* The sum of the square of the terms.  */
+     double sum_sq;
+
+   public:
+     pkgCache::PkgIterator P;
+     pkgCache::VerIterator V;
+
+     void add_word (string word, int rank, frequency_hash *freq_hash)
+     {
+       doc_vector::iterator iter = this->find (word);
+       if (iter != this->end ())
+	 iter->second += rank;
+       else
+	 this->insert (make_pair (word, rank));
+
+       if (freq_hash)
+	 freq_hash->note_word (word);
+     }
+
+     void add_string (string s, int rank, frequency_hash *freq_hash)
+     {
+       string last;
+
+       int start;
+       int end = 0;
+       int len = s.length ();
+       while (end < len)
+	 {
+	   start = end;
+
+	   while (! isalnum (s[start]) && start < len)
+	     start ++;
+	   if (start == len)
+	     break;
+
+	   end = start;
+	   while (isalnum (s[end]) && end < len)
+	     end ++;
+
+	   string word = s.substr (start, end - start);
+	   for (unsigned i = 0; i < word.length (); i ++)
+	     word[i] = tolower (word[i]);
+
+	   /* Ignore 0 and 1 character length strings.  */
+	   if (word.length () > 1)
+	     {
+	       add_word (word, rank, freq_hash);
+
+	       if (last.length () > 1)
+		 /* Add bi-grams.  */
+		 {
+		   last.append (" ");
+		   last.append (word);
+		   add_word (last, rank, freq_hash);
+		 }
+
+	       last = word;
+	     }
+	   else
+	     last.clear ();
+	 }
+     }
+
+     /* Compute the TF/IDF of a document vector.  */
+     void tf_idf (int docs, frequency_hash &word_freq)
+     {
+       sum_sq = 0.0;
+
+       doc_vector::iterator iter;
+       doc_vector::iterator next = this->begin ();
+       while (next != this->end ())
+	 {
+	   iter = next;
+	   next ++;
+
+	   frequency_hash::iterator fi = word_freq.find (iter->first);
+	   if (fi == word_freq.end ())
+	     /* The word does not exist in the corpus.  This is likely
+		a query vector.  We can just ignore it.  */
+	     continue;
+
+	   assert (fi->second > 0);
+	   iter->second *= logf ((float) docs / (float) fi->second);
+
+	   /* Also calculate the sum of the squares.  */
+	   sum_sq += iter->second * iter->second;
+	 }
+     }
+
+     /* Compute how similar this document is to OTHER.  We use cosine
+	similarity.  */
+     double similarity (doc_vector other)
+     {
+       if (this->empty () || other.empty ())
+	 return 0;
+
+       int c1 = this->size ();
+       int c2 = other.size ();
+
+       doc_vector *v1 = this;
+       doc_vector *v2 = &other;
+       if (c1 > c2)
+	 {
+	   v2 = &other;
+	   v1 = this;
+	 }
+
+       int have_one = 0;
+
+       double n = 0;
+       for (doc_vector::iterator iter = v1->begin ();
+	    iter != v1->end ();
+	    iter ++)
+	 {
+	   doc_vector::iterator iter2 = v2->find (iter->first);
+	   if (iter2 == v2->end ())
+	     continue;
+
+	   if (! have_one)
+	     {
+	       have_one = 1;
+	       cout << other.P.Name () << "\n";
+	     }
+
+	   cout << " " << iter->first << " "
+		<< iter->second << " * " << iter2->second << " => "
+		<< iter->second * iter2->second << "\n";
+
+	   n += iter->second * iter2->second;
+	 }
+
+       double result = n / sqrtf (v1->sum_sq * v2->sum_sq);
+
+       if (have_one)
+	 cout << " => " << result << "\n";
+
+       return result;
+     }
+   };
+   int last_doc_id = Cache.HeaderP->PackageCount+1;
+   doc_vector doc_vectors[last_doc_id];
+   int docs = 0;
+
+   // Generate the document vectors.
+   for (pkgCache::PkgIterator P = Cache.PkgBegin(); P.end() == false; P++)
+     {
+       /* Get the candidate version.  If there is none, then the
+	  package is not installable and we skip it.  */
+       pkgCache::VerIterator V = Plcy.GetCandidateVer(P);
+       if (V.end())
+	 continue;
+
+       docs ++;
+       doc_vectors[P->ID].P = P;
+       doc_vectors[P->ID].V = V;
+
+       doc_vectors[P->ID].add_string (P.Name (), 4, &word_freq);
+
+       pkgCache::DescFileIterator D(Cache,V.DescriptionList().FileList());
+       pkgRecords::Parser &Q = Recs.Lookup(D);
+       doc_vectors[P->ID].add_string (Q.LongDesc (), 1, &word_freq);
+
+       if (0)
+	 {
+	   printf ("%s: %s\n", P.Name(), P.Section());
+	   doc_vector::iterator iter;
+	   for (iter = doc_vectors[P->ID].begin ();
+		iter != doc_vectors[P->ID].end ();
+		iter ++)
+	     printf ("%s: %f\n", iter->first.c_str (), iter->second);
+	 }
+     }
+
+   for (int i = 0; i < last_doc_id; i ++)
+     doc_vectors[i].tf_idf (docs, word_freq);
+
+   /* Generate the query vector.  */
+   doc_vector query;
+   string q;
+   for (unsigned i = 0; i != NumKeywords; i ++)
+     {
+       if (! q.empty ())
+	 q.append (" ");
+       q.append (CmdL.FileList[i+1]);
+     }
+   cout << q << "\n";
+   query.add_string (q, 1, NULL);
+   query.tf_idf (docs, word_freq);
+
+   /* Compute the similarities.  */
+   vector<pair<double, int>> results;
+
+   for (int i = 0; i < last_doc_id; i ++)
+     {
+       double similarity = query.similarity (doc_vectors[i]);
+       if (similarity > 0.01)
+	 results.push_back (make_pair (similarity, i));
+     }
+
+   sort (results.begin (), results.end ());
+
+   for (int i = results.size () - 1; i >= 0; i --)
+     {
+       if (1)
+	 printf ("%f: ", results[i].first);
+
+       doc_vector d = doc_vectors[results[i].second];
+
+       pkgCache::DescFileIterator D(Cache,d.V.DescriptionList().FileList());
+       pkgRecords::Parser &Q = Recs.Lookup(D);
+
+       printf ("%s: %s\n", d.P.Name (), Q.ShortDesc ().c_str ());
+     }
+
+   if (ferror(stdout))
+       return _error->Error("Write to stdout failed");
+   return true;
+}
+									/*}}}*/
+
+									/*}}}*/
 // ShowPackage - Dump the package record to the screen			/*{{{*/
 // ---------------------------------------------------------------------
 /* */
@@ -1736,6 +1997,7 @@
       "   dumpavail - Print an available file to stdout\n"
       "   unmet - Show unmet dependencies\n"
       "   search - Search the package list for a regex pattern\n"
+      "   ksearch - Search the package list for keywords\n"
       "   show - Show a readable record for the package\n"
       "   depends - Show raw dependency information for a package\n"
       "   rdepends - Show reverse dependency information for a package\n"
@@ -1796,6 +2058,7 @@
                                     {"dumpavail",&DumpAvail},
                                     {"unmet",&UnMet},
                                     {"search",&Search},
+                                    {"ksearch",&KSearch},
                                     {"depends",&Depends},
                                     {"rdepends",&RDepends},
                                     {"dotty",&Dotty},




Reply to: