[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: Description-less packages file



-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Hi Andreas,

attached is a patch to:

* rename the ddtp table to descriptions
* create ubuntu_descriptions and derivatives_descriptions tables
* import squeeze's long descriptions into the descriptions table, 
referencing them by calculated md5 in the packages table

Comments welcome -- the ddtp importer may need some changing to match these 
changes. 

cheers
Stuart

- -- 
Stuart Prescott                 www.nanoNANOnano.net
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.10 (GNU/Linux)

iEYEARECAAYFAk82frMACgkQn+i4zXHF0aib/ACfXs6Pbqq8J6/J1MXwsRLOoDNw
bqUAn3pJJRmf5RbjB2X6ArCh9m4WAjL0
=wKIS
-----END PGP SIGNATURE-----
>From f6942dbc3efdc956206865b90312e4e3fbc720fc Mon Sep 17 00:00:00 2001
From: Stuart Prescott <stuart+debian@nanonanonano.net>
Date: Sat, 11 Feb 2012 12:11:36 +0000
Subject: [PATCH 1/2] Import long description in to descriptions table

* Rename "ddtp" table to "descriptions"; also create "ubuntu_descriptions"
  and "derivatives_descriptions".
* Add a "descriptions-table" configuration key for packages
* If set, calculate the Description-md5 for each package and add that
  to the packages table; push the long description information into the
  descriptions table.
---
 config-org.yaml          |   33 ++++++++++++++++++++++-
 sql/setup.sql            |   25 ++++++++++++++++-
 sql/upgrade.sql          |   35 ++++++++++++++++++++++++
 udd/packages_gatherer.py |   66 +++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 146 insertions(+), 13 deletions(-)

diff --git a/config-org.yaml b/config-org.yaml
index 938c51b..f2f4fbb 100644
--- a/config-org.yaml
+++ b/config-org.yaml
@@ -168,6 +168,7 @@ debian-squeeze:
   packages-schema: packages
   sources-schema: sources
   release: squeeze
+  descriptions-table: descriptions
 
 debian-squeeze-security:
   type: src-pkg
@@ -183,6 +184,7 @@ debian-squeeze-security:
   packages-schema: packages
   sources-schema: sources
   release: squeeze-security
+  descriptions-table: descriptions
 
 debian-squeeze-proposed-updates:
   type: src-pkg
@@ -196,6 +198,7 @@ debian-squeeze-proposed-updates:
   packages-schema: packages
   sources-schema: sources
   release: squeeze-proposed-updates
+  descriptions-table: descriptions
 
 debian-squeeze-updates:
   type: src-pkg
@@ -209,6 +212,7 @@ debian-squeeze-updates:
   packages-schema: packages
   sources-schema: sources
   release: squeeze-updates
+  descriptions-table: descriptions
 
 debian-backports-squeeze:
   type: src-pkg
@@ -222,6 +226,7 @@ debian-backports-squeeze:
   packages-schema: packages
   sources-schema: sources
   release: squeeze
+  descriptions-table: descriptions
 
 #debian-backports-squeeze-sloppy:
 #  type: src-pkg
@@ -235,6 +240,7 @@ debian-backports-squeeze:
 #  packages-schema: packages
 #  sources-schema: sources
 #  release: squeeze
+#  descriptions-table: descriptions
 
 debian-wheezy:
   type: src-pkg
@@ -450,6 +456,7 @@ ubuntu-natty:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-natty-security:
   type: src-pkg
@@ -463,6 +470,7 @@ ubuntu-natty-security:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-natty-updates:
   type: src-pkg
@@ -476,6 +484,7 @@ ubuntu-natty-updates:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-natty-proposed:
   type: src-pkg
@@ -489,6 +498,7 @@ ubuntu-natty-proposed:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-natty-backports:
   type: src-pkg
@@ -502,6 +512,7 @@ ubuntu-natty-backports:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 
 ubuntu-maverick:
@@ -516,6 +527,7 @@ ubuntu-maverick:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-maverick-security:
   type: src-pkg
@@ -529,6 +541,7 @@ ubuntu-maverick-security:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-maverick-updates:
   type: src-pkg
@@ -542,6 +555,7 @@ ubuntu-maverick-updates:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-maverick-proposed:
   type: src-pkg
@@ -555,6 +569,7 @@ ubuntu-maverick-proposed:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-maverick-backports:
   type: src-pkg
@@ -568,6 +583,7 @@ ubuntu-maverick-backports:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 
 ubuntu-lucid:
@@ -582,6 +598,7 @@ ubuntu-lucid:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-lucid-security:
   type: src-pkg
@@ -595,6 +612,7 @@ ubuntu-lucid-security:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-lucid-updates:
   type: src-pkg
@@ -608,6 +626,7 @@ ubuntu-lucid-updates:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-lucid-proposed:
   type: src-pkg
@@ -621,6 +640,7 @@ ubuntu-lucid-proposed:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-lucid-backports:
   type: src-pkg
@@ -634,6 +654,7 @@ ubuntu-lucid-backports:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 
 ubuntu-karmic:
@@ -648,6 +669,7 @@ ubuntu-karmic:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-karmic-security:
   type: src-pkg
@@ -661,6 +683,7 @@ ubuntu-karmic-security:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-karmic-updates:
   type: src-pkg
@@ -674,6 +697,7 @@ ubuntu-karmic-updates:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-karmic-proposed:
   type: src-pkg
@@ -687,6 +711,7 @@ ubuntu-karmic-proposed:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-karmic-backports:
   type: src-pkg
@@ -700,6 +725,7 @@ ubuntu-karmic-backports:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 
 #ubuntu-jaunty:
@@ -741,6 +767,7 @@ ubuntu-hardy:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-hardy-security:
   type: src-pkg
@@ -754,6 +781,7 @@ ubuntu-hardy-security:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-hardy-updates:
   type: src-pkg
@@ -767,6 +795,7 @@ ubuntu-hardy-updates:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-hardy-proposed:
   type: src-pkg
@@ -780,6 +809,7 @@ ubuntu-hardy-proposed:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 ubuntu-hardy-backports:
   type: src-pkg
@@ -793,6 +823,7 @@ ubuntu-hardy-backports:
   uploaders-table: ubuntu_uploaders
   packages-schema: packages
   sources-schema: sources
+  descriptions-table: ubuntu_descriptions
 
 
 #ubuntu-dapper:
@@ -972,7 +1003,7 @@ ddtp:
    mirror: ftp.debian.org/debian
    files: Translation-.*\.gz
    releases: sid lenny squeeze
-   table: ddtp
+   table: descriptions
    schema: ddtp
 
 ftpnew:
diff --git a/sql/setup.sql b/sql/setup.sql
index d433d5d..315556b 100644
--- a/sql/setup.sql
+++ b/sql/setup.sql
@@ -601,7 +601,7 @@ GRANT SELECT ON all_packages TO PUBLIC;
 GRANT SELECT ON all_packages_distrelcomparch TO PUBLIC;
 GRANT SELECT ON all_bugs TO PUBLIC;
 
-CREATE TABLE ddtp (
+CREATE TABLE descriptions (
        package      text,
        release      text,
        language     text,
@@ -610,8 +610,29 @@ CREATE TABLE ddtp (
        description_md5  text, -- md5 sum of the original English description
     PRIMARY KEY (package, release, language, description, description_md5)
 );
+GRANT SELECT ON descriptions TO PUBLIC;
 
-GRANT SELECT ON ddtp TO PUBLIC;
+CREATE TABLE ubuntu_descriptions (
+       package      text,
+       release      text,
+       language     text,
+       description  text,
+       long_description text,
+       description_md5  text, -- md5 sum of the original English description
+    PRIMARY KEY (package, release, language, description, description_md5)
+);
+GRANT SELECT ON ubuntu_descriptions TO PUBLIC;
+
+CREATE TABLE derivatives_descriptions (
+       package      text,
+       release      text,
+       language     text,
+       description  text,
+       long_description text,
+       description_md5  text, -- md5 sum of the original English description
+    PRIMARY KEY (package, release, language, description, description_md5)
+);
+GRANT SELECT ON derivatives_descriptions TO PUBLIC;
 
 -- active_dds view
 CREATE VIEW active_dds AS
diff --git a/sql/upgrade.sql b/sql/upgrade.sql
index 58e0b87..24c489b 100644
--- a/sql/upgrade.sql
+++ b/sql/upgrade.sql
@@ -365,3 +365,38 @@ CREATE TABLE ddtp (
     PRIMARY KEY (package, release, language, description, description_md5)
 );
 GRANT SELECT ON ddtp TO PUBLIC;
+-- 2012-02-09
+-- Add description tables for ubuntu and derivatives; rename ddtp table
+ALTER TABLE ddtp RENAME TO descriptions;
+GRANT SELECT ON descriptions TO PUBLIC;
+
+CREATE TABLE ubuntu_descriptions (
+       package      text,
+       release      text,
+       language     text,
+       description  text,
+       long_description text,
+       description_md5  text, -- md5 sum of the original English description
+    PRIMARY KEY (package, release, language, description, description_md5)
+);
+GRANT SELECT ON ubuntu_descriptions TO PUBLIC;
+
+CREATE TABLE derivatives_descriptions (
+       package      text,
+       release      text,
+       language     text,
+       description  text,
+       long_description text,
+       description_md5  text, -- md5 sum of the original English description
+    PRIMARY KEY (package, release, language, description, description_md5)
+);
+GRANT SELECT ON derivatives_descriptions TO PUBLIC;
+
+DROP VIEW all_packages;
+ALTER TABLE packages DROP COLUMN long_description;
+ALTER TABLE ubuntu_packages DROP COLUMN long_description;
+ALTER TABLE derivatives_packages DROP COLUMN long_description;
+CREATE VIEW all_packages AS
+SELECT * FROM packages
+UNION ALL SELECT * FROM ubuntu_packages
+UNION ALL SELECT * FROM derivatives_packages;
diff --git a/udd/packages_gatherer.py b/udd/packages_gatherer.py
index 3748f76..cd8e454 100644
--- a/udd/packages_gatherer.py
+++ b/udd/packages_gatherer.py
@@ -17,6 +17,7 @@ from gatherer import gatherer
 from time import time
 import email.Utils
 import re
+import hashlib
 
 def get_gatherer(connection, config, source):
   return packages_gatherer(connection, config, source)
@@ -41,7 +42,7 @@ class packages_gatherer(gatherer):
 
   pkgquery = """EXECUTE package_insert
       (%(Package)s, %(Version)s, %(Architecture)s, %(Maintainer)s, %(maintainer_name)s, %(maintainer_email)s,
-      %(Description)s, %(Long_Description)s, %(Description-md5)s, %(Source)s, %(Source_Version)s, %(Essential)s,
+      %(Description)s, %(Description-md5)s, %(Source)s, %(Source_Version)s, %(Essential)s,
       %(Depends)s, %(Recommends)s, %(Suggests)s, %(Enhances)s,
       %(Pre-Depends)s, %(Breaks)s, %(Installed-Size)s, %(Homepage)s, %(Size)s,
       %(Build-Essential)s, %(Origin)s, %(SHA1)s,
@@ -49,6 +50,10 @@ class packages_gatherer(gatherer):
       %(Tag)s, %(Task)s, %(Python-Version)s, %(Ruby-Versions)s, %(Provides)s,
       %(Conflicts)s, %(SHA256)s, %(Original-Maintainer)s)"""
 
+  descriptionquery = """EXECUTE description_insert
+      (%(Package)s, %(Language)s,
+      %(Description)s, %(Long_Description)s, %(Description-md5)s)"""
+
   def __init__(self, connection, config, source):
     gatherer.__init__(self, connection, config, source)
     # The ID for the distribution we want to include
@@ -61,6 +66,7 @@ class packages_gatherer(gatherer):
     # because different architectures include packages for architecture 'all'
     # with the same version, and we don't want these duplicate entries
     self.imported_all_pkgs = {}
+    self.add_descriptions = False
 
   def build_dict(self, control):
     """Build a dictionary from the control dictionary.
@@ -92,6 +98,7 @@ class packages_gatherer(gatherer):
     it is called.The Format of the sequence is expected to be that of a
     debian packages file."""
     pkgs = []
+    pkgdescs = []
 
     # The fields that are to be read. Other fields are ignored
     for control in deb822.Packages.iter_paragraphs(sequence):
@@ -106,11 +113,20 @@ class packages_gatherer(gatherer):
 
       # We split the description
       if 'Description' in d:
-	if len(d['Description'].split("\n",1)) > 1:
-	  d['Long_Description'] = d['Description'].split("\n",1)[1]
-	else:
-	  d['Long_Description'] = ''
-	d['Description'] = d['Description'].split("\n",1)[0]
+        if self.add_descriptions and \
+            ('Description-md5' not in d or not d['Description-md5']):
+          try:
+            d['Description-md5'] = hashlib.md5((d['Description']+"\n").encode('utf-8')).hexdigest()
+            d['Language'] = 'en'
+            pkgdescs.append(d)
+          except UnicodeEncodeError:
+            self.warned_about['%s description encoding' % d['Package']] = 1
+        if len(d['Description'].split("\n",1)) > 1:
+          d['Long_Description'] = d['Description'].split("\n",1)[1]
+        else:
+          d['Long_Description'] = ''
+        d['Description'] = d['Description'].split("\n",1)[0]
+	# Calculate Description-md5 for releases that don't include it
 
       # Convert numbers to numbers
       for f in ['Installed-Size', 'Size']:
@@ -136,6 +152,12 @@ class packages_gatherer(gatherer):
     except psycopg2.ProgrammingError:
       print "Error while inserting packages"
       raise
+    try:
+      if self.add_descriptions:
+        cur.executemany(self.descriptionquery, pkgdescs)
+    except psycopg2.ProgrammingError:
+      print "Error while inserting descriptions"
+      raise
 
   def setup(self):
     if 'schema-dir' in self.config['general']:
@@ -162,6 +184,9 @@ class packages_gatherer(gatherer):
     # Get distribution ID
     self._distr = src_cfg['distribution']
 
+    self.add_descriptions = ('descriptions-table' in self.my_config
+                              and self.my_config['descriptions-table'])
+
     cur = self.cursor()
     # defer constraints checking until the end of the transaction
     cur.execute("SET CONSTRAINTS ALL DEFERRED")
@@ -169,12 +194,17 @@ class packages_gatherer(gatherer):
     # For every part and every architecture, import the packages into the DB
     for comp in src_cfg['components']:
       cur.execute("DELETE FROM %s WHERE distribution = '%s' AND release = '%s' AND component = '%s'" %\
-	(table, self._distr, src_cfg['release'], comp))
+        (table, self._distr, src_cfg['release'], comp))
+      # For releases that have long descriptions in Packages and not in Translation,
+      # add the description to the configured descriptions table.
+      if self.add_descriptions:
+        cur.execute("DELETE FROM %s WHERE release = '%s' AND language = '%s'" %\
+          (src_cfg['descriptions-table'], src_cfg['release'], 'en'))
       for arch in src_cfg['archs']:
 	path = os.path.join(src_cfg['directory'], comp, 'binary-' + arch, 'Packages.gz')
 	try:
 	  cur.execute("""PREPARE package_insert AS INSERT INTO %s
-	    (Package, Version, Architecture, Maintainer, maintainer_name, maintainer_email, Description, Long_Description, description_md5, Source,
+	    (Package, Version, Architecture, Maintainer, maintainer_name, maintainer_email, Description, description_md5, Source,
 	    Source_Version, Essential, Depends, Recommends, Suggests, Enhances,
 	    Pre_Depends, Breaks, Installed_Size, Homepage, Size,
 	    build_essential, origin, sha1, replaces, section,
@@ -184,8 +214,22 @@ class packages_gatherer(gatherer):
 	  VALUES
 	    ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15,
 	      $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28,
-	      $29, $30, $31, $32, $33, $34, $35, $36, $37, '%s', '%s', '%s')
+	      $29, $30, $31, $32, $33, $34, $35, $36, '%s', '%s', '%s')
 	    """ %  (table, self._distr, src_cfg['release'], comp))
+          if self.add_descriptions:
+            cur.execute("""PREPARE description_insert AS
+              INSERT INTO %s
+                (package, release, language, description, long_description, description_md5)
+                (SELECT $1 AS package, '%s' AS release, $2 AS language,
+                        $3 AS description, $4 AS long_description, $5 AS description_md5
+                  WHERE NOT EXISTS
+                  (SELECT 1
+                    FROM %s
+                    WHERE package=$1 AND release='%s' AND language=$2 AND
+                          description=$3 AND long_description=$4 AND description_md5=$5))
+
+            """ % (src_cfg['descriptions-table'], src_cfg['release'],
+                    src_cfg['descriptions-table'], src_cfg['release']))
 #	  aux.print_debug("Reading file " + path)
 	  # Copy content from gzipped file to temporary file, so that apt_pkg is
 	  # used by debian
@@ -200,7 +244,9 @@ class packages_gatherer(gatherer):
 	except IOError, (e, message):
 	  print "Could not read packages from %s: %s" % (path, message)
           sys.exit(1)
-	cur.execute("DEALLOCATE package_insert")
+        cur.execute("DEALLOCATE package_insert")
+        if self.add_descriptions:
+          cur.execute("DEALLOCATE description_insert")
     # Fill the summary tables
     cur.execute("DELETE FROM %s" % (table + '_summary'));
     cur.execute("""INSERT INTO %s (package, version, source, source_version,
-- 
1.7.2.5



Reply to: