Bug#549225: UDD: please include removals
tags 549225 - pending
tags 549225 + patch
thanks
On Wed, Oct 07, 2009 at 09:49:45PM +0200, Lucas Nussbaum wrote:
> On 01/10/09 at 19:58 +0200, Sandro Tosi wrote:
> > It would be great if we can have also Debian packages removal into UDD; the
> > canonical source for them is ftp-master.debian.org .
>
> Patch welcomed (python preferred).
Attached:
- udd/removals_gatherer.py
- pkg-removals.patch for sql/setup.sql & config-org.yaml
- scripts/fix-removal-timestamps.py
The last one fixes a rare but recurring issue in the raw input file from
ftp-master; see the comment embedded in the script.
Cheers,
Serafeim
--
debtags-organised WNPP bugs: http://members.hellug.gr/serzan/wnpp
Index: sql/setup.sql
===================================================================
--- sql/setup.sql (revision 1621)
+++ sql/setup.sql (working copy)
@@ -535,6 +535,29 @@
);
GRANT SELECT ON wannabuild TO public;
+-- package_removal_batch
+CREATE TABLE package_removal_batch (
+ id int,
+ time timestamp,
+ ftpmaster text,
+ distribution text,
+ requestor text,
+ reasons text,
+ PRIMARY KEY (id)
+);
+GRANT SELECT ON package_removal_batch TO public;
+
+-- package_removal
+CREATE TABLE package_removal (
+ batch_id int,
+ name text,
+ version debversion,
+ arch_array text[],
+ PRIMARY KEY(batch_id, name, version),
+ FOREIGN KEY(batch_id) REFERENCES package_removal_batch(id)
+);
+GRANT SELECT ON package_removal TO public;
+
-- timings of data operations
CREATE TABLE timestamps (
id serial,
Index: config-org.yaml
===================================================================
--- config-org.yaml (revision 1621)
+++ config-org.yaml (working copy)
@@ -22,6 +22,7 @@
dehs: module udd.dehs_gatherer
ldap: module udd.ldap_gatherer
wannabuild: module udd.wannabuild_gatherer
+ removals: module udd.removals_gatherer
timestamp-dir: /org/udd.debian.org/timestamps
lock-dir: /org/udd.debian.org/locks
archs:
@@ -432,3 +433,9 @@
i386, ia64, kfreebsd-amd64, kfreebsd-i386, mips,
mipsel, powerpc, s390, sparc]
+removals:
+ type: removals
+ update-command: wget -q http://ftp-master.debian.org/removals-full.txt -O - | scripts/fix-removal-timestamps.py > /org/udd.debian.org/mirrors/removals-full.txt
+ path: /org/udd.debian.org/mirrors/removals-full.txt
+ table: package_removal
+ schema: package_removal
#!/usr/bin/env python
# This file is a part of the Ultimate Debian Database
# <http://wiki.debian.org/UltimateDebianDatabase>
#
# Copyright (C) 2009 Serafeim Zanikolas <serzan@hellug.gr>
#
# This file is distributed under the terms of the General Public
# License version 3 or (at your option) any later version.
"""Import data about the removal of packages (from the Debian archive) into UDD.
Raw data source: http://ftp-master.debian.org/removals-full.txt
Sample removal batch from the above file:
=========================================================================
[Date: Tue, 9 Jan 2001 20:52:51 -0500] [ftpmaster: James Troup]
Removed the following packages from unstable:
dsniff | 2.3-1 | source, i386
Closed bugs: 81709
------------------- Reason -------------------
ROM; moved to non-US (now depends on libssl)
----------------------------------------------
=========================================================================
Note that a removal batch may have many packages removed (unlike the one
above, where only dsniff is removed).
When run as a standalone script, this module does not connect to the database;
instead it runs a basic sanity test (to make sure that the input file
hasn't changed in a way that would break the script).
"""
import sys
import re
from gatherer import gatherer
from aux import quote
def fail(msg):
    """Report a fatal error and terminate the process.

    Writes ``msg`` followed by a newline to standard error and then exits
    with status 1.  This function never returns.
    """
    sys.stderr.write("%s\n" % msg)
    # Use sys.exit() rather than the bare exit() builtin: the latter is only
    # injected by the ``site`` module (for interactive use) and is missing
    # when the interpreter is started with -S.
    sys.exit(1)
def parse_removals(stream):
    """Parse a removals file and return the removal batches found in it.

    ``stream`` is iterated line by line.  Every line that the parser does not
    regard as skippable is handed to ``parser.curr_func``, which always refers
    to the handler for the kind of line expected next.  Within one batch the
    lines are expected in this order:

        date; ftp-master name
        distrib
        skip_line*
        pkg name | version | arch[, arch]   <-- one or more lines like this
        skip_line*
        ------------------- Reason -------------------
        requestor; reasons

    Returns the parser's ``removal_batches`` list.
    """
    parser = Parser()
    for line in stream:
        if not parser.skip_line(line):
            # The handler's return value is irrelevant here; a handler moves
            # the state machine forward by rebinding parser.curr_func itself.
            parser.curr_func(line)
    return parser.removal_batches
def get_gatherer(connection, config, source):
    """Build and return the removals_gatherer for the given source.

    All three arguments are passed unchanged to the removals_gatherer
    constructor.
    """
    importer = removals_gatherer(connection, config, source)
    return importer
class removals_gatherer(gatherer):
    """import removals into the database"""

    def __init__(self, connection, config, source):
        """Initialise the base gatherer and require the 'path' and 'table'
        configuration keys for this source."""
        gatherer.__init__(self, connection, config, source)
        self.assert_my_config('path', 'table')

    def run(self):
        """Reload both removal tables from the file named by conf['path'].

        The file is parsed completely first; then the package table
        (conf['table']) and the batch table (conf['table'] + "_batch") are
        emptied and refilled.  Each batch gets its position in the file as
        its id, and the packages of that batch reference the same number as
        batch_id.  Terminates through fail() if the file cannot be opened.
        """
        conf = self.my_config
        try:
            input_fd = open(conf['path'])
        except IOError:
            fail('failed to open %s' % conf['path'])
        # Parsing consumes the whole file, so the descriptor can (and should)
        # be released before any database work starts.
        try:
            batch_removals = parse_removals(input_fd)
        finally:
            input_fd.close()

        pkg_removal_table = conf['table']
        pkg_removal_batch_table = "%s_batch" % conf['table']

        cur = self.cursor()

        # Packages are deleted before batches: in the schema shipped with
        # this module, package rows reference batch rows via a foreign key.
        cur.execute('DELETE FROM %s' % pkg_removal_table)
        cur.execute('DELETE FROM %s' % pkg_removal_batch_table)

        # insert data for batches of removals
        # NOTE(review): statements are assembled as text; every value goes
        # through quote() (imported from aux) and table names come from the
        # configuration file.
        cur.execute('PREPARE batch_removals_insert ' \
                    'AS INSERT INTO %s (id, time, ftpmaster, ' \
                    'distribution, requestor, ' \
                    'reasons)' \
                    'VALUES ($1, $2, $3, $4, $5, $6)' \
                    % pkg_removal_batch_table)
        for i, batch_removal in enumerate(batch_removals):
            cur.execute('EXECUTE batch_removals_insert ' \
                        '(%s, %s, %s, %s, %s, %s)' \
                        % (i, quote(batch_removal.timestamp),
                           quote(batch_removal.ftpmaster),
                           quote(batch_removal.distribution),
                           quote(batch_removal.requestor),
                           quote(batch_removal.reasons)))
        cur.execute('DEALLOCATE batch_removals_insert')
        cur.execute("ANALYZE %s" % pkg_removal_batch_table)

        # insert data for removals of individual packages
        cur.execute('PREPARE pkg_removal_insert ' \
                    'AS INSERT INTO %s (batch_id, name, version, ' \
                    'arch_array)' \
                    'VALUES ($1, $2, $3, $4)' % pkg_removal_table)
        for i, batch_removal in enumerate(batch_removals):
            for pkg in batch_removal.packages:
                # arches are serialised as an array literal: {a,b,c}
                cur.execute('EXECUTE pkg_removal_insert (%s, %s, %s, %s)' \
                            % (i, quote(pkg.name), quote(pkg.version),
                               quote("{%s}" % ",".join(pkg.arches))))
        cur.execute('DEALLOCATE pkg_removal_insert')
        cur.execute("ANALYZE %s" % pkg_removal_table)
def test(filename, removal_batches):
    """Sanity-check the parser against an independent count.

    Counts the "name | version | arches" lines of ``filename`` with a shell
    pipeline and compares the result with the number of packages held in
    ``removal_batches``.  On agreement a one-sentence summary is printed;
    on disagreement, or if the pipeline exits non-zero, fail() terminates
    the process.
    """
    from commands import getstatusoutput
    from pipes import quote as shell_quote

    # The filename is quoted so that paths containing blanks or shell
    # metacharacters reach egrep as a single argument.
    status, grep_output = getstatusoutput(
        r"egrep '[^ ]+ *\| *[^ ]+ *\| *[^ ]+' %s | "
        "awk '-F|' '{print $1, $2}' | sed 's/ */ /g' | wc -l"
        % shell_quote(filename))
    if status != 0:
        fail("failed to extract removed packages with grep")
    npackage_removals_via_grep = int(grep_output)

    npackage_removals_via_python = 0
    ftpmasters = set()
    distribs = set()
    for pkg_rm_batch in removal_batches:
        npackage_removals_via_python += len(pkg_rm_batch.packages)
        ftpmasters.add(pkg_rm_batch.ftpmaster)
        distribs.add(pkg_rm_batch.distribution)

    if npackage_removals_via_grep != npackage_removals_via_python:
        fail("%d removed packages have been parsed but %d were expected" % \
             (npackage_removals_via_python, npackage_removals_via_grep))

    # Single parenthesised argument: prints the same text whether print is a
    # statement or a function.
    print('%d packages were removed from %d distributions, in %d\n'
          'batches of removals done by %d ftpmaster members' %
          (npackage_removals_via_python, len(distribs),
           len(removal_batches), len(ftpmasters)))
class Package(object):
    """One removed package: its name, its version and the list of
    architectures it was removed from."""

    def __init__(self, name, version, arches):
        """``arches`` is a comma-separated string; it is split and each item
        is stripped of surrounding whitespace."""
        self.name, self.version = name, version
        arch_fields = arches.split(",")
        self.arches = [field.strip() for field in arch_fields]

    def __str__(self):
        """Return "<name>-<version>"."""
        label = (self.name, self.version)
        return '%s-%s' % label
class PackageRemovalBatch(object):
    """A batch of removals: one timestamp and ftpmaster, plus the packages
    removed together and the distribution, requestor and reasons given."""

    def __init__(self, timestamp, ftpmaster):
        """Start an empty batch; everything except ``timestamp`` and
        ``ftpmaster`` is filled in afterwards."""
        self.timestamp = timestamp
        self.ftpmaster = ftpmaster
        self.packages = []
        # Not known yet when the batch header is read.
        self.distribution = self.requestor = self.reasons = None

    def add_pkg(self, pkg):
        """Append ``pkg`` to the packages of this batch."""
        self.packages.append(pkg)

    def __str__(self):
        """Describe the batch, listing its packages one per line."""
        listing = "\n".join(map(str, self.packages))
        details = (listing, self.timestamp, self.ftpmaster, self.distribution)
        return "removal of %s at %s by %s from %s" % details
class Parser(object):
    """Line-oriented state machine for the removals file.

    ``curr_func`` is the handler for the kind of line expected next; each
    handler returns True when it consumed the line (and possibly advanced the
    state) and False otherwise.  Finished batches accumulate in
    ``removal_batches``; ``removal_batch`` is the batch under construction,
    or None between batches.
    """

    date_master_pat = re.compile(r"\[Date: ([^\]]+)] \[ftpmaster: ([^\]]+)\]")
    distrib_pat = re.compile(r"Removed the following packages from ([a-z-]+)[:,]*")
    pkg_version_arches_pat = re.compile(r"\s*(\S*) *\|\s*(\S+)\s*\|\s*(.*)$")
    reason_pat = re.compile(r"-+\s*Reason\s*-+")
    rene_pat = re.compile(r"(\[rene[^\]]*\])\s*(.*)")

    def __init__(self):
        self.removal_batch = None
        self.removal_batches = []
        self.curr_func = self.parse_removal

    def skip_line(self, line):
        """Return True for empty or whitespace-only lines."""
        return line == "" or line.isspace()

    def parse_removal(self, line):
        """Wait for a "[Date: ...] [ftpmaster: ...]" header and start a new
        batch from it."""
        match = Parser.date_master_pat.search(line)
        if not match:
            return False
        timestamp, ftpmaster = match.groups()
        self.removal_batch = PackageRemovalBatch(timestamp, ftpmaster)
        self.curr_func = self.parse_distrib
        return True

    def parse_distrib(self, line):
        """Wait for the "Removed the following packages from <dist>" line."""
        match = Parser.distrib_pat.search(line)
        if not match:
            return False
        self.removal_batch.distribution = match.group(1)
        self.curr_func = self.parse_pkg_version_arch_or_reason_header
        return True

    def parse_pkg_version_arch_or_reason_header(self, line):
        """Collect "name | version | arches" lines until the Reason header.

        Any other line (e.g. "Closed bugs: ...") is left unconsumed and the
        state does not change.
        """
        match = Parser.pkg_version_arches_pat.search(line)
        if match:
            pkg, version, arches = match.groups()
            pkg_obj = Package(pkg, version, arches)
            if self.removal_batch:
                self.removal_batch.add_pkg(pkg_obj)
            return True
        if self.removal_batch and Parser.reason_pat.search(line):
            self.curr_func = self.parse_requestor_reasons
            return True
        return False

    def parse_requestor_reasons(self, line):
        """Split the first line after the Reason header into requestor and
        reasons.

        Three shapes are recognised:
          "[rene ...] text"  -> requestor "[rene ...]", reasons "text"
          "WHO; text"        -> requestor "WHO", reasons "text" (only the
                                first ';' separates; later ones stay in the
                                reasons)
          "text"             -> no requestor, reasons "text"

        Both fields are stripped of surrounding whitespace, so the line
        terminator never ends up in the stored reasons.  Only this one line
        is read; later lines of a multi-line reason are not captured.
        """
        match = Parser.rene_pat.search(line)
        if match:
            requestor, reasons = match.groups()
        else:
            requestor, separator, reasons = line.partition(';')
            if not separator:
                # No ';' at all: the whole line is the reason.  The requestor
                # is an empty string rather than None because the gatherer's
                # run() passes it straight to quote().
                requestor, reasons = '', line
        self.removal_batch.requestor = requestor.strip()
        self.removal_batch.reasons = reasons.strip()
        self.curr_func = self.conclude_batch
        return True # assume that we always get fed the correct line

    def conclude_batch(self, line):
        """Wait for the dashed line closing the reason section, then file the
        current batch and go back to waiting for the next header."""
        if line.startswith("---------") and self.removal_batch is not None:
            self.removal_batches.append(self.removal_batch)
            self.removal_batch = None
            self.curr_func = self.parse_removal
            return True
        return False
# Stand-alone mode: parse the file named on the command line and print
# statistics instead of touching the database.
if '__main__' == __name__:
    import os
    try:
        filename = sys.argv[1]
        input_fd = open(filename)
    except IndexError:
        fail("syntax: %s <removals-file>\n" \
             "(when run from the command line will only prints stats)" \
             % os.path.basename(sys.argv[0]))
    except IOError:
        fail("failed to open %s" % filename)
    # The parser reads the whole stream, so the file can be closed before
    # the sanity test (which re-reads it by name) runs.
    try:
        batch_removals = parse_removals(input_fd)
    finally:
        input_fd.close()
    test(filename, batch_removals)
#!/usr/bin/env python
# This file is a part of the Ultimate Debian Database
# <http://wiki.debian.org/UltimateDebianDatabase>
#
# Copyright (C) 2009 Serafeim Zanikolas <serzan@hellug.gr>
#
# This file is distributed under the terms of the General Public
# License version 3 or (at your option) any later version.
"""
Quick hack to fix broken timestamp entries in ftp-archive package removals
history file.
Before:
[Date: Tue, 27 Oct 2009 19:41:19 +0000
] [ftpmaster: Archive Administrator]
After applying this script:
[Date: Tue, 27 Oct 2009 19:41:19 +0000] [ftpmaster: Archive Administrator]
"""
import sys
def join_split_headers(lines):
    """Yield the input lines with broken "[Date: ...] [ftpmaster: ...]"
    headers repaired.

    Each line is stripped of trailing whitespace.  A line starting with
    "] [ftpmaster:" is glued onto the line before it; every other line is
    passed through unchanged.  One line of look-behind is kept, so a line is
    only emitted once its successor has been seen (or the input has ended).
    """
    pending = None
    for raw_line in lines:
        line = raw_line.rstrip()
        if pending is None:
            pending = line
            continue
        if line.startswith("] [ftpmaster:"):
            yield "%s%s" % (pending, line)
            pending = None
        else:
            yield pending
            pending = line
    # "is not None" rather than truthiness: a final empty line is still a
    # line and must not be dropped.
    if pending is not None:
        yield pending


if __name__ == '__main__':
    for fixed_line in join_split_headers(sys.stdin):
        print(fixed_line)
Reply to: