[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

[dak/master] implement sensible handling of checksum fields in .changes and .dsc



2008-08-28  Philipp Kern  <pkern@debian.org>

	* daklib/utils.py (check_hashes): adapt to different API, check
	sizes separately

	* daklib/utils.py (parse_changes, parse_deb822): refactor
	the string-based logic of parse_changes into a new function
	parse_deb822; parse_changes itself remains file-based

	* daklib/utils.py (hash_key): gives the key of a hash in the
	files dict

	* daklib/utils.py (create_hash, check_size): made more readable

	* daklib/utils.py (check_hash): just check the hashes and complain
	about missing checksums

	* daklib/utils.py (check_hash_fields): function to reject unknown
	checksums fields

	* daklib/utils.py (_ensure_changes_hash, _ensure_dsc_hash): helper
	functions for ensure_hashes; check their corresponding manifests'
	hashes

	* daklib/utils.py (ensure_hashes): retrieve the checksums fields
	from the original filecontents blob so that they do not need to
	be present in the .dak; refactored the actual checks by calling
	the aforementioned helper functions

	* daklib/utils.py (parse_checksums): parse a given checksums field
	in a manifest and insert the values found into the files dict,
	checking the file sizes on the way
---
 ChangeLog                |   33 ++++++
 dak/process_unchecked.py |    6 +-
 daklib/utils.py          |  264 +++++++++++++++++++++++++++++----------------
 3 files changed, 207 insertions(+), 96 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 4440c87..87a4273 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,36 @@
+2008-08-28  Philipp Kern  <pkern@debian.org>
+
+	* daklib/utils.py (check_hashes): adapt to different API, check
+	sizes separately
+
+	* daklib/utils.py (parse_changes, parse_deb822): refactor
+	the string-based logic of parse_changes into a new function
+	parse_deb822; parse_changes itself remains file-based
+
+	* daklib/utils.py (hash_key): gives the key of a hash in the
+	files dict
+
+	* daklib/utils.py (create_hash, check_size): made more readable
+
+	* daklib/utils.py (check_hash): just check the hashes and complain
+	about missing checksums
+
+	* daklib/utils.py (check_hash_fields): function to reject unknown
+	checksums fields
+
+	* daklib/utils.py (_ensure_changes_hash, _ensure_dsc_hash): helper
+	functions for ensure_hashes; check their corresponding manifests'
+	hashes
+
+	* daklib/utils.py (ensure_hashes): retrieve the checksums fields
+	from the original filecontents blob so that they do not need to
+	be present in the .dak; refactored the actual checks by calling
+	the aforementioned helper functions
+
+	* daklib/utils.py (parse_checksums): parse a given checksums field
+	in a manifest and insert the values found into the files dict,
+	checking the file sizes on the way
+
 2008-08-15  Mark Hymers  <mhy@debian.org>
 
 	* daklib/utils.py: Actually import a module before using it.
diff --git a/dak/process_unchecked.py b/dak/process_unchecked.py
index 123fd9f..779f676 100755
--- a/dak/process_unchecked.py
+++ b/dak/process_unchecked.py
@@ -919,8 +919,10 @@ def check_urgency ():
 ################################################################################
 
 def check_hashes ():
-    utils.check_hash(".changes", files, "md5sum", apt_pkg.md5sum)
-    utils.check_hash(".dsc", dsc_files, "md5sum", apt_pkg.md5sum)
+    utils.check_hash(".changes", files, "md5", apt_pkg.md5sum)
+    utils.check_size(".changes", files)
+    utils.check_hash(".dsc", dsc_files, "md5", apt_pkg.md5sum)
+    utils.check_size(".dsc", dsc_files)
 
     # This is stupid API, but it'll have to do for now until
     # we actually have proper abstraction
diff --git a/daklib/utils.py b/daklib/utils.py
index 7ff9319..02278e9 100755
--- a/daklib/utils.py
+++ b/daklib/utils.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# vim:set et ts=4 sw=4:
 
 # Utility functions
 # Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006  James Troup <james@nocrew.org>
@@ -100,34 +101,14 @@ def extract_component_from_section(section):
 
 ################################################################################
 
-def parse_changes(filename, signing_rules=0):
-    """Parses a changes file and returns a dictionary where each field is a
-key.  The mandatory first argument is the filename of the .changes
-file.
-
-signing_rules is an optional argument:
-
- o If signing_rules == -1, no signature is required.
- o If signing_rules == 0 (the default), a signature is required.
- o If signing_rules == 1, it turns on the same strict format checking
-   as dpkg-source.
-
-The rules for (signing_rules == 1)-mode are:
-
-  o The PGP header consists of "-----BEGIN PGP SIGNED MESSAGE-----"
-    followed by any PGP header data and must end with a blank line.
-
-  o The data section must end with a blank line and must be followed by
-    "-----BEGIN PGP SIGNATURE-----".
-"""
-
+def parse_deb822(contents, signing_rules=0):
     error = ""
     changes = {}
 
-    changes_in = open_file(filename)
-    lines = changes_in.readlines()
+    # Split the lines in the input, keeping the linebreaks.
+    lines = contents.splitlines(True)
 
-    if not lines:
+    if len(lines) == 0:
         raise ParseChangesError, "[Empty changes file]"
 
     # Reindex by line number so we can easily verify the format of
@@ -193,7 +174,6 @@ The rules for (signing_rules == 1)-mode are:
     if signing_rules == 1 and inside_signature:
         raise InvalidDscError, index
 
-    changes_in.close()
     changes["filecontents"] = "".join(lines)
 
     if changes.has_key("source"):
@@ -211,112 +191,208 @@ The rules for (signing_rules == 1)-mode are:
 
 ################################################################################
 
-def create_hash (lfiles, key, testfn, basedict = None):
+def parse_changes(filename, signing_rules=0):
+    """Parses a changes file and returns a dictionary where each field is a
+key.  The mandatory first argument is the filename of the .changes
+file.
+
+signing_rules is an optional argument:
+
+ o If signing_rules == -1, no signature is required.
+ o If signing_rules == 0 (the default), a signature is required.
+ o If signing_rules == 1, it turns on the same strict format checking
+   as dpkg-source.
+
+The rules for (signing_rules == 1)-mode are:
+
+  o The PGP header consists of "-----BEGIN PGP SIGNED MESSAGE-----"
+    followed by any PGP header data and must end with a blank line.
+
+  o The data section must end with a blank line and must be followed by
+    "-----BEGIN PGP SIGNATURE-----".
+"""
+
+    changes_in = open_file(filename)
+    content = changes_in.read()
+    changes_in.close()
+    return parse_deb822(content, signing_rules)
+
+################################################################################
+
+def hash_key(hashname):
+    return '%ssum' % hashname
+
+################################################################################
+
+def create_hash(where, files, hashname, hashfunc):
+    """create_hash extends the passed files dict with the given hash by
+    iterating over all files on disk and passing them to the hashing
+    function given."""
+
     rejmsg = []
-    for f in lfiles.keys():
+    for f in files.keys():
         try:
             file_handle = open_file(f)
         except CantOpenError:
             rejmsg.append("Could not open file %s for checksumming" % (f))
 
-        # Check hash
-        if basedict and basedict.has_key(f):
-            basedict[f]['%ssum' % key] = testfn(file_handle)
-        file_handle.close()
+        files[f][hash_key(hashname)] = hashfunc(file_handle)
 
+        file_handle.close()
     return rejmsg
 
 ################################################################################
 
-def check_hash (where, lfiles, key, testfn, basedict = None):
-    rejmsg = []
-    if basedict:
-        for f in basedict.keys():
-            if f not in lfiles:
-                rejmsg.append("%s: no %s checksum" % (f, key))
-
-    for f in lfiles.keys():
-        if basedict and f not in basedict:
-            rejmsg.append("%s: extraneous entry in %s checksums" % (f, key))
+def check_hash(where, files, hashname, hashfunc):
+    """check_hash checks the given hash in the files dict against the actual
+    files on disk.  The hash values need to be present consistently in
+    all file entries.  It does not modify its input in any way."""
 
+    rejmsg = []
+    for f in files.keys():
         try:
             file_handle = open_file(f)
+
+            # Check for the hash entry, to not trigger a KeyError.
+            if not files[f].has_key(hash_key(hashname)):
+                rejmsg.append("%s: misses %s checksum in %s" % (f, hashname,
+                    where))
+                continue
+
+            # Actually check the hash for correctness.
+            if hashfunc(file_handle) != files[f][hash_key(hashname)]:
+                rejmsg.append("%s: %s check failed in %s" % (f, hashname,
+                    where))
         except CantOpenError:
+            # XXX: IS THIS THE BLOODY CASE WHEN THE FILE'S IN THE POOL!?
             continue
+        finally:
+            file_handle.close()
+    return rejmsg
 
-        # Check hash
-        if testfn(file_handle) != lfiles[f][key]:
-            rejmsg.append("%s: %s check failed." % (f, key))
-        file_handle.close()
-        # Store the hashes for later use
-        if basedict:
-            basedict[f]['%ssum' % key] = lfiles[f][key]
-        # Check size
+################################################################################
+
+def check_size(where, files):
+    """check_size checks the file sizes in the passed files dict against the
+    files on disk."""
+
+    rejmsg = []
+    for f in files.keys():
         actual_size = os.stat(f)[stat.ST_SIZE]
-        size = int(lfiles[f]["size"])
+        size = int(files[f]["size"])
         if size != actual_size:
             rejmsg.append("%s: actual file size (%s) does not match size (%s) in %s"
                    % (f, actual_size, size, where))
+    return rejmsg
+
+################################################################################
+
+def check_hash_fields(what, manifest):
+    """check_hash_fields ensures that there are no checksum fields in the
+    given dict that we do not know about."""
+
+    rejmsg = []
+    hashes = map(lambda x: x[0], known_hashes)
+    for field in manifest:
+        if field.startswith("checksums-"):
+            hashname = field.split("-",1)[1]
+            if hashname not in hashes:
+                rejmsg.append("Unsupported checksum field for %s "\
+                    "in %s" % (hashname, what))
+    return rejmsg
+
+################################################################################
+
+def _ensure_changes_hash(changes, format, version, files, hashname, hashfunc):
+    if format >= version:
+        # The version should contain the specified hash.
+        func = check_hash
+
+        # Import hashes from the changes
+        rejmsg = parse_checksums(".changes", files, changes, hashname)
+        if len(rejmsg) > 0:
+            return rejmsg
+    else:
+        # We need to calculate the hash because it can't possibly
+        # be in the file.
+        func = create_hash
+    return func(".changes", files, hashname, hashfunc)
+
+# We could add the orig which might be in the pool to the files dict to
+# access the checksums easily.
+
+def _ensure_dsc_hash(dsc, dsc_files, hashname, hashfunc):
+    """ensure_dsc_hashes' task is to ensure that each and every *present* hash
+    in the dsc is correct, i.e. identical to the changes file and if necessary
+    the pool.  The latter task is delegated to check_hash."""
 
+    rejmsg = []
+    if not dsc.has_key('Checksums-%s' % (hashname,)):
+        return rejmsg
+    # Import hashes from the dsc
+    parse_checksums(".dsc", dsc_files, dsc, hashname)
+    # And check it...
+    rejmsg.extend(check_hash(".dsc", dsc_files, hashname, hashfunc))
     return rejmsg
 
 ################################################################################
 
 def ensure_hashes(changes, dsc, files, dsc_files):
-    # Make sure we recognise the format of the Files: field
-    format = changes.get("format", "0.0").split(".",1)
+    rejmsg = []
+
+    # Make sure we recognise the format of the Files: field in the .changes
+    format = changes.get("format", "0.0").split(".", 1)
     if len(format) == 2:
         format = int(format[0]), int(format[1])
     else:
         format = int(float(format[0])), 0
 
-    rejmsg = []
-    for x in changes:
-        if x.startswith("checksum-"):
-            h = x.split("-",1)[1]
-            if h not in dict(known_hashes):
-                rejmsg.append("Unsupported checksum field in .changes" % (h))
-
-    for x in dsc:
-        if x.startswith("checksum-"):
-            h = x.split("-",1)[1]
-            if h not in dict(known_hashes):
-                rejmsg.append("Unsupported checksum field in .dsc" % (h))
+    # We need to deal with the original changes blob, as the fields we need
+    # might not be in the changes dict serialised into the .dak anymore.
+    orig_changes = parse_deb822(changes['filecontents'])
+
+    # Copy the checksums over to the current changes dict.  This will keep
+    # the existing modifications to it intact.
+    for field in orig_changes:
+        if field.startswith('checksums-'):
+            changes[field] = orig_changes[field]
+
+    # Check for unsupported hashes
+    rejmsg.extend(check_hash_fields(".changes", changes))
+    rejmsg.extend(check_hash_fields(".dsc", dsc))
 
     # We have to calculate the hash if we have an earlier changes version than
     # the hash appears in rather than require it exist in the changes file
-    # I hate backwards compatibility
-    for h,f,v in known_hashes:
-        try:
-            if format < v:
-                for m in create_hash(files, h, f, files):
-                    rejmsg.append(m)
-            else:
-                for m in check_hash(".changes %s" % (h), files, '%ssum' % h, f, files):
-                    rejmsg.append(m)
-        except NoFilesFieldError:
-            rejmsg.append("No Checksums-%s: field in .changes" % (h))
-        except UnknownFormatError, format:
-            rejmsg.append("%s: unknown format of .changes" % (format))
-        except ParseChangesError, line:
-            rejmsg.append("parse error for Checksums-%s in .changes, can't grok: %s." % (h, line))
+    for hashname, hashfunc, version in known_hashes:
+        rejmsg.extend(_ensure_changes_hash(changes, format, version, files,
+            hashname, hashfunc))
+        if "source" in changes["architecture"]:
+            rejmsg.extend(_ensure_dsc_hash(dsc, dsc_files, hashname,
+                hashfunc))
 
-        if "source" not in changes["architecture"]: continue
-
-        try:
-            if format < v:
-                for m in create_hash(dsc_files, h, f, dsc_files):
-                    rejmsg.append(m)
-            else:
-                for m in check_hash(".dsc %s" % (h), dsc_files, '%ssum' % h, f, dsc_files):
-                    rejmsg.append(m)
-        except UnknownFormatError, format:
-            rejmsg.append("%s: unknown format of .dsc" % (format))
-        except NoFilesFieldError:
-            rejmsg.append("No Checksums-%s: field in .dsc" % (h))
-        except ParseChangesError, line:
-            rejmsg.append("parse error for Checksums-%s in .dsc, can't grok: %s." % (h, line))
+    return rejmsg
 
+def parse_checksums(where, files, manifest, hashname):
+    rejmsg = []
+    field = 'checksums-%s' % hashname
+    if not field in manifest:
+        return rejmsg
+    input = manifest[field]
+    for line in input.split('\n'):
+        if not line:
+            break
+        hash, size, file = line.strip().split(' ')
+        if not files.has_key(file):
+            rejmsg.append("%s: not present in files but in checksums-%s in %s" %
+                (file, hashname, where))
+        if not files[file]["size"] == size:
+            rejmsg.append("%s: size differs for files and checksums-%s entry "\
+                "in %s" % (file, hashname, where))
+        files[file][hash_key(hashname)] = hash
+    for f in files.keys():
+        if not files[f].has_key(hash_key(hashname)):
+            rejmsg.append("%s: no entry in checksums-%s in %s" % (file,
+                hashname, where))
     return rejmsg
 
 ################################################################################
-- 
1.5.6.3



Reply to: