[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

[PATCH v3] Add by-hash support



Add a per-suite boolean to enable by-hash; record the paths of the by-hash
files in the db, together with the time they stopped being referenced, so
that clean-suites can delete them after the archive's stayofexecution time.

In generate-release, where we have checksums for all the things, move
files to the by-hash dir for the suite's strongest hash function, and
add symlinks in the original location and in the other by-hash dirs.

Signed-off-by: Julien Cristau <jcristau@debian.org>
---
This is also at http://anonscm.debian.org/cgit/users/jcristau/dak.git/commit/?h=by-hash-v3

Review comments welcome :)

changes in v2:
- use archive.stayofexecution as delay before removing files from by-hash
- don't assume any particular ordering for suite.checksums

changes in v3:
- rebase on latest master, update115 is now update116
- handle missing files in clean_byhash

 dak/clean_suites.py      | 28 ++++++++++++++++++++++++
 dak/dakdb/update116.py   | 39 ++++++++++++++++++++++++++++++++++
 dak/generate_releases.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 121 insertions(+), 1 deletion(-)
 create mode 100644 dak/dakdb/update116.py

diff --git a/dak/clean_suites.py b/dak/clean_suites.py
index d5b0fc4..51800a8 100755
--- a/dak/clean_suites.py
+++ b/dak/clean_suites.py
@@ -384,6 +384,33 @@ SELECT f.id, f.fingerprint FROM fingerprint f
 
 ################################################################################
 
+def clean_byhash(now_date, session):
+    """Drop hashfile rows past stayofexecution and unlink the paths they name."""
+    Logger.log(["Cleaning out unused by-hash files..."])
+
+    q = session.execute("""
+        DELETE FROM hashfile h
+        USING suite s, archive a
+        WHERE s.id = h.suite_id
+          AND a.id = s.archive_id
+          AND h.unreferenced + a.stayofexecution < CURRENT_TIMESTAMP
+        RETURNING a.path, s.suite_name, h.path""")
+    count = q.rowcount
+    if not Options["No-Action"]:  # otherwise the DELETE stays uncommitted here
+        for base, suite, path in q.fetchall():
+            filename = os.path.join(base, 'dists', suite, path)
+            # islink() also catches symlinks whose target was already unlinked
+            if os.path.islink(filename) or os.path.isfile(filename):
+                Logger.log(['delete hashfile', suite, path])
+                os.unlink(filename)
+            else:
+                Logger.log(['database referred to non-existing file', filename])
+        session.commit()
+    if count > 0:
+        Logger.log(["total", count])
+
+################################################################################
+
 def clean_empty_directories(session):
     """
     Removes empty directories from pool directories.
@@ -486,6 +513,7 @@ def main():
     clean(now_date, archives, max_delete, session)
     clean_maintainers(now_date, session)
     clean_fingerprints(now_date, session)
+    clean_byhash(now_date, session)
     clean_empty_directories(session)
 
     session.rollback()
diff --git a/dak/dakdb/update116.py b/dak/dakdb/update116.py
new file mode 100644
index 0000000..d5fa3cb
--- /dev/null
+++ b/dak/dakdb/update116.py
@@ -0,0 +1,39 @@
+"""
+Add column to store whether to generate by-hash things per suite
+Add table to store when by-hash files stopped being referenced
+
+@contact: Debian FTP Master <ftpmaster@debian.org>
+@copyright: 2016, Julien Cristau <jcristau@debian.org>
+@license: GNU General Public License version 2 or later
+"""
+
+import psycopg2
+from daklib.dak_exceptions import DBUpdateError
+from daklib.config import Config
+
+def do_update(self):
+    """Apply revision 116: add suite.byhash, create the hashfile table and
+    bump db_revision; on psycopg2.ProgrammingError roll back and raise DBUpdateError.
+    """
+    print __doc__
+    try:
+        cursor = self.db.cursor()
+        # Per-suite switch; false by default.
+        cursor.execute("ALTER TABLE suite ADD COLUMN byhash BOOLEAN DEFAULT false")
+        # One row per (suite, path); unreferenced is a nullable timestamp.
+        cursor.execute("""
+            CREATE TABLE hashfile (
+                suite_id INTEGER NOT NULL REFERENCES suite(id) ON DELETE CASCADE,
+                path TEXT NOT NULL,
+                unreferenced TIMESTAMP,
+                PRIMARY KEY (suite_id, path)
+            )
+             """)
+        # Record the new schema revision.
+        cursor.execute("UPDATE config SET value = '116' WHERE name = 'db_revision'")
+
+        self.db.commit()
+
+    except psycopg2.ProgrammingError as err:
+        self.db.rollback()
+        raise DBUpdateError('Unable to apply sick update 116, rollback issued. Error message : %s' % (str(err)))
diff --git a/dak/generate_releases.py b/dak/generate_releases.py
index c359177..b557093 100755
--- a/dak/generate_releases.py
+++ b/dak/generate_releases.py
@@ -37,6 +37,7 @@ import stat
 import time
 import gzip
 import bz2
+import errno
 import apt_pkg
 import subprocess
 from tempfile import mkstemp, mkdtemp
@@ -151,7 +152,9 @@ class ReleaseWriter(object):
 
         # Boolean stuff. If we find it true in database, write out "yes" into the release file
         boolattrs = ( ('NotAutomatic',         'notautomatic'),
-                      ('ButAutomaticUpgrades', 'butautomaticupgrades') )
+                      ('ButAutomaticUpgrades', 'butautomaticupgrades'),
+                      ('Acquire-By-Hash',      'byhash'),
+                    )
 
         cnf = Config()
 
@@ -284,6 +287,56 @@ class ReleaseWriter(object):
         out.close()
         os.rename(outfile + '.new', outfile)
 
+        if suite.byhash:
+            query = """
+                UPDATE hashfile SET unreferenced = CURRENT_TIMESTAMP
+                WHERE suite_id = :id AND unreferenced IS NULL"""
+            session.execute(query, {'id': suite.suite_id})
+
+            hashes = filter(lambda h: h in hashfuncs, ('MD5Sum', 'SHA1', 'SHA256'))
+            strong_hash = hashes[-1]
+            for filename in fileinfo:
+                if not os.path.exists(filename):
+                    # probably an uncompressed index we didn't generate
+                    continue
+
+                for h in hashes:
+                    hashfile = os.path.join(os.path.dirname(filename), 'by-hash', h, fileinfo[filename][h])
+                    query = "SELECT 1 FROM hashfile WHERE path = :p AND suite_id = :id"
+                    q = session.execute(
+                            query,
+                            {'p': hashfile, 'id': suite.suite_id})
+                    if q.rowcount:
+                        session.execute('''
+                            UPDATE hashfile SET unreferenced = NULL
+                            WHERE path = :p and suite_id = :id''',
+                            {'p': hashfile, 'id': suite.suite_id})
+                    else:
+                        session.execute('''
+                            INSERT INTO hashfile (path, suite_id, unreferenced)
+                            VALUES (:p, :id, NULL)''',
+                            {'p': hashfile, 'id': suite.suite_id})
+
+                    try:
+                        os.makedirs(os.path.dirname(hashfile))
+                    except OSError as exc:
+                        if exc.errno != errno.EEXIST:
+                            raise
+                    if h == strong_hash:
+                        with open(filename, 'rb') as src:
+                            contents = src.read()
+                        with open(hashfile + '.new', 'wb') as dst:
+                            dst.write(contents)
+                        os.rename(hashfile + '.new', hashfile)
+                        os.unlink(filename)
+                        os.symlink(os.path.join('by-hash', h, fileinfo[filename][h]),
+                                   filename)
+                    elif not os.path.exists(hashfile):
+                        os.symlink(os.path.join('..', strong_hash, fileinfo[filename][strong_hash]),
+                                   hashfile)
+
+                session.commit()
+
         sign_release_dir(suite, os.path.dirname(outfile))
 
         os.chdir(oldcwd)
-- 
2.8.1


Reply to: