[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

[PATCH v4] Add by-hash support



On Mon, May 16, 2016 at 16:04:16 +0200, Ansgar Burchardt wrote:

> Julien Cristau <jcristau@debian.org> writes:
> >> In case of `No-Action`, the transaction should be rolled back.
> >>
> > I was following what the other clean* functions in that file seem to be
> > doing, AFAICT they rely on the final session.rollback() in main().  I
> > can add it here, though.
> 
> Hmm, I think an explicit rollback might be safer. Then it is clear that
> nothing happens locally, no matter what others might do with the
> session.
> 
Left this as-is per IRC discussion.

> >> > +            strong_hash = hashes[-1]
> >>
> >> Hmm, I don't like this assumption, but then I have nothing better for
> >> now.
> >>
> > You mean the assumption that the suite has at least one of md5, sha1,
> > sha256?
> 
> The assumption that the strongest hash is the last one ;)
> 
Right, this was assuming that sha256 is stronger than sha1 which is
stronger than md5.  But, this is moot now as the new version uses
hardlinks so all names for the files are equivalent.

> >> > +                    if h == strong_hash:
> >> > +                        with open(filename, 'rb') as src:
> >> > +                            contents = src.read()
> >> > +                        with open(hashfile + '.new', 'wb') as dst:
> >> > +                            dst.write(contents)
> >> > +                        os.rename(hashfile + '.new', hashfile)
> >> > +                        os.unlink(filename)
> >>
> >> This is just `os.rename(filename, hashfile)`?  Or `shutil.move` for
> >> the theoretic case of `filename` and `hashfile` being on different
> >> filesystems.
> >>
> > Not quite.  If you're calling generate-releases twice in a row, e.g.
> > after an error, filename is already a symlink, so moving would break
> > things.  I could check for that explicitly, if you prefer?
> 
> Hmm, then `shutil.copy2()` + `os.unlink()`?  Note that `copy2` will not
> copy symlinks, but the actual file (plus metadata).
> 
Again this is no longer necessary with hardlinks.

New version attached and pushed to
http://anonscm.debian.org/cgit/users/jcristau/dak.git/commit/?h=by-hash-v4

Cheers,
Julien
From 391f5ec09a119131dc846b796ca791f4cecc69e4 Mon Sep 17 00:00:00 2001
From: Julien Cristau <jcristau@debian.org>
Date: Wed, 27 Apr 2016 10:06:08 +0200
Subject: [PATCH] Add by-hash support

Add a per-suite boolean to enable by-hash; store the by-hash files in
the db, and record when they stopped being referenced, so that
clean-suites can delete them after the archive's stayofexecution time.

In generate-release, where we have checksums for all the things,
hardlink files to the by-hash dir for each of the suite's configured
hash methods.

Signed-off-by: Julien Cristau <jcristau@debian.org>
---
 dak/clean_suites.py      | 32 ++++++++++++++++++++++++++++++++
 dak/dakdb/update116.py   | 38 ++++++++++++++++++++++++++++++++++++++
 dak/generate_releases.py | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 115 insertions(+), 1 deletion(-)
 create mode 100644 dak/dakdb/update116.py

changes in v2:
- use archive.stayofexecution as delay before removing files from
  by-hash
- don't assume any particular ordering for suite.checksums

changes in v3:
- rebase on latest master, update115 is now update116
- handle missing files in clean_byhash

changes in v4:
- use hardlinks instead of symlinks
- don't initialize `unreferenced` to its default value
- in clean_byhash, remove useless fetchall, and handle ENOENT from
  unlink instead of checking for existence beforehand

diff --git a/dak/clean_suites.py b/dak/clean_suites.py
index d5b0fc4..ac35437 100755
--- a/dak/clean_suites.py
+++ b/dak/clean_suites.py
@@ -34,6 +34,7 @@
 
 ################################################################################
 
+import errno
 import os
 import stat
 import sys
@@ -384,6 +385,36 @@ SELECT f.id, f.fingerprint FROM fingerprint f
 
 ################################################################################
 
+def clean_byhash(now_date, session):
+    Logger.log(["Cleaning out unused by-hash files..."])
+
+    q = session.execute("""
+        DELETE FROM hashfile h
+        USING suite s, archive a
+        WHERE s.id = h.suite_id
+          AND a.id = s.archive_id
+          AND h.unreferenced + a.stayofexecution < CURRENT_TIMESTAMP
+        RETURNING a.path, s.suite_name, h.path""")
+    count = q.rowcount
+
+    if not Options["No-Action"]:
+        for base, suite, path in q:
+            filename = os.path.join(base, 'dists', suite, path)
+            try:
+                os.unlink(filename)
+            except OSError as exc:
+                if exc.errno != errno.ENOENT:
+                    raise
+                Logger.log(['database referred to non-existing file', filename])
+            else:
+                Logger.log(['delete hashfile', suite, path])
+        session.commit()
+
+    if count > 0:
+        Logger.log(["total", count])
+
+################################################################################
+
 def clean_empty_directories(session):
     """
     Removes empty directories from pool directories.
@@ -486,6 +517,7 @@ def main():
     clean(now_date, archives, max_delete, session)
     clean_maintainers(now_date, session)
     clean_fingerprints(now_date, session)
+    clean_byhash(now_date, session)
     clean_empty_directories(session)
 
     session.rollback()
diff --git a/dak/dakdb/update116.py b/dak/dakdb/update116.py
new file mode 100644
index 0000000..7baf052
--- /dev/null
+++ b/dak/dakdb/update116.py
@@ -0,0 +1,38 @@
+"""
+Add support for by-hash with a new table and per-suite boolean
+
+@contact: Debian FTP Master <ftpmaster@debian.org>
+@copyright: 2016, Julien Cristau <jcristau@debian.org>
+@license: GNU General Public License version 2 or later
+"""
+
+import psycopg2
+from daklib.dak_exceptions import DBUpdateError
+from daklib.config import Config
+
+def do_update(self):
+    """Add column to store whether to generate by-hash things per suite,
+    add table to store when by-hash files stopped being referenced
+    """
+    print __doc__
+    try:
+        c = self.db.cursor()
+
+        c.execute("ALTER TABLE suite ADD COLUMN byhash BOOLEAN DEFAULT false")
+
+        c.execute("""
+            CREATE TABLE hashfile (
+                suite_id INTEGER NOT NULL REFERENCES suite(id) ON DELETE CASCADE,
+                path TEXT NOT NULL,
+                unreferenced TIMESTAMP,
+                PRIMARY KEY (suite_id, path)
+            )
+             """)
+
+        c.execute("UPDATE config SET value = '116' WHERE name = 'db_revision'")
+
+        self.db.commit()
+
+    except psycopg2.ProgrammingError as msg:
+        self.db.rollback()
+        raise DBUpdateError('Unable to apply sick update 116, rollback issued. Error message : %s' % (str(msg)))
diff --git a/dak/generate_releases.py b/dak/generate_releases.py
index c359177..82ff639 100755
--- a/dak/generate_releases.py
+++ b/dak/generate_releases.py
@@ -37,6 +37,7 @@ import stat
 import time
 import gzip
 import bz2
+import errno
 import apt_pkg
 import subprocess
 from tempfile import mkstemp, mkdtemp
@@ -151,7 +152,9 @@ class ReleaseWriter(object):
 
         # Boolean stuff. If we find it true in database, write out "yes" into the release file
         boolattrs = ( ('NotAutomatic',         'notautomatic'),
-                      ('ButAutomaticUpgrades', 'butautomaticupgrades') )
+                      ('ButAutomaticUpgrades', 'butautomaticupgrades'),
+                      ('Acquire-By-Hash',      'byhash'),
+                    )
 
         cnf = Config()
 
@@ -284,6 +287,47 @@ class ReleaseWriter(object):
         out.close()
         os.rename(outfile + '.new', outfile)
 
+        if suite.byhash:
+            query = """
+                UPDATE hashfile SET unreferenced = CURRENT_TIMESTAMP
+                WHERE suite_id = :id AND unreferenced IS NULL"""
+            session.execute(query, {'id': suite.suite_id})
+
+            for filename in fileinfo:
+                if not os.path.exists(filename):
+                    # probably an uncompressed index we didn't generate
+                    continue
+
+                for h in hashfuncs:
+                    hashfile = os.path.join(os.path.dirname(filename), 'by-hash', h, fileinfo[filename][h])
+                    query = "SELECT 1 FROM hashfile WHERE path = :p AND suite_id = :id"
+                    q = session.execute(
+                            query,
+                            {'p': hashfile, 'id': suite.suite_id})
+                    if q.rowcount:
+                        session.execute('''
+                            UPDATE hashfile SET unreferenced = NULL
+                            WHERE path = :p and suite_id = :id''',
+                            {'p': hashfile, 'id': suite.suite_id})
+                    else:
+                        session.execute('''
+                            INSERT INTO hashfile (path, suite_id)
+                            VALUES (:p, :id)''',
+                            {'p': hashfile, 'id': suite.suite_id})
+
+                    try:
+                        os.makedirs(os.path.dirname(hashfile))
+                    except OSError as exc:
+                        if exc.errno != errno.EEXIST:
+                            raise
+                    try:
+                        os.link(filename, hashfile)
+                    except OSError as exc:
+                        if exc.errno != errno.EEXIST:
+                            raise
+
+                session.commit()
+
         sign_release_dir(suite, os.path.dirname(outfile))
 
         os.chdir(oldcwd)
-- 
2.8.1


Reply to: