[dak/master] Add tool to de-duplicate pool
---
dak/archive_dedup_pool.py | 142 ++++++++++++++++++++++++++++++++++++++++++++++
dak/dak.py | 2 +
2 files changed, 144 insertions(+)
create mode 100755 dak/archive_dedup_pool.py
diff --git a/dak/archive_dedup_pool.py b/dak/archive_dedup_pool.py
new file mode 100755
index 0000000..7f3eb69
--- /dev/null
+++ b/dak/archive_dedup_pool.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python
+# vim:set et ts=4 sw=4:
+
+""" De-duplicates files in the pool directory
+
+@contact: Debian FTP Master <ftpmaster@debian.org>
+@copyright: 2017 Bastian Blank <waldi@debian.org>
+@license: GNU General Public License version 2 or later
+"""
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+################################################################################
+
+import apt_pkg
+import os
+import stat
+import sys
+
+from daklib.dbconn import DBConn
+from daklib import daklog
+from daklib.config import Config
+
+# Set by main(): the "Archive-Dedup-Pool::Options" configuration subtree.
+Options = None
+# Set by main(): the daklog.Logger instance that dedup_one() logs to.
+Logger = None
+
+################################################################################
+################################################################################
+################################################################################
+
+def usage (exit_code=0):
+ print """Usage: dak archive-dedup-pool [OPTION]...
+ -h, --help show this help and exit.
+ -V, --version display the version number and exit
+"""
+ sys.exit(exit_code)
+
+################################################################################
+
+def dedup_one(size, reference, *filenames):
+ stat_reference = os.stat(reference)
+
+ # safety net
+ if stat_reference.st_size != size:
+ raise RuntimeError('Size of {} does not match database: {} != {}'.format(
+ reference, size, stat_reference.st_size))
+
+ for filename in filenames:
+ stat_filename = os.stat(filename)
+
+ # if file is already a hard-linked, ignore
+ if stat_reference == stat_filename:
+ continue
+
+ # safety net
+ if stat_filename.st_size != size:
+ raise RuntimeError('Size of {} does not match database: {} != {}'.format(
+ filename, size, stat_filename.st_size))
+
+ tempfile = filename + '.new'
+ os.link(reference, tempfile)
+ try:
+ Logger.log(["deduplicate", filename, reference])
+ os.rename(tempfile, filename)
+ finally:
+ try:
+ os.unlink(tempfile)
+ except OSError as e:
+ if e.errno != errno.ENOENT:
+ raise
+
+################################################################################
+
+def dedup(session):
+ results = session.execute("""
+SELECT DISTINCT *
+ FROM (
+ SELECT DISTINCT ON (id) filenames, size
+ FROM (
+ SELECT
+ f1.id,
+ f1.size,
+ array_agg(av.path || '/pool/' || c.name || '/' || f2.filename) OVER (PARTITION BY f1.id, a1.archive_id ORDER by f2.created) AS filenames
+ FROM
+ files AS f1 INNER JOIN
+ files_archive_map AS a1 ON f1.id = a1.file_id INNER JOIN
+ files AS f2 ON f1.size = f2.size AND f1.sha256sum = f2.sha256sum INNER JOIN
+ files_archive_map AS a2 ON f2.id = a2.file_id INNER JOIN
+ component c ON a2.component_id = c.id INNER JOIN
+ archive av ON a1.archive_id = a2.archive_id AND a2.archive_id = av.id
+ ) AS f
+ WHERE array_length(filenames, 1) > 1
+ ORDER BY id, array_length(filenames, 1) DESC
+ ) AS f
+ ORDER by filenames;
+ """)
+
+ for i in results:
+ dedup_one(i['size'], *i['filenames'])
+
+################################################################################
+
+def main():
+ global Options, Logger
+
+ cnf = Config()
+ session = DBConn().session()
+
+ Arguments = [('h',"help","Archive-Dedup-Pool::Options::Help")]
+
+ apt_pkg.parse_commandline(cnf.Cnf,Arguments,sys.argv)
+
+ for i in ["help"]:
+ if not cnf.has_key("Archive-Dedup-Pool::Options::%s" % (i)):
+ cnf["Archive-Dedup-Pool::Options::%s" % (i)] = ""
+
+ Options = cnf.subtree("Archive-Dedup-Pool::Options")
+
+ if Options["Help"]:
+ usage()
+
+ Logger = daklog.Logger("archive-dedup-pool")
+
+ dedup(session)
+
+ Logger.close()
+
+################################################################################
+
+# Run main() when this file is executed as a script.
+if __name__ == '__main__':
+ main()
diff --git a/dak/dak.py b/dak/dak.py
index 191664f..7a31c46 100755
--- a/dak/dak.py
+++ b/dak/dak.py
@@ -155,6 +155,8 @@ def init():
"Generate a list of override disparities"),
("external-overrides",
"Modify external overrides"),
+ ("archive-dedup-pool",
+ "De-duplicates files in the pool directory"),
]
return functionality
--
2.1.4
Reply to: