[snapshot/master] Faster incremental farm sync
---
mirror/README | 46 +++++++++++++++
mirror/farm-list-entries | 81 +++++++++++++++++++++++++
mirror/farm-make-tarball-from-list | 63 ++++++++++++++++++++
mirror/process-farm-update | 115 ++++++++++++++++++++++++++++++++++++
mirror/ssh-wrap | 19 ++++++
mirror/sync-farm-incremental | 22 +++++++
6 files changed, 346 insertions(+)
create mode 100644 mirror/README
create mode 100755 mirror/farm-list-entries
create mode 100755 mirror/farm-make-tarball-from-list
create mode 100755 mirror/process-farm-update
create mode 100644 mirror/sync-farm-incremental
diff --git a/mirror/README b/mirror/README
new file mode 100644
index 0000000..6b6f980
--- /dev/null
+++ b/mirror/README
@@ -0,0 +1,46 @@
+
+To sync the farm to a mirror, we can run rsyncs, but these are
+expensive, so we would like to avoid having to do that often.
+
+Since the farm has a pretty specific structure, and files - once in it -
+never change, we can also do the following on a mirror:
+ Ask the master for a list of all files (or files starting with say 00),
+ compare that to a local list,
+ and request any we don't have yet.
+
+We have a set of scripts that help implement this:
+. make a temporary directory, cd there,
+.
+. farm-list-entries 00 |
+. ssh master farm-make-update-tarball 00 |
+. tar xaf -
+. process-farm-update
+.
+. rm -rf temporary directory
+
+
+Other scripts in this directory:
+ - ssh-wrap:
+ ssh authorized_keys command wrapper/dispatcher script
+
+ - sync-farm-completely:
+ Runs one rsync each for all 256 top-level directories.
+
+ - sync-incremental:
+ Triggered from the master, run on the client, will run
+ farm-journal-fetch-tarball and import any new dumps.
+
+ - farm-journal-make-tarball,
+ farm-journal-fetch-tarball,
+ farm-journal-expire:
+ A different way to keep mirrors in sync is to keep a journal of added
+ files on the master. Then we only need to consult this list and copy
+ the new files over to the client when we have updates. These scripts
+ implement that behavior, and also expire old journal entries.
+
+ - import-new-dumps:
+ Prior to postgres replication, we would dump out mirrorruns to flat
+ files and ship these to the mirror, which would then import them. This
+ script is the importer.
+
+weasel, Tue, 28 Oct 2014 17:39:50 +0100
diff --git a/mirror/farm-list-entries b/mirror/farm-list-entries
new file mode 100755
index 0000000..de111de
--- /dev/null
+++ b/mirror/farm-list-entries
@@ -0,0 +1,81 @@
+#!/usr/bin/python
+
+# Copyright (c) 2010, 2014 Peter Palfrader
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Given a prefix (two nibbles), list all the elements in that part of the farm.
+# output order is undefined.
+#
+# With -d, do not list entries that already are given on stdin.
+
+import sys
+import optparse
+import os
+
+# Characters accepted in a digest or prefix: lowercase hexadecimal only.
+hexdigits = '0123456789abcdef'
+# Return True when every character of s is a lowercase hex digit.  This is
+# vacuously True for the empty string, so callers also check the length.
+def is_hex(s):
+ return all(c in hexdigits for c in s)
+
+# Command line: exactly one positional <prefix>, plus at least one of
+# --config / --farmpath to locate the farm.  --diff switches on reading the
+# list of already-present digests from stdin.
+parser = optparse.OptionParser()
+parser.set_usage("%prog --config=<conffile>|--farmpath=<path> <prefix>")
+parser.add_option("-c", "--config", dest="conffile", metavar="CONFFILE",
+ help="Config file location.")
+parser.add_option("-p", "--farmpath", dest="farmpath", metavar="FARMPATH",
+ help="Path to the snapshot farm.")
+parser.add_option("-d", "--diff", dest="diff", action="store_true",
+ help="Only list elements not given in stdin.")
+
+(options, args) = parser.parse_args()
+# No way to locate the farm, or wrong number of positionals: print help, exit 1.
+if (options.conffile is None and options.farmpath is None) or len(args) != 1:
+ parser.print_help()
+ sys.exit(1)
+
+# The prefix selects one top-level farm directory: exactly two lowercase hex
+# digits.
+prefix = args[0]
+if len(prefix) != 2 or not is_hex(prefix):
+ print >> sys.stderr, "Invalid prefix."
+ sys.exit(1)
+
+# An explicit --farmpath wins; otherwise snapshot.farmpath is read from the
+# YAML config.  yaml is imported only on the config path.
+if options.farmpath is None:
+ import yaml
+ config = yaml.safe_load(open(options.conffile).read())
+ farmpath = config['snapshot']['farmpath']
+else:
+ farmpath = options.farmpath
+
+# With --diff, stdin carries one digest per line (40 lowercase hex characters
+# after stripping surrounding whitespace).  Anything else is reported on
+# stderr and skipped.  Without --diff the list stays empty and nothing is
+# filtered out below.
+existlist = []
+if options.diff:
+ for line in sys.stdin:
+ h = line.strip()
+ if len(h) != 40 or not is_hex(h):
+ print >>sys.stderr, "Ignoring invalid token %s"%(h)
+ continue
+ existlist.append(h)
+
+# A set, so the membership test in the listing loop does not scan a list.
+exist = set(existlist)
+
+for byte2 in xrange(256):
+ p = os.path.join(farmpath, prefix, '%02x'%byte2)
+ for filename in os.listdir(p):
+ if len(filename) == 40 and is_hex(filename) and not filename in exist:
+ print filename
+
+# vim:set et:
+# vim:set ts=4:
+# vim:set shiftwidth=4:
diff --git a/mirror/farm-make-tarball-from-list b/mirror/farm-make-tarball-from-list
new file mode 100755
index 0000000..fc9f4c6
--- /dev/null
+++ b/mirror/farm-make-tarball-from-list
@@ -0,0 +1,63 @@
+#!/usr/bin/python
+
+# Copyright (c) 2010, 2014 Peter Palfrader
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Given a list of hashes from stdin, make a tarball of their files.
+
+import tarfile
+import sys
+import yaml
+import optparse
+import os
+
+# Command line: --config is mandatory; without it print the help and exit 1.
+parser = optparse.OptionParser()
+parser.set_usage("%prog --config=<conffile>")
+parser.add_option("-c", "--config", dest="conffile", metavar="CONFFILE",
+ help="Config file location.")
+
+(options, args) = parser.parse_args()
+if options.conffile is None:
+ parser.print_help()
+ sys.exit(1)
+
+# Parsed YAML configuration; make_path() reads snapshot.farmpath from it.
+config = yaml.safe_load(open(options.conffile).read())
+
+# Map a digest to its location in the farm:
+#   <farmpath>/<digest[0:2]>/<digest[2:4]>/<digest>
+# Nothing is created or checked on disk here.
+def make_path(digest):
+ prefix1 = digest[0:2]
+ prefix2 = digest[2:4]
+ return os.path.join(config['snapshot']['farmpath'], prefix1, prefix2, digest)
+
+# Characters accepted in a digest: lowercase hexadecimal only.
+hexdigits = '0123456789abcdef'
+# Return True when every character of s is a lowercase hex digit.  This is
+# vacuously True for the empty string, so the caller also checks the length.
+def is_hex(s):
+ return all(c in hexdigits for c in s)
+
+# Write a tar stream to stdout; mode "w|" is tarfile's uncompressed,
+# non-seekable stream-writing mode.  Each valid digest read from stdin is
+# added under its bare digest name, without any directory component.
+tar = tarfile.open(mode="w|", fileobj=sys.stdout)
+for line in sys.stdin:
+ h = line.strip()
+ # Report and skip anything that is not exactly 40 lowercase hex characters.
+ if len(h) != 40 or not is_hex(h):
+ print >>sys.stderr, "Ignoring invalid token %s"%(h)
+ continue
+ tar.add(make_path(h), arcname=h)
+tar.close()
+
+# vim:set et:
+# vim:set ts=4:
+# vim:set shiftwidth=4:
diff --git a/mirror/process-farm-update b/mirror/process-farm-update
new file mode 100755
index 0000000..c2c1873
--- /dev/null
+++ b/mirror/process-farm-update
@@ -0,0 +1,115 @@
+#!/usr/bin/python
+
+# Copyright (c) 2010, 2014 Peter Palfrader
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+# Given a directory of hash-named files, sort them into the farm.
+
+import errno
+import hashlib
+import optparse
+import os
+import os.path
+import shutil
+import subprocess
+import sys
+import tarfile
+import tempfile
+
+parser = optparse.OptionParser()
+parser.set_usage("%prog --config=<conffile> <directory>")
+parser.add_option("-c", "--config", dest="conffile", metavar="CONFFILE",
+ help="Config file location.")
+parser.add_option("-p", "--farmpath", dest="farmpath", metavar="FARMPATH",
+ help="Path to the snapshot farm.")
+parser.add_option("-v", "--verbose", action="store_true",
+ help="Config file location.")
+
+(options, args) = parser.parse_args()
+if (options.conffile is None and options.farmpath is None) or len(args) != 1:
+ parser.print_help()
+ sys.exit(1)
+
+# Work inside the directory of unpacked files; the main loop lists '.'.
+# NOTE(review): because this chdir comes first, a relative --config or
+# --farmpath is resolved against <directory>, not the caller's working
+# directory.
+os.chdir(args[0])
+
+# An explicit --farmpath wins; otherwise snapshot.farmpath is read from the
+# YAML config.  yaml is imported only on the config path.
+if options.farmpath is None:
+ import yaml
+ config = yaml.safe_load(open(options.conffile).read())
+ farmpath = config['snapshot']['farmpath']
+else:
+ farmpath = options.farmpath
+
+# Return the farm location for a digest,
+#   <farmpath>/<digest[0:2]>/<digest[2:4]>/<digest>
+# creating each of the two intermediate directories if it does not exist yet.
+def make_path(digest):
+ prefix1 = digest[0:2]
+ prefix2 = digest[2:4]
+
+ dir = farmpath
+ for elem in (prefix1, prefix2):
+ dir = os.path.join(dir, elem)
+ if not os.path.exists(dir):
+ os.mkdir(dir)
+ return os.path.join(dir, digest)
+
+# Return the hexadecimal SHA-1 digest of the file named fn.  The file is read
+# in 64 KiB chunks, so it is never held in memory as a whole.
+def hash_file(fn):
+ h = hashlib.sha1()
+ f = open(fn)
+ while True:
+ c = f.read(64*1024)
+ # An empty read signals end of file.
+ if c == "": break
+ h.update(c)
+ f.close()
+ return h.hexdigest()
+
+# Copy src to dst via a temporary name: the data (and metadata, via copy2) is
+# first written to ".tmp.fetch-tarball.<basename>" in dst's directory, that
+# file is then hard-linked to dst, and the temporary name is removed.  dst
+# therefore only appears once the copy is complete.
+def copy_file(src, dst):
+ tmpname = os.path.join(os.path.dirname(dst), ".tmp.fetch-tarball."+os.path.basename(dst))
+ shutil.copy2(src, tmpname)
+ os.link(tmpname, dst)
+ os.unlink(tmpname)
+
+# Move src to dst, preferring a hard link and falling back to copy_file()
+# when linking fails with EXDEV (src and dst on different filesystems).  Any
+# other OSError from the link is re-raised.  The link is only attempted when
+# dst does not exist yet.
+# NOTE(review): indentation is not visible in this rendering of the patch.
+# Presumably copy_file() belongs to the except branch and the final unlink of
+# src runs in every case -- confirm against the real file.
+def move_file_with_fallback(src, dst):
+ try:
+ if not os.path.exists(dst):
+ os.link(src, dst)
+ except OSError, ex:
+ if ex.errno not in [errno.EXDEV]:
+ raise
+ copy_file(src, dst)
+ os.unlink(src)
+
+
+
+for fn in os.listdir('.'):
+ if fn == "meta": continue
+
+ h = hash_file(fn)
+ if fn != h:
+ sys.stderr.write("Warning: File %s has unexected hash value %s. Ignoring\n"%(fn, h))
+ os.unlink(fn)
+ continue
+
+ target = make_path(h)
+
+ if options.verbose: print "%s -> %s"%(fn,target)
+ move_file_with_fallback(fn, target)
+
+# vim:set et:
+# vim:set ts=4:
+# vim:set shiftwidth=4:
diff --git a/mirror/ssh-wrap b/mirror/ssh-wrap
index 2a572d9..01e6cae 100755
--- a/mirror/ssh-wrap
+++ b/mirror/ssh-wrap
@@ -75,6 +75,19 @@ farm_journal_fetch() {
"$HOME"/code/mirror/farm-journal-make-tarball -c "$SNAPCONFIG" -s "$since"
}
+farm_make_update_tarball() {
+ one_more_arg "$@"
+ local prefix="$1"
+
+ if ! echo "$prefix" | grep -q '^[0-9a-f][0-9a-f]$'; then
+ croak "Invalid characters encountered in prefix '$prefix'"
+ fi
+
+ info "make-update-tarball $prefix"
+ "$HOME"/code/mirror/farm-list-entries -c "$SNAPCONFIG" --diff "$prefix" |
+ "$HOME"/code/mirror/farm-make-tarball-from-list -c "$SNAPCONFIG"
+}
+
do_rsync() {
local allowed_command_prefixes
allowed_rsyncs=(
@@ -125,6 +138,12 @@ case "$action" in
farm_journal_fetch "$@"
;;
+ farm-make-update-tarball)
+ require_master
+
+ farm_make_update_tarball "$@"
+ ;;
+
git-upload-pack)
require_master
diff --git a/mirror/sync-farm-incremental b/mirror/sync-farm-incremental
new file mode 100644
index 0000000..1726f80
--- /dev/null
+++ b/mirror/sync-farm-incremental
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+set -e
+set -o pipefail
+
+farm="$(readlink -f farm)"
+cd "$farm"
+for i in *; do
+ exec 200< "$i"
+ flock -e 200
+
+ tmpdir="$(mktemp -d --tmpdir="$i" ".incremental-update-XXXXXX")"
+ tmpdir="$(readlink -f "$tmpdir")"
+ trap "rm -rf '$tmpdir'" EXIT
+
+ ~/code/mirror/farm-list-entries --farmpath "$farm" "$i" |
+ ssh -4 -C snapshot-master.debian.org farm-make-update-tarball "$i" |
+ (cd $tmpdir && tar xaf -)
+
+ ~/code/mirror/process-farm-update --farmpath "$farm" "$tmpdir"
+ rm -rf "$tmpdir"
+done
--
1.7.10.4
Reply to: