[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

[snapshot/master] Faster incremental farm sync



---
 mirror/README                      |   46 +++++++++++++++
 mirror/farm-list-entries           |   81 +++++++++++++++++++++++++
 mirror/farm-make-tarball-from-list |   63 ++++++++++++++++++++
 mirror/process-farm-update         |  115 ++++++++++++++++++++++++++++++++++++
 mirror/ssh-wrap                    |   19 ++++++
 mirror/sync-farm-incremental       |   22 +++++++
 6 files changed, 346 insertions(+)
 create mode 100644 mirror/README
 create mode 100755 mirror/farm-list-entries
 create mode 100755 mirror/farm-make-tarball-from-list
 create mode 100755 mirror/process-farm-update
 create mode 100644 mirror/sync-farm-incremental

diff --git a/mirror/README b/mirror/README
new file mode 100644
index 0000000..6b6f980
--- /dev/null
+++ b/mirror/README
@@ -0,0 +1,46 @@
+
+To sync the farm to a mirror, we can run rsyncs, but these are
+expensive, so we would like to avoid having to do that often.
+
+Since the farm has a pretty specific structure, and files - once in it -
+never change, we can also do the following on a mirror:
+  Ask the master for a list of all files (or files starting with say 00),
+  compare that to a local list,
+  and request any we don't have yet.
+
+We have a set of scripts that help implement this:
+.    make a temporary directory, cd there,
+.
+.    farm-list-entries 00 |
+.      ssh master farm-make-update-tarball 00 |
+.      tar xaf -
+.    process-farm-update
+.
+.    rm -rf temporary directory
+
+
+Other scripts in this directory:
+ - ssh-wrap:
+   ssh authorized_keys command wrapper/dispatcher script
+
+ - sync-farm-completely: 
+   Runs one rsync each for all 256 top-level directories.
+
+ - sync-incremental:
+   Triggered from the master, run on the client, will run
+   farm-journal-fetch-tarball and import any new dumps.
+
+ - farm-journal-make-tarball,
+   farm-journal-fetch-tarball,
+   farm-journal-expire:
+   A different way to keep mirrors in sync is to keep a journal of added
+   files on the master.  Then we only need to consult this list and copy
+   the new files over to the client when we have updates.  These scripts
+   implement that behavior, and also expire old journal entries.
+
+ - import-new-dumps:
+   Prior to postgres replication, we would dump out mirrorruns to flat
+   files, ship these to the mirror which would then import it.  This
+   script is the importer.
+
+weasel, Tue, 28 Oct 2014 17:39:50 +0100
diff --git a/mirror/farm-list-entries b/mirror/farm-list-entries
new file mode 100755
index 0000000..de111de
--- /dev/null
+++ b/mirror/farm-list-entries
@@ -0,0 +1,81 @@
+#!/usr/bin/python
+
+# Copyright (c) 2010, 2014 Peter Palfrader
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Given a prefix (two nibbles), list all the elements in that part of the farm.
+# output order is undefined.
+#
+# With -d, do not list entries that already are given on stdin.
+
+import sys
+import optparse
+import os
+
+hexdigits = '0123456789abcdef'
+def is_hex(s):  # True iff every character of s is a lowercase hex digit
+  return not any(ch not in hexdigits for ch in s)
+
+parser = optparse.OptionParser()
+parser.set_usage("%prog --config=<conffile>|--farmpath=<path> <prefix>")
+parser.add_option("-c", "--config", dest="conffile", metavar="CONFFILE",
+  help="Config file location.")
+parser.add_option("-p", "--farmpath", dest="farmpath", metavar="FARMPATH",
+  help="Path to the snapshot farm.")
+parser.add_option("-d", "--diff", dest="diff", action="store_true",
+  help="Only list elements not given in stdin.")
+# Require at least one of --config/--farmpath and exactly one <prefix>.
+(options, args) = parser.parse_args()
+if (options.conffile is None and options.farmpath is None) or len(args) != 1:
+  parser.print_help()
+  sys.exit(1)
+# The prefix selects a top-level farm directory: two lowercase hex digits.
+prefix = args[0]
+if len(prefix) != 2 or not is_hex(prefix):
+  print >> sys.stderr, "Invalid prefix."
+  sys.exit(1)
+# An explicit --farmpath wins; otherwise read snapshot.farmpath from the YAML config.
+if options.farmpath is None:
+  import yaml
+  config = yaml.safe_load(open(options.conffile).read())
+  farmpath = config['snapshot']['farmpath']
+else:
+  farmpath = options.farmpath
+# With --diff, collect the 40-hex-digit names read from stdin; they are not listed again.
+existlist = []
+if options.diff:
+  for line in sys.stdin:
+    h = line.strip()
+    if len(h) != 40 or not is_hex(h):
+      print >>sys.stderr, "Ignoring invalid token %s"%(h)
+      continue
+    existlist.append(h)
+# A set gives fast membership tests when filtering the listing below.
+exist = set(existlist)
+
+for byte2 in xrange(256):
+  p = os.path.join(farmpath, prefix, '%02x'%byte2)
+  # A sparse farm may lack this second-level directory; treat it as empty.
+  for filename in (os.listdir(p) if os.path.isdir(p) else []):
+    if len(filename) == 40 and is_hex(filename) and filename not in exist:
+      print filename
+# vim:set et:
+# vim:set ts=4:
+# vim:set shiftwidth=4:
diff --git a/mirror/farm-make-tarball-from-list b/mirror/farm-make-tarball-from-list
new file mode 100755
index 0000000..fc9f4c6
--- /dev/null
+++ b/mirror/farm-make-tarball-from-list
@@ -0,0 +1,63 @@
+#!/usr/bin/python
+
+# Copyright (c) 2010, 2014 Peter Palfrader
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Given a list of hashes from stdin, make a tarball of their files.
+
+import tarfile
+import sys
+import yaml
+import optparse
+import os
+
+parser = optparse.OptionParser()
+parser.set_usage("%prog --config=<conffile>")
+parser.add_option("-c", "--config", dest="conffile", metavar="CONFFILE",
+  help="Config file location.")
+# --config is mandatory: the farm path is only taken from the config file.
+(options, args) = parser.parse_args()
+if options.conffile is None:
+  parser.print_help()
+  sys.exit(1)
+# Parsed YAML; make_path() reads config['snapshot']['farmpath'] from it.
+config = yaml.safe_load(open(options.conffile).read())
+
+def make_path(digest):
+  # Farm layout: <farmpath>/<digest[0:2]>/<digest[2:4]>/<digest>
+  farm_root = config['snapshot']['farmpath']
+  return os.path.join(farm_root, digest[:2], digest[2:4], digest)
+
+hexdigits = '0123456789abcdef'
+def is_hex(s):  # True iff s consists only of lowercase hex digits
+  return set(s).issubset(hexdigits)
+
+# Stream an uncompressed tar archive to stdout, one member per valid digest.
+tar = tarfile.open(fileobj=sys.stdout, mode="w|")
+for raw in sys.stdin:
+  token = raw.strip()
+  if len(token) == 40 and is_hex(token):
+    tar.add(make_path(token), arcname=token)
+  else:
+    print >>sys.stderr, "Ignoring invalid token %s"%(token)
+tar.close()
+# vim:set et:
+# vim:set ts=4:
+# vim:set shiftwidth=4:
diff --git a/mirror/process-farm-update b/mirror/process-farm-update
new file mode 100755
index 0000000..c2c1873
--- /dev/null
+++ b/mirror/process-farm-update
@@ -0,0 +1,115 @@
+#!/usr/bin/python
+
+# Copyright (c) 2010, 2014 Peter Palfrader
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+# Given a directory of hash-named files, sort them into the farm.
+
+import errno
+import hashlib
+import optparse
+import os
+import os.path
+import shutil
+import subprocess
+import sys
+import tarfile
+import tempfile
+
+parser = optparse.OptionParser()
+parser.set_usage("%prog --config=<conffile>|--farmpath=<path> <directory>")
+parser.add_option("-c", "--config", dest="conffile", metavar="CONFFILE",
+  help="Config file location.")
+parser.add_option("-p", "--farmpath", dest="farmpath", metavar="FARMPATH",
+  help="Path to the snapshot farm.")
+parser.add_option("-v", "--verbose", action="store_true",
+  help="Print each file as it is moved into the farm.")
+# Require at least one of --config/--farmpath and exactly one <directory>.
+(options, args) = parser.parse_args()
+if (options.conffile is None and options.farmpath is None) or len(args) != 1:
+  parser.print_help()
+  sys.exit(1)
+# Resolve the farm location before chdir so relative paths keep working.
+if options.farmpath is None:
+  import yaml
+  config = yaml.safe_load(open(options.conffile).read())
+  farmpath = config['snapshot']['farmpath']
+else:
+  farmpath = os.path.abspath(options.farmpath)
+# From here on, file names are relative to the directory of incoming files.
+os.chdir(args[0])
+
+def make_path(digest):
+    """Return the farm path for digest, creating its directories if missing.
+
+    Layout: <farmpath>/<digest[0:2]>/<digest[2:4]>/<digest>."""
+    level1 = os.path.join(farmpath, digest[0:2])
+    level2 = os.path.join(level1, digest[2:4])
+    for directory in (level1, level2):
+        if not os.path.exists(directory):
+            os.mkdir(directory)
+    return os.path.join(level2, digest)
+
+def hash_file(fn):
+    """Return the hex SHA-1 digest of file fn, read in 64 KiB chunks."""
+    h = hashlib.sha1()
+    with open(fn, 'rb') as f:  # binary mode; closed even if a read fails
+        while True:
+            c = f.read(64*1024)
+            if not c: break
+            h.update(c)
+    return h.hexdigest()
+
+def copy_file(src, dst):
+    """Copy src to a temporary dot-file beside dst, then hard-link it to dst."""
+    dst_dir, dst_name = os.path.split(dst)
+    tmpname = os.path.join(dst_dir, ".tmp.fetch-tarball." + dst_name)
+    shutil.copy2(src, tmpname)
+    os.link(tmpname, dst)
+    os.unlink(tmpname)
+
+def move_file_with_fallback(src, dst):
+    """Link src to dst unless dst exists (copy if cross-device), then remove src."""
+    if not os.path.exists(dst):
+        try:
+            os.link(src, dst)
+        except OSError, ex:
+            if ex.errno != errno.EXDEV: raise
+            copy_file(src, dst)
+    os.unlink(src)
+
+for fn in os.listdir('.'):
+    if fn == "meta": continue
+    # Every other entry must be named after the SHA-1 of its own content.
+    h = hash_file(fn)
+    if fn != h:
+        sys.stderr.write("Warning: File %s has unexpected hash value %s.  Ignoring\n"%(fn, h))
+        os.unlink(fn)
+        continue
+    # Verified: move it to its place in the farm (directories made on demand).
+    target = make_path(h)
+
+    if options.verbose: print "%s -> %s"%(fn,target)
+    move_file_with_fallback(fn, target)
+
+# vim:set et:
+# vim:set ts=4:
+# vim:set shiftwidth=4:
diff --git a/mirror/ssh-wrap b/mirror/ssh-wrap
index 2a572d9..01e6cae 100755
--- a/mirror/ssh-wrap
+++ b/mirror/ssh-wrap
@@ -75,6 +75,19 @@ farm_journal_fetch() {
 	"$HOME"/code/mirror/farm-journal-make-tarball -c "$SNAPCONFIG" -s "$since"
 }
 
+farm_make_update_tarball() {
+	# Emit a tar of the farm files under <prefix> that are not listed on stdin.
+	one_more_arg "$@"
+	local prefix="$1"
+	# Match in-shell: grep works per line and would accept "00<newline>junk".
+	if ! [[ "$prefix" =~ ^[0-9a-f][0-9a-f]$ ]]; then
+		croak "Invalid characters encountered in prefix '$prefix'"
+	fi
+	info "make-update-tarball $prefix"
+	"$HOME"/code/mirror/farm-list-entries -c "$SNAPCONFIG" --diff "$prefix" |
+	  "$HOME"/code/mirror/farm-make-tarball-from-list -c "$SNAPCONFIG"
+}
+
 do_rsync() {
 	local allowed_command_prefixes
 	allowed_rsyncs=(
@@ -125,6 +138,12 @@ case "$action" in
 		farm_journal_fetch "$@"
 		;;
 
+	farm-make-update-tarball)
+		require_master
+
+		farm_make_update_tarball "$@"
+		;;
+
 	git-upload-pack)
 		require_master
 
diff --git a/mirror/sync-farm-incremental b/mirror/sync-farm-incremental
new file mode 100644
index 0000000..1726f80
--- /dev/null
+++ b/mirror/sync-farm-incremental
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Incrementally sync ./farm from the master, one top-level prefix at a time.
+set -e
+set -o pipefail
+
+farm="$(readlink -f farm)"
+cd "$farm"
+for i in [0-9a-f][0-9a-f]; do
+  exec 200< "$i"
+  flock -e 200  # exclusive lock on the prefix directory while it is synced
+  # Stage inside the prefix dir (presumably so hard links into the farm work).
+  tmpdir="$(mktemp -d --tmpdir="$i" ".incremental-update-XXXXXX")"
+  tmpdir="$(readlink -f "$tmpdir")"
+  trap 'rm -rf -- "$tmpdir"' EXIT
+  # Send our list of entries; the master replies with a tar of what we lack.
+  ~/code/mirror/farm-list-entries --farmpath "$farm" "$i" |
+    ssh -4 -C snapshot-master.debian.org farm-make-update-tarball "$i" |
+    (cd "$tmpdir" && tar xaf -)
+
+  ~/code/mirror/process-farm-update --farmpath "$farm" "$tmpdir"
+  rm -rf -- "$tmpdir"
+done
-- 
1.7.10.4


Reply to: