[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

[dak/master] Modified to create caches on disk so COPY can be used, INCOMPLETE!



Signed-off-by: Michael Casadevall <sonicmctails@gmail.com>
---
 dak/import_contents.py |  102 ++++++++++++++++++++++++++---------------------
 daklib/database.py     |   38 +++++++++++++++---
 2 files changed, 88 insertions(+), 52 deletions(-)

diff --git a/dak/import_contents.py b/dak/import_contents.py
index 0669195..d19b1ea 100755
--- a/dak/import_contents.py
+++ b/dak/import_contents.py
@@ -33,9 +33,12 @@ Cnf = None
 projectB = None
 out = None
 AptCnf = None
-content_path_id_cache = {}
-content_file_id_cache = {}
-insert_contents_file_cache = {}
+has_opened_temp_file_lists = False
+content_path_file = ""
+content_name_file = ""
+content_file_cache = {}
+content_name_cache = {}
+content_path_cache = {}
 
 ################################################################################
 
@@ -50,46 +53,31 @@ Import Contents files
 
 ################################################################################
 
-
-def set_contents_file_id(file):
-    global content_file_id_cache
-
-    if not content_file_id_cache.has_key(file):
-        # since this can be called within a transaction, we can't use currval
-        q = projectB.query("INSERT INTO content_file_names VALUES (DEFAULT, '%s') RETURNING id" % (file))
-        content_file_id_cache[file] = int(q.getresult()[0][0])
-    return content_file_id_cache[file]
-
-################################################################################
-
-def set_contents_path_id(path):
-    global content_path_id_cache
-
-    if not content_path_id_cache.has_key(path):
-        q = projectB.query("INSERT INTO content_file_paths VALUES (DEFAULT, '%s') RETURNING id" % (path))
-        content_path_id_cache[path] = int(q.getresult()[0][0])
-    return content_path_id_cache[path]
-
-################################################################################
-
-def insert_content_path(bin_id, fullpath):
-    global insert_contents_file_cache
-    cache_key = "%s_%s" % (bin_id, fullpath)
+def cache_content_path(fullpath):
+    global content_file_cache, content_name_cache, content_path_cache
+    global content_path_file, content_name_file, has_opened_temp_file_lists
 
     # have we seen this contents before?
-    # probably only revelant during package import
-    if insert_contents_file_cache.has_key(cache_key):
+    if content_file_cache.has_key(fullpath):
         return
 
     # split the path into basename, and pathname
     (path, file)  = os.path.split(fullpath)
 
-    # Get the necessary IDs ...
-    file_id = set_contents_file_id(file)
-    path_id = set_contents_path_id(path)
+    # For performance reasons, we need to get the entire filelists table
+    # sorted first before we can do association tables.
+    if has_opened_temp_file_lists == False:
+        content_path_file = open("/tmp/content_file_path.tmp", "w")
+        content_name_file = open("/tmp/content_name_path.tmp", "w")
+        has_opened_temp_file_lists = True
 
-    # Put them into content_assiocations
-    projectB.query("INSERT INTO content_associations VALUES (DEFAULT, '%d', '%d', '%d')" % (bin_id, path_id, file_id))
+    if not content_path_cache.has_key(path):
+        content_path_file.write("DEFAULT %s\n" % (path))
+        content_path_cache[path] = 1
+
+    if not content_name_cache.has_key(file):
+        content_name_file.write("DEFAULT %s\n" % (file))
+        content_name_cache[file] = 1
     return
 
 ################################################################################
@@ -103,6 +91,10 @@ def import_contents(suites):
     # Needed to make sure postgreSQL doesn't freak out on some of the data
     projectB.query("SET CLIENT_ENCODING TO 'LATIN1'")
 
+    # Precache everything
+    #print "Precaching binary information, this will take a few moments ..."
+    #database.preload_binary_id_cache()
+
     # Prep regexs
     line_regex = re.compile(r'^(.+?)\s+(\S+)$')
     pkg_regex = re.compile(r'(\S+)/(\S+)$')
@@ -124,7 +116,14 @@ def import_contents(suites):
         for arch in arch_list:
             print "Processing %s/%s" % (s, arch[1])
             arch_id = database.get_architecture_id(arch[1])
-            f = gzip.open(Cnf["Dir::Root"] + "dists/%s/Contents-%s.gz" % (s, arch[1]), "r")
+
+            try:
+                f = gzip.open(Cnf["Dir::Root"] + "dists/%s/Contents-%s.gz" % (s, arch[1]), "r")
+
+            except:
+                print "Unable to open dists/%s/Contents-%s.gz" % (s, arch[1])
+                print "Skipping ..."
+                continue
 
             # Get line count
             lines = f.readlines()
@@ -157,28 +156,39 @@ def import_contents(suites):
                 filename = matchs[0][0]
                 packages = matchs[0][1].split(',')
 
+
+                cache_content_path(filename)
+
                 # Iterate through each file's packages
-                for package in packages:
-                    matchs = pkg_regex.findall(package)
+                #for package in packages:
+                #    matchs = pkg_regex.findall(package)
 
                     # Needed since the DB is unicode, and these files
                     # are ASCII
-                    section_name = matchs[0][0]
-                    package_name = matchs[0][1]
+                #    section_name = matchs[0][0]
+                #    package_name = matchs[0][1]
 
-                    section_id = database.get_section_id(section_name)
-                    package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_id)
+                    #section_id = database.get_section_id(section_name)
+                    #package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_id)
 
-                    if package_id == None:
-                        # Likely got an arch all package
-                        package_id = database.get_latest_binary_version_id(package_name, section_id, suite_id, arch_all_id)
+               #     if package_id == None:
+                        # This can happen if the Contents file refers to a non-existent package
+                        # it seems Contents sometimes can be stale due to use of caches (i.e., hurd-i386)
+                        # hurd-i386 was removed from the archive, but its Contents file still exists
+                        # and is seemingly still updated. The sane thing to do is skip it and continue
+               #         continue
 
-                    insert_content_path(package_id, filename)
 
                 lines_processed += 1
+
+            print "" # newline since the Progress bar doesn't print one
             f.close()
 
     # Commit work
+
+    content_name_file.close()
+    content_path_file.close()
+
     print "Committing to database ..."
     projectB.query("COMMIT")
 
diff --git a/daklib/database.py b/daklib/database.py
index f6bedf3..9cefc38 100755
--- a/daklib/database.py
+++ b/daklib/database.py
@@ -46,6 +46,7 @@ suite_bin_version_cache = {}
 content_path_id_cache = {}
 content_file_id_cache = {}
 insert_contents_file_cache = {}
+cache_preloaded = False
 
 ################################################################################
 
@@ -254,22 +255,47 @@ def get_suite_version(source, suite, arch):
 def get_latest_binary_version_id(binary, section, suite, arch):
     global suite_bin_version_cache
     cache_key = "%s_%s_%s_%s" % (binary, section, suite, arch)
+    cache_key_all = "%s_%s_%s_%s" % (binary, section, suite, get_architecture_id("all"))
 
+    # Check for the cache hit for its arch, then arch all
     if suite_bin_version_cache.has_key(cache_key):
         return suite_bin_version_cache[cache_key]
+    if suite_bin_version_cache.has_key(cache_key_all):
+        return suite_bin_version_cache[cache_key_all]
+    if cache_preloaded == True:
+        return # package does not exist
 
-    q = projectB.query("SELECT b.id, b.version FROM binaries b JOIN bin_associations ba ON (b.id = ba.bin) JOIN override o ON (o.package=b.package) WHERE b.package = '%s' AND b.architecture = '%d' AND ba.suite = '%d' AND o.section = '%d'" % (binary, int(arch), int(suite), int(section)))
+    q = projectB.query("SELECT DISTINCT b.id FROM binaries b JOIN bin_associations ba ON (b.id = ba.bin) JOIN override o ON (o.package=b.package) WHERE b.package = '%s' AND b.architecture = '%d' AND ba.suite = '%d' AND o.section = '%d'" % (binary, int(arch), int(suite), int(section)))
 
-    highest_bid, highest_version = None, None
+    if not q.getresult():
+        return False
 
-    for bi in q.getresult():
-        if highest_version == None or apt_pkg.VersionCompare(bi[1], highest_version) == 1:
-             highest_bid = bi[0]
-             highest_version = bi[1]
+    highest_bid = q.getresult()[0][0]
 
     suite_bin_version_cache[cache_key] = highest_bid
     return highest_bid
 
+def preload_binary_id_cache():
+    global suite_bin_version_cache, cache_preloaded
+
+    # Get suite info
+    q = projectB.query("SELECT id FROM suite")
+    suites = q.getresult()
+
+    # Get arch mappings
+    q = projectB.query("SELECT id FROM architecture")
+    arches = q.getresult()
+
+    for suite in suites:
+        for arch in arches:
+            q = projectB.query("SELECT DISTINCT b.id, b.package, o.section FROM binaries b JOIN bin_associations ba ON (b.id = ba.bin) JOIN override o ON (o.package=b.package) WHERE b.architecture = '%d' AND ba.suite = '%d'" % (int(arch[0]), int(suite[0])))
+
+            for bi in q.getresult():
+                cache_key = "%s_%s_%s_%s" % (bi[1], bi[2], suite[0], arch[0])
+                suite_bin_version_cache[cache_key] = int(bi[0])
+
+    cache_preloaded = True
+
 ################################################################################
 
 def get_or_set_maintainer_id (maintainer):
-- 
1.5.6.5



Reply to: