[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#761869: [Patch] statistics: improve time execution using group by query



tags 761869 + patch
thanks

There is still a minor issue with this patch: We cannot compute
statistics using the group by query (as we will count multiple times
each package) and so we still need the previous methods to exist.
From 06f8895c90693cbc78caa6c41043707e58c46610 Mon Sep 17 00:00:00 2001
From: Orestis Ioannou <orestis@oioannou.com>
Date: Tue, 24 Mar 2015 20:01:33 +0100
Subject: [PATCH] statistics: improve time execution using group by query

Update statistics stage computed using group by query.
Benchmark references in doc/update-stats-query.bench.sql
Closes: #761869
---
 debsources/statistics.py       | 51 ++++++++++++++++++++++++++++++++++++++++++
 debsources/tests/test_stats.py | 19 ++++++++++++++++
 debsources/updater.py          | 39 +++++++++++++++++++++-----------
 3 files changed, 96 insertions(+), 13 deletions(-)

diff --git a/debsources/statistics.py b/debsources/statistics.py
index b2904f4..4b2458e 100644
--- a/debsources/statistics.py
+++ b/debsources/statistics.py
@@ -120,6 +120,57 @@ def source_packages(session, suite=None, areas=None):
     return _count(q)
 
 
+def stats_grouped_by(session, stat, areas=None):
+    ''' Compute statistics `stat` query using grouped by
+        to minimize time execution.
+
+        Reference doc/update-stats-query.bench.sql
+    '''
+    logging.debug('Compute stats for all suites')
+    if stat is 'source_packages':
+        q = (session.query(Suite.suite.label("suite"),
+                           sql_func.count(Package.id))
+             .join(Package)
+             .group_by(Suite.suite)
+             )
+    elif stat is 'source_files':
+        q = (session.query(Suite.suite.label("suite"),
+                           sql_func.count(Checksum.id))
+             .join(Package)
+             .join(Checksum)
+             .group_by(Suite.suite)
+             )
+    elif stat is 'disk_usage':
+        q = (session.query(Suite.suite.label("suite"),
+                           sql_func.sum(Metric.value))
+             .filter(Metric.metric == 'size')
+             .join(Package)
+             .join(Metric)
+             .group_by(Suite.suite)
+             )
+    elif stat is 'ctags':
+        q = (session.query(Suite.suite.label('suite'),
+                           sql_func.count(Ctag.id))
+             .join(Package)
+             .join(Ctag)
+             .group_by(Suite.suite)
+             )
+    elif stat is 'sloccount':
+        q = (session.query(Suite.suite.label('suite'),
+                           SlocCount.language.label('language'),
+                           sql_func.sum(SlocCount.count))
+             .join(Package)
+             .join(SlocCount)
+             .group_by(Suite.suite, SlocCount.language)
+             )
+    else:
+        logging.warn("Unrecognised stat %s" % stat)
+        return 0
+    if areas:
+        q = q.filter(Package.area.in_(areas))
+    return q.all()
+
+
 def source_files(session, suite=None, areas=None):
     """source files count
 
diff --git a/debsources/tests/test_stats.py b/debsources/tests/test_stats.py
index ce98548..f3bbb49 100644
--- a/debsources/tests/test_stats.py
+++ b/debsources/tests/test_stats.py
@@ -182,3 +182,22 @@ class Stats(unittest.TestCase, DbTestFixture):
         area_count = statistics.ctags(self.session, areas=['main'])
         self.assertEqual(area_count, 87507)
         self.assertLessEqual(area_count, statistics.ctags(self.session))
+
+    @istest
+    def test_group_by_stats(self):
+        self.assertEqual(dict(statistics.stats_grouped_by(self.session,
+                                                          'disk_usage'))['etch'],
+                         32736)
+        self.assertEqual(dict(statistics.stats_grouped_by(self.session,
+                                                          'ctags'))['wheezy'],
+                         20150)
+        self.assertEqual(dict(statistics.stats_grouped_by(self.session,
+                                                          'source_packages'))['jessie'],
+                         13)
+        self.assertEqual(dict(statistics.stats_grouped_by(self.session,
+                                                          'source_files'))['wheezy'],
+                         1632)
+        sloc_list= statistics.stats_grouped_by(self.session, 'sloccount')
+        wheezy_sloc = [[item[1], item[2]] for item in sloc_list
+                       if item[0] == "wheezy"]
+        self.assertEqual(dict(wheezy_sloc)['sh'], 13560)
diff --git a/debsources/updater.py b/debsources/updater.py
index d0cc389..fdd72aa 100644
--- a/debsources/updater.py
+++ b/debsources/updater.py
@@ -460,20 +460,33 @@ def update_statistics(status, conf, session, suites=None):
         session.add(siz)
         session.add(loc)
 
-    # compute per-suite stats
-    for suite in suites:
-        siz = HistorySize(suite, timestamp=now)
-        loc = HistorySlocCount(suite, timestamp=now)
-
-        suite_key = 'debian_' + suite + '.'
-        for stat in ['disk_usage', 'source_packages', 'source_files', 'ctags']:
-            v = getattr(statistics, stat)(session, suite)
-            stats[suite_key + stat] = v
-            setattr(siz, stat, v)
-        store_sloccount_stats(statistics.sloccount_summary(session, suite),
-                              stats, suite_key + 'sloccount', loc)
-        if not conf['dry_run'] and 'db' in conf['backends']:
+    # Update HistorySize
+    suite_key = 'debian_'
+    hist_siz = dict((suite, HistorySize(suite, timestamp=now))
+                    for suite in suites)
+    for stat in ['disk_usage', 'source_packages', 'source_files', 'ctags']:
+        stats_result = statistics.stats_grouped_by(session, stat)
+        for res in stats_result:
+            stats[suite_key+res[0]+'.'+stat] = res[1]
+            setattr(hist_siz[res[0]], stat, res[1])
+
+    if not conf['dry_run'] and 'db' in conf['backends']:
+        for siz in hist_siz.values():
             session.add(siz)
+
+    # update historySlocCount
+    sloccount_res = statistics.stats_grouped_by(session, 'sloccount')
+    hist_loc = dict((suite, HistorySlocCount(suite, timestamp=now))
+                    for suite in suites)
+    for suite in suites:
+        temp = [[item[1], item[2]] for item in sloccount_res
+                if item[0] == suite]
+        store_sloccount_stats(dict(temp), stats,
+                              suite_key + suite + ".sloccount",
+                              hist_loc[suite])
+
+    if not conf['dry_run'] and 'db' in conf['backends']:
+        for loc in hist_loc.values():
             session.add(loc)
 
     session.flush()
-- 
2.1.4

Attachment: signature.asc
Description: OpenPGP digital signature


Reply to: