tags 761869 + patch thanks There is still a minor issue with this patch: We cannot compute statistics using the group by query (as we will count multiple times each package) and so we still need the previous methods to exist.
From 06f8895c90693cbc78caa6c41043707e58c46610 Mon Sep 17 00:00:00 2001 From: Orestis Ioannou <orestis@oioannou.com> Date: Tue, 24 Mar 2015 20:01:33 +0100 Subject: [PATCH] statistics: improve time execution using group by query Update statistics stage computed using group by query. Benchmark references in doc/update-stats-query.bench.sql Closes: #761869 --- debsources/statistics.py | 51 ++++++++++++++++++++++++++++++++++++++++++ debsources/tests/test_stats.py | 19 ++++++++++++++++ debsources/updater.py | 39 +++++++++++++++++++++----------- 3 files changed, 96 insertions(+), 13 deletions(-) diff --git a/debsources/statistics.py b/debsources/statistics.py index b2904f4..4b2458e 100644 --- a/debsources/statistics.py +++ b/debsources/statistics.py @@ -120,6 +120,57 @@ def source_packages(session, suite=None, areas=None): return _count(q) +def stats_grouped_by(session, stat, areas=None): + ''' Compute statistics `stat` query using grouped by + to minimize time execution. + + Reference doc/update-stats-query.bench.sql + ''' + logging.debug('Compute stats for all suites') + if stat is 'source_packages': + q = (session.query(Suite.suite.label("suite"), + sql_func.count(Package.id)) + .join(Package) + .group_by(Suite.suite) + ) + elif stat is 'source_files': + q = (session.query(Suite.suite.label("suite"), + sql_func.count(Checksum.id)) + .join(Package) + .join(Checksum) + .group_by(Suite.suite) + ) + elif stat is 'disk_usage': + q = (session.query(Suite.suite.label("suite"), + sql_func.sum(Metric.value)) + .filter(Metric.metric == 'size') + .join(Package) + .join(Metric) + .group_by(Suite.suite) + ) + elif stat is 'ctags': + q = (session.query(Suite.suite.label('suite'), + sql_func.count(Ctag.id)) + .join(Package) + .join(Ctag) + .group_by(Suite.suite) + ) + elif stat is 'sloccount': + q = (session.query(Suite.suite.label('suite'), + SlocCount.language.label('language'), + sql_func.sum(SlocCount.count)) + .join(Package) + .join(SlocCount) + .group_by(Suite.suite, SlocCount.language) + ) + else: + logging.warn("Unrecognised stat %s" % stat) + return 0 + if areas: + q = q.filter(Package.area.in_(areas)) + return q.all() + + def source_files(session, suite=None, areas=None): """source files count diff --git a/debsources/tests/test_stats.py b/debsources/tests/test_stats.py index ce98548..f3bbb49 100644 --- a/debsources/tests/test_stats.py +++ b/debsources/tests/test_stats.py @@ -182,3 +182,22 @@ class Stats(unittest.TestCase, DbTestFixture): area_count = statistics.ctags(self.session, areas=['main']) self.assertEqual(area_count, 87507) self.assertLessEqual(area_count, statistics.ctags(self.session)) + + @istest + def test_group_by_stats(self): + self.assertEqual(dict(statistics.stats_grouped_by(self.session, + 'disk_usage'))['etch'], + 32736) + self.assertEqual(dict(statistics.stats_grouped_by(self.session, + 'ctags'))['wheezy'], + 20150) + self.assertEqual(dict(statistics.stats_grouped_by(self.session, + 'source_packages'))['jessie'], + 13) + self.assertEqual(dict(statistics.stats_grouped_by(self.session, + 'source_files'))['wheezy'], + 1632) + sloc_list= statistics.stats_grouped_by(self.session, 'sloccount') + wheezy_sloc = [[item[1], item[2]] for item in sloc_list + if item[0] == "wheezy"] + self.assertEqual(dict(wheezy_sloc)['sh'], 13560) diff --git a/debsources/updater.py b/debsources/updater.py index d0cc389..fdd72aa 100644 --- a/debsources/updater.py +++ b/debsources/updater.py @@ -460,20 +460,33 @@ def update_statistics(status, conf, session, suites=None): session.add(siz) session.add(loc) - # compute per-suite stats - for suite in suites: - siz = HistorySize(suite, timestamp=now) - loc = HistorySlocCount(suite, timestamp=now) - - suite_key = 'debian_' + suite + '.' - for stat in ['disk_usage', 'source_packages', 'source_files', 'ctags']: - v = getattr(statistics, stat)(session, suite) - stats[suite_key + stat] = v - setattr(siz, stat, v) - store_sloccount_stats(statistics.sloccount_summary(session, suite), - stats, suite_key + 'sloccount', loc) - if not conf['dry_run'] and 'db' in conf['backends']: + # Update HistorySize + suite_key = 'debian_' + hist_siz = dict((suite, HistorySize(suite, timestamp=now)) + for suite in suites) + for stat in ['disk_usage', 'source_packages', 'source_files', 'ctags']: + stats_result = statistics.stats_grouped_by(session, stat) + for res in stats_result: + stats[suite_key+res[0]+'.'+stat] = res[1] + setattr(hist_siz[res[0]], stat, res[1]) + + if not conf['dry_run'] and 'db' in conf['backends']: + for siz in hist_siz.values(): session.add(siz) + + # update historySlocCount + sloccount_res = statistics.stats_grouped_by(session, 'sloccount') + hist_loc = dict((suite, HistorySlocCount(suite, timestamp=now)) + for suite in suites) + for suite in suites: + temp = [[item[1], item[2]] for item in sloccount_res + if item[0] == suite] + store_sloccount_stats(dict(temp), stats, + suite_key + suite + ".sloccount", + hist_loc[suite]) + + if not conf['dry_run'] and 'db' in conf['backends']: + for loc in hist_loc.values(): session.add(loc) session.flush() -- 2.1.4
Attachment:
signature.asc
Description: OpenPGP digital signature