summaryrefslogtreecommitdiff
path: root/scripts/summarize_build_stats.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/summarize_build_stats.py')
-rw-r--r--scripts/summarize_build_stats.py93
1 files changed, 87 insertions, 6 deletions
diff --git a/scripts/summarize_build_stats.py b/scripts/summarize_build_stats.py
index 404a18d3e..a4a965af4 100644
--- a/scripts/summarize_build_stats.py
+++ b/scripts/summarize_build_stats.py
@@ -7,6 +7,7 @@
from __future__ import print_function
import datetime
+import itertools
import numpy
import re
import sys
@@ -14,6 +15,7 @@ import sys
from chromite.cbuildbot import constants
from chromite.lib import cidb
from chromite.lib import commandline
+from chromite.lib import cros_build_lib
from chromite.lib import cros_logging as logging
@@ -36,6 +38,8 @@ class CLStatsEngine(object):
self.blames = {}
self.summary = {}
self.builds_by_build_id = {}
+ self.slave_builds_by_master_id = {}
+ self.slave_builds_by_config = {}
def GatherBuildAnnotations(self):
"""Gather the failure annotations for builds from cidb."""
@@ -158,6 +162,16 @@ class CLStatsEngine(object):
self.builds_by_build_id.update(
{b['id'] : b for b in self.builds})
+ # Gather slave statuses for each of the master builds. For now this is a
+ # separate query per CQ run, but this could be consolidated to a single
+ # query if necessary (requires adding a cidb.py API method).
+ for bid in self.builds_by_build_id:
+ self.slave_builds_by_master_id[bid] = self.db.GetSlaveStatuses(bid)
+
+ self.slave_builds_by_config = cros_build_lib.GroupByKey(
+ itertools.chain(*self.slave_builds_by_master_id.values()),
+ 'build_config')
+
def _PrintCounts(self, reasons, fmt):
"""Print a sorted list of reasons in descending order of frequency.
@@ -199,7 +213,23 @@ class CLStatsEngine(object):
)
return false_rejection_rate
- def Summarize(self, build_type):
+ def GetBuildRunTimes(self, builds):
+ """Gets the elapsed run times of the completed builds within |builds|.
+
+ Args:
+ builds: Iterable of build statuses as returned by cidb.
+
+ Returns:
+ A list of the elapsed times (in seconds) of the builds that completed.
+ """
+ times = []
+ for b in builds:
+ if b['finish_time']:
+ td = (b['finish_time'] - b['start_time']).total_seconds()
+ times.append(td)
+ return times
+
+ def Summarize(self, build_type, bad_patch_candidates=False):
"""Process, print, and return a summary of statistics.
As a side effect, save summary to self.summary.
@@ -208,11 +238,11 @@ class CLStatsEngine(object):
A dictionary summarizing the statistics.
"""
if build_type == 'cq':
- return self.SummarizeCQ()
+ return self.SummarizeCQ(bad_patch_candidates=bad_patch_candidates)
else:
return self.SummarizePFQ()
- def SummarizeCQ(self):
+ def SummarizeCQ(self, bad_patch_candidates=False):
"""Process, print, and return a summary of cl action statistics.
As a side effect, save summary to self.summary.
@@ -230,6 +260,8 @@ class CLStatsEngine(object):
else:
logging.info('No runs included.')
+ build_times_sec = sorted(self.GetBuildRunTimes(self.builds))
+
build_reason_counts = {}
for reasons in self.reasons.values():
for reason in reasons:
@@ -237,8 +269,11 @@ class CLStatsEngine(object):
build_reason_counts[reason] = build_reason_counts.get(reason, 0) + 1
unique_blames = set()
+ build_blame_counts = {}
for blames in self.blames.itervalues():
unique_blames.update(blames)
+ for blame in blames:
+ build_blame_counts[blame] = build_blame_counts.get(blame, 0) + 1
unique_cl_blames = {blame for blame in unique_blames if
EXTERNAL_CL_BASE_URL in blame}
@@ -290,6 +325,23 @@ class CLStatsEngine(object):
for x in range(max(rejection_counts) + 1):
good_patch_rejection_breakdown.append((x, rejection_counts.count(x)))
+ # For CQ runs that passed, track which slave was the long pole, i.e. the
+ # last to finish.
+ long_pole_slave_counts = {}
+ for bid, master_build in self.builds_by_build_id.items():
+ if master_build['status'] == constants.BUILDER_STATUS_PASSED:
+ if not self.slave_builds_by_master_id[bid]:
+ continue
+ # TODO(akeshet): The set of slaves also includes non-important slaves
+ # (there is no distinction in cidb between important and non-important).
+ # To protect max(...) from hitting any None values we need the if check
+ # below. Revisit this once we can filter out non-important slaves.
+ _, long_config = max((slave['finish_time'], slave['build_config'])
+ for slave in self.slave_builds_by_master_id[bid]
+ if slave['finish_time'])
+ long_pole_slave_counts[long_config] = (
+ long_pole_slave_counts.get(long_config, 0) + 1)
+
summary = {
'total_cl_actions': len(self.claction_history),
'unique_cls': len(self.claction_history.affected_cls),
@@ -306,6 +358,7 @@ class CLStatsEngine(object):
'patch_handling_time': patch_handle_times,
'bad_cl_candidates': bad_cl_candidates,
'unique_blames_change_count': len(unique_cl_blames),
+ 'long_pole_slave_counts': long_pole_slave_counts,
}
logging.info('CQ committed %s changes', summary['submitted_patches'])
@@ -388,11 +441,19 @@ class CLStatsEngine(object):
numpy.percentile(cq_handle_times, 90) / 3600.0)
logging.info('')
+ # Log some statistics about cq-master run-time.
+ logging.info('CQ-master run time:')
+ logging.info(' 50th percentile: %.2f hours',
+ numpy.percentile(build_times_sec, 50) / 3600.0)
+ logging.info(' 90th percenfile: %.2f hours',
+ numpy.percentile(build_times_sec, 90) / 3600.0)
+
for bot_type, patches in summary['bad_cl_candidates'].items():
logging.info('%d bad patch candidates were rejected by the %s',
len(patches), bot_type)
- for k in patches:
- logging.info('Bad patch candidate in: %s', k)
+ if bad_patch_candidates:
+ for k in patches:
+ logging.info('Bad patch candidate in: %s', k)
fmt_fai = ' %(cnt)d failures in %(reason)s'
fmt_rej = ' %(cnt)d rejections due to %(reason)s'
@@ -406,6 +467,21 @@ class CLStatsEngine(object):
logging.info('Reasons why builds failed:')
self._PrintCounts(build_reason_counts, fmt_fai)
+ logging.info('Bugs or CLs responsible for build failures:')
+ self._PrintCounts(build_blame_counts, fmt_fai)
+
+ total_counts = sum(long_pole_slave_counts.values())
+ logging.info('Slowest CQ slaves out of %s passing runs:', total_counts)
+ for (count, config) in sorted(
+ (v, k) for (k, v) in long_pole_slave_counts.items()):
+ if count < (total_counts / 20.0):
+ continue
+ build_times = self.GetBuildRunTimes(self.slave_builds_by_config[config])
+ logging.info('%s times the slowest slave was %s', count, config)
+ logging.info(' 50th percentile: %.2f hours, 90th percentile: %.2f hours',
+ numpy.percentile(build_times, 50) / 3600.0,
+ numpy.percentile(build_times, 90) / 3600.0)
+
return summary
# TODO(akeshet): some of this logic is copied directly from SummarizeCQ.
@@ -490,6 +566,10 @@ def GetParser():
parser.add_argument('--build-type', choices=['cq', 'chrome-pfq'],
default='cq',
help='Build type to summarize. Default: cq.')
+ parser.add_argument('--bad-patch-candidates', action='store_true',
+ default=False,
+ help='In CQ mode, whether to print bad patch '
+ 'candidates.')
return parser
@@ -527,4 +607,5 @@ def main(argv):
cl_stats_engine = CLStatsEngine(db)
cl_stats_engine.Gather(start_date, end_date, master_config,
starting_build_number=options.starting_build)
- cl_stats_engine.Summarize(options.build_type)
+ cl_stats_engine.Summarize(options.build_type,
+ options.bad_patch_candidates)