diff options
Diffstat (limited to 'scripts/summarize_build_stats.py')
-rw-r--r-- | scripts/summarize_build_stats.py | 93 |
1 files changed, 87 insertions, 6 deletions
diff --git a/scripts/summarize_build_stats.py b/scripts/summarize_build_stats.py index 404a18d3e..a4a965af4 100644 --- a/scripts/summarize_build_stats.py +++ b/scripts/summarize_build_stats.py @@ -7,6 +7,7 @@ from __future__ import print_function import datetime +import itertools import numpy import re import sys @@ -14,6 +15,7 @@ import sys from chromite.cbuildbot import constants from chromite.lib import cidb from chromite.lib import commandline +from chromite.lib import cros_build_lib from chromite.lib import cros_logging as logging @@ -36,6 +38,8 @@ class CLStatsEngine(object): self.blames = {} self.summary = {} self.builds_by_build_id = {} + self.slave_builds_by_master_id = {} + self.slave_builds_by_config = {} def GatherBuildAnnotations(self): """Gather the failure annotations for builds from cidb.""" @@ -158,6 +162,16 @@ class CLStatsEngine(object): self.builds_by_build_id.update( {b['id'] : b for b in self.builds}) + # Gather slave statuses for each of the master builds. For now this is a + # separate query per CQ run, but this could be consolidated to a single + # query if necessary (requires adding a cidb.py API method). + for bid in self.builds_by_build_id: + self.slave_builds_by_master_id[bid] = self.db.GetSlaveStatuses(bid) + + self.slave_builds_by_config = cros_build_lib.GroupByKey( + itertools.chain(*self.slave_builds_by_master_id.values()), + 'build_config') + def _PrintCounts(self, reasons, fmt): """Print a sorted list of reasons in descending order of frequency. @@ -199,7 +213,23 @@ class CLStatsEngine(object): ) return false_rejection_rate - def Summarize(self, build_type): + def GetBuildRunTimes(self, builds): + """Gets the elapsed run times of the completed builds within |builds|. + + Args: + builds: Iterable of build statuses as returned by cidb. + + Returns: + A list of the elapsed times (in seconds) of the builds that completed. + """ + times = [] + for b in builds: + if b['finish_time']: + td = (b['finish_time'] - b['start_time']).total_seconds() + times.append(td) + return times + + def Summarize(self, build_type, bad_patch_candidates=False): """Process, print, and return a summary of statistics. As a side effect, save summary to self.summary. @@ -208,11 +238,11 @@ class CLStatsEngine(object): A dictionary summarizing the statistics. """ if build_type == 'cq': - return self.SummarizeCQ() + return self.SummarizeCQ(bad_patch_candidates=bad_patch_candidates) else: return self.SummarizePFQ() - def SummarizeCQ(self): + def SummarizeCQ(self, bad_patch_candidates=False): """Process, print, and return a summary of cl action statistics. As a side effect, save summary to self.summary. @@ -230,6 +260,8 @@ class CLStatsEngine(object): else: logging.info('No runs included.') + build_times_sec = sorted(self.GetBuildRunTimes(self.builds)) + build_reason_counts = {} for reasons in self.reasons.values(): for reason in reasons: @@ -237,8 +269,11 @@ class CLStatsEngine(object): build_reason_counts[reason] = build_reason_counts.get(reason, 0) + 1 unique_blames = set() + build_blame_counts = {} for blames in self.blames.itervalues(): unique_blames.update(blames) + for blame in blames: + build_blame_counts[blame] = build_blame_counts.get(blame, 0) + 1 unique_cl_blames = {blame for blame in unique_blames if EXTERNAL_CL_BASE_URL in blame} @@ -290,6 +325,23 @@ class CLStatsEngine(object): for x in range(max(rejection_counts) + 1): good_patch_rejection_breakdown.append((x, rejection_counts.count(x))) + # For CQ runs that passed, track which slave was the long pole, i.e. the + # last to finish. + long_pole_slave_counts = {} + for bid, master_build in self.builds_by_build_id.items(): + if master_build['status'] == constants.BUILDER_STATUS_PASSED: + if not self.slave_builds_by_master_id[bid]: + continue + # TODO(akeshet): The set of slaves also includes non-important slaves + # (there is no distinction in cidb between important and non-important). + # To protect max(...) from hitting any None values we need the if check + # below. Revisit this once we can filter out non-important slaves. + _, long_config = max((slave['finish_time'], slave['build_config']) + for slave in self.slave_builds_by_master_id[bid] + if slave['finish_time']) + long_pole_slave_counts[long_config] = ( + long_pole_slave_counts.get(long_config, 0) + 1) + summary = { 'total_cl_actions': len(self.claction_history), 'unique_cls': len(self.claction_history.affected_cls), @@ -306,6 +358,7 @@ class CLStatsEngine(object): 'patch_handling_time': patch_handle_times, 'bad_cl_candidates': bad_cl_candidates, 'unique_blames_change_count': len(unique_cl_blames), + 'long_pole_slave_counts': long_pole_slave_counts, } logging.info('CQ committed %s changes', summary['submitted_patches']) @@ -388,11 +441,19 @@ class CLStatsEngine(object): numpy.percentile(cq_handle_times, 90) / 3600.0) logging.info('') + # Log some statistics about cq-master run-time. + logging.info('CQ-master run time:') + logging.info(' 50th percentile: %.2f hours', + numpy.percentile(build_times_sec, 50) / 3600.0) + logging.info(' 90th percenfile: %.2f hours', + numpy.percentile(build_times_sec, 90) / 3600.0) + for bot_type, patches in summary['bad_cl_candidates'].items(): logging.info('%d bad patch candidates were rejected by the %s', len(patches), bot_type) - for k in patches: - logging.info('Bad patch candidate in: %s', k) + if bad_patch_candidates: + for k in patches: + logging.info('Bad patch candidate in: %s', k) fmt_fai = ' %(cnt)d failures in %(reason)s' fmt_rej = ' %(cnt)d rejections due to %(reason)s' @@ -406,6 +467,21 @@ class CLStatsEngine(object): logging.info('Reasons why builds failed:') self._PrintCounts(build_reason_counts, fmt_fai) + logging.info('Bugs or CLs responsible for build failures:') + self._PrintCounts(build_blame_counts, fmt_fai) + + total_counts = sum(long_pole_slave_counts.values()) + logging.info('Slowest CQ slaves out of %s passing runs:', total_counts) + for (count, config) in sorted( + (v, k) for (k, v) in long_pole_slave_counts.items()): + if count < (total_counts / 20.0): + continue + build_times = self.GetBuildRunTimes(self.slave_builds_by_config[config]) + logging.info('%s times the slowest slave was %s', count, config) + logging.info(' 50th percentile: %.2f hours, 90th percentile: %.2f hours', + numpy.percentile(build_times, 50) / 3600.0, + numpy.percentile(build_times, 90) / 3600.0) + return summary # TODO(akeshet): some of this logic is copied directly from SummarizeCQ. @@ -490,6 +566,10 @@ def GetParser(): parser.add_argument('--build-type', choices=['cq', 'chrome-pfq'], default='cq', help='Build type to summarize. Default: cq.') + parser.add_argument('--bad-patch-candidates', action='store_true', + default=False, + help='In CQ mode, whether to print bad patch ' + 'candidates.') return parser @@ -527,4 +607,5 @@ def main(argv): cl_stats_engine = CLStatsEngine(db) cl_stats_engine.Gather(start_date, end_date, master_config, starting_build_number=options.starting_build) - cl_stats_engine.Summarize(options.build_type) + cl_stats_engine.Summarize(options.build_type, + options.bad_patch_candidates) |