90 files changed, 1251 insertions, 5780 deletions
diff --git a/client/bin/local_host.py b/client/bin/local_host.py
index 7d8db264ca..2f0a91b048 100644
--- a/client/bin/local_host.py
+++ b/client/bin/local_host.py
@@ -3,11 +3,18 @@
 """
 This file contains the implementation of a host object for the local machine.
 """
-
-import distutils.core, glob, os, platform, shutil
+import distutils.core
+import glob
+import os
+import platform
+import shutil
+import sys
+
+import common
 from autotest_lib.client.common_lib import hosts, error
 from autotest_lib.client.bin import utils
 
+
 class LocalHost(hosts.Host):
     """This class represents a host running locally on the host."""
 
@@ -44,23 +51,20 @@ class LocalHost(hosts.Host):
         @see common_lib.hosts.Host.run()
         """
         try:
-            result = utils.run(
-                    command, timeout=timeout, ignore_status=True,
-                    ignore_timeout=ignore_timeout,
-                    stdout_tee=stdout_tee, stderr_tee=stderr_tee, stdin=stdin,
-                    args=args)
-        except error.CmdError, e:
-            # this indicates a timeout exception
-            raise error.AutotestHostRunError('command timed out', e.result_obj)
-
-        if ignore_timeout and result is None:
-            # We have timed out, there is no result to report.
-            return None
-
-        if not ignore_status and result.exit_status > 0:
-            raise error.AutotestHostRunError('command execution error', result)
-
-        return result
+            return utils.run(
+                    command, timeout=timeout, ignore_status=ignore_status,
+                    ignore_timeout=ignore_timeout, stdout_tee=stdout_tee,
+                    stderr_tee=stderr_tee, stdin=stdin, args=args)
+        except error.CmdTimeoutError as e:
+            # CmdTimeoutError is a subclass of CmdError, so must be caught first
+            new_error = error.AutotestHostRunTimeoutError(
+                    e.command, e.result_obj, additional_text=e.additional_text)
+            raise error.AutotestHostRunTimeoutError, new_error, \
+                    sys.exc_info()[2]
+        except error.CmdError as e:
+            new_error = error.AutotestHostRunCmdError(
+                    e.command, e.result_obj, additional_text=e.additional_text)
+            raise error.AutotestHostRunCmdError, new_error, sys.exc_info()[2]
 
     def list_files_glob(self, path_glob):
diff --git a/client/bin/local_host_unittest.py b/client/bin/local_host_unittest.py
index 51ca71d4e3..f5bc5f59d6 100755
--- a/client/bin/local_host_unittest.py
+++ b/client/bin/local_host_unittest.py
@@ -71,24 +71,41 @@ class test_local_host_class(unittest.TestCase):
     @mock.patch('autotest_lib.client.bin.local_host.utils.run')
-    def test_run_failure_raised(self, mock_run):
-        result = local_host.utils.CmdResult(
-                command='yes',
-                stdout='',
-                stderr='err',
-                exit_status=1,
-                duration=1,
+    def test_run_cmd_failure_raised(self, mock_run):
+        mock_result = mock.MagicMock()
+        mock_run.side_effect = error.CmdError('yes', mock_result)
+
+        host = local_host.LocalHost()
+        with self.assertRaises(error.AutotestHostRunCmdError) as exc_cm:
+            host.run('yes', timeout=123)
+
+        self.assertEqual(exc_cm.exception.result_obj, mock_result)
+        mock_run.assert_called_once_with(
+                'yes',
+                timeout=123,
+                ignore_status=False,
+                stdout_tee=local_host.utils.TEE_TO_LOGS,
+                stderr_tee=local_host.utils.TEE_TO_LOGS,
+                stdin=None,
+                ignore_timeout=False,
+                args=(),
         )
-        mock_run.return_value = result
+
+
+    @mock.patch('autotest_lib.client.bin.local_host.utils.run')
+    def test_run_cmd_timeout_raised(self, mock_run):
+        mock_result = mock.MagicMock()
+        mock_run.side_effect = error.CmdTimeoutError('yes', mock_result)
 
         host = local_host.LocalHost()
-        with self.assertRaises(error.AutotestHostRunError):
+        with self.assertRaises(error.AutotestHostRunTimeoutError) as exc_cm:
             host.run('yes', timeout=123)
 
+        self.assertEqual(exc_cm.exception.result_obj, mock_result)
         mock_run.assert_called_once_with(
-                result.command,
+                'yes',
                 timeout=123,
-                ignore_status=True,
+                ignore_status=False,
                 stdout_tee=local_host.utils.TEE_TO_LOGS,
                 stderr_tee=local_host.utils.TEE_TO_LOGS,
                 stdin=None,
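The LocalHost.run() rewrite above stops post-processing the CmdResult and instead forwards ignore_status/ignore_timeout straight to utils.run(), translating only the exceptions that escape it. The three-argument raise is what keeps the original traceback across that translation; a bare raise of the new error would make every failure appear to originate in local_host.py. A minimal, self-contained Python 2 sketch of the same idiom, using stand-in classes rather than the real autotest ones:

import sys

class LowLevelError(Exception):
    """Stand-in for error.CmdError."""

class LowLevelTimeoutError(LowLevelError):
    """Stand-in for error.CmdTimeoutError."""

class HostRunError(Exception):
    """Stand-in for error.AutotestHostRunCmdError."""

class HostRunTimeoutError(HostRunError):
    """Stand-in for error.AutotestHostRunTimeoutError."""

def run_command():
    raise LowLevelTimeoutError('command timed out')

def host_run():
    try:
        return run_command()
    except LowLevelTimeoutError as e:
        # The subclass must be caught before the generic handler below.
        new_error = HostRunTimeoutError(str(e))
        raise HostRunTimeoutError, new_error, sys.exc_info()[2]
    except LowLevelError as e:
        # Python 2 three-argument raise: (class, instance, traceback).
        # The caller sees HostRunError, but the traceback still points at
        # the raise site inside run_command().
        new_error = HostRunError(str(e))
        raise HostRunError, new_error, sys.exc_info()[2]

(Python 3 spells the same thing as raise new_error.with_traceback(sys.exc_info()[2]), or more simply raise ... from e.)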
diff --git a/client/bin/result_tools/utils_lib.py b/client/bin/result_tools/utils_lib.py
index 4322e79460..f9b33a7ea0 100644
--- a/client/bin/result_tools/utils_lib.py
+++ b/client/bin/result_tools/utils_lib.py
@@ -4,6 +4,9 @@
 """Shared constants and methods for result utilities."""
 
+import collections
+
+
 # Following are key names for directory summaries. The keys are started with /
 # so it can be differentiated with a valid file name. The short keys are
 # designed for smaller file size of the directory summary.
@@ -18,4 +21,61 @@ COLLECTED_SIZE_BYTES = '/C'
 DIRS = '/D'
 # Default root directory name. To allow summaries to be merged effectively, all
 # summaries are collected with root directory of ''
-ROOT_DIR = ''
\ No newline at end of file
+ROOT_DIR = ''
+
+# Information about test result sizes to be stored in tko_job_keyvals.
+# The total size (in KB) of test results generated during the test,
+# including:
+#  * server side test logs and result files.
+#  * client side test logs, sysinfo, system logs and crash dumps.
+# Note that a test can collect the same test result files from DUT multiple
+# times during the test, before and after each iteration/test. So the value of
+# client_result_collected_KB could be larger than the value of
+# result_uploaded_KB, which is the size of result directory on the server side,
+# even if the test result throttling is not applied.
+#
+# Attributes of the named tuple include:
+# client_result_collected_KB: The total size (in KB) of test results collected
+#         from test device.
+# original_result_total_KB: The original size (in KB) of test results before
+#         being trimmed.
+# result_uploaded_KB: The total size (in KB) of test results to be uploaded by
+#         gs_offloader.
+# result_throttled: Flag to indicate if test results collection is throttled.
+ResultSizeInfo = collections.namedtuple(
+        'ResultSizeInfo',
+        ['client_result_collected_KB',
+         'original_result_total_KB',
+         'result_uploaded_KB',
+         'result_throttled'])
+
+
+def get_result_size_info(client_collected_bytes, summary):
+    """Get the result size information.
+
+    @param client_collected_bytes: Size in bytes of results collected from the
+            test device.
+    @param summary: A dictionary of directory summary.
+    @return: A namedtuple of result size information, including:
+            client_result_collected_KB: The total size (in KB) of test results
+                    collected from test device.
+            original_result_total_KB: The original size (in KB) of test results
+                    before being trimmed.
+            result_uploaded_KB: The total size (in KB) of test results to be
+                    uploaded.
+            result_throttled: True if test results collection is throttled.
+    """
+    root_entry = summary[ROOT_DIR]
+    client_result_collected_KB = client_collected_bytes / 1024
+    original_result_total_KB = root_entry[ORIGINAL_SIZE_BYTES] / 1024
+    result_uploaded_KB = root_entry[TRIMMED_SIZE_BYTES] / 1024
+    # Test results are considered to be throttled if the total size of
+    # results collected is different from the total size of trimmed results
+    # from the client side.
+    result_throttled = (
+            root_entry[ORIGINAL_SIZE_BYTES] != root_entry[TRIMMED_SIZE_BYTES])
+
+    return ResultSizeInfo(client_result_collected_KB=client_result_collected_KB,
+                          original_result_total_KB=original_result_total_KB,
+                          result_uploaded_KB=result_uploaded_KB,
+                          result_throttled=result_throttled)
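A quick usage sketch for the new helper, assuming the module import path shown in the diff and a hand-built summary dict (real summaries come from the result_tools collectors; only the ROOT_DIR, ORIGINAL_SIZE_BYTES and TRIMMED_SIZE_BYTES entries matter here, and the sizes are fabricated for illustration):

from autotest_lib.client.bin.result_tools import utils_lib

# Fabricated summary: 4 MB of results on disk, trimmed to 1 MB for upload.
summary = {
        utils_lib.ROOT_DIR: {
                utils_lib.ORIGINAL_SIZE_BYTES: 4 * 1024 * 1024,
                utils_lib.TRIMMED_SIZE_BYTES: 1024 * 1024,
        }
}

size_info = utils_lib.get_result_size_info(
        client_collected_bytes=6 * 1024 * 1024, summary=summary)

print size_info.client_result_collected_KB  # 6144
print size_info.original_result_total_KB    # 4096
print size_info.result_uploaded_KB          # 1024
print size_info.result_throttled            # True: original != trimmed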
\ No newline at end of file diff --git a/client/common_lib/cros/arc_util.py b/client/common_lib/cros/arc_util.py index b0f7c22f9f..86fb1c1069 100644 --- a/client/common_lib/cros/arc_util.py +++ b/client/common_lib/cros/arc_util.py @@ -144,7 +144,7 @@ def set_browser_options_for_opt_in(b_options): b_options.gaia_login = True -def enable_play_store(autotest_ext): +def enable_play_store(autotest_ext, enabled): """ Enable ARC++ Play Store @@ -152,13 +152,16 @@ def enable_play_store(autotest_ext): @param autotest_ext: autotest extension object. + @param enabled: if True then perform opt-in, otherwise opt-out. + @returns: True if the opt-in should continue; else False. """ if autotest_ext is None: raise error.TestFail( - 'Could not enable ARC because autotest API does not exist') + 'Could not change the Play Store enabled state because ' + 'autotest API does not exist') # Skip enabling for managed users, since value is policy enforced. # Return early if a managed user has ArcEnabled set to false. @@ -175,18 +178,18 @@ def enable_play_store(autotest_ext): logging.info('Determined that ARC is managed by user policy.') policy_enabled = autotest_ext.EvaluateJavaScript( 'window.__play_store_state.enabled') - if not policy_enabled: + if enabled != policy_enabled: logging.info( - 'Returning early since ARC is policy-enforced off.') + 'Returning early since ARC is policy-enforced.') return False else: autotest_ext.ExecuteJavaScript(''' - chrome.autotestPrivate.setPlayStoreEnabled( - true, function(enabled) {}); - ''') + chrome.autotestPrivate.setPlayStoreEnabled( + %s, function(enabled) {}); + ''' % ('true' if enabled else 'false')) except exceptions.EvaluateException as e: - raise error.TestFail(' Could not enable ARC via autotest API. "%s".' - % e) + raise error.TestFail('Could not change the Play Store enabled state ' + ' via autotest API. "%s".' % e) return True @@ -242,12 +245,7 @@ def opt_in_and_wait_for_completion(extension_main_page): @raises error.TestFail if opt-in doesn't complete after timeout. """ - js_code_click_agree = """ - doc = appWindow.contentWindow.document; - agree_button_element = doc.getElementById('button-agree'); - agree_button_element.click(); - """ - extension_main_page.ExecuteJavaScript(js_code_click_agree) + extension_main_page.ExecuteJavaScript('termsPage.onAgree()') SIGN_IN_TIMEOUT = 120 try: @@ -269,6 +267,9 @@ def opt_in_and_wait_for_completion(extension_main_page): else: raise error.TestFail('Opt-in app did not finish running after %s ' 'seconds!' % SIGN_IN_TIMEOUT) + # Reset termsPage to be able to reuse OptIn page and wait condition for ToS + # are loaded. + extension_main_page.ExecuteJavaScript('termsPage = null') def opt_in(browser, autotest_ext): @@ -277,15 +278,17 @@ def opt_in(browser, autotest_ext): Return early if the arc_setting cannot be set True. - @param browser: chrome.Chrome broswer object. + @param browser: chrome.Chrome browser object. @param autotest_ext: autotest extension object. @raises: error.TestFail if opt in fails. 
""" + logging.info(_OPT_IN_BEGIN) - if not enable_play_store(autotest_ext): + if not enable_play_store(autotest_ext, True): return + extension_main_page = find_opt_in_extension_page(browser) opt_in_and_wait_for_completion(extension_main_page) logging.info(_OPT_IN_FINISH) diff --git a/client/common_lib/cros/chrome.py b/client/common_lib/cros/chrome.py index 27a5f58bc7..708aa635c9 100644 --- a/client/common_lib/cros/chrome.py +++ b/client/common_lib/cros/chrome.py @@ -165,7 +165,7 @@ class Chrome(object): if utils.is_arc_available(): if disable_arc_opt_in: if arc_util.should_start_arc(arc_mode): - arc_util.enable_play_store(self.autotest_ext) + arc_util.enable_play_store(self.autotest_ext, True) else: arc_util.opt_in(self.browser, self.autotest_ext) arc_util.post_processing_after_browser(self) diff --git a/client/common_lib/cros/graphite/__init__.py b/client/common_lib/cros/graphite/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 --- a/client/common_lib/cros/graphite/__init__.py +++ /dev/null diff --git a/client/common_lib/cros/graphite/autotest_es.py b/client/common_lib/cros/graphite/autotest_es.py deleted file mode 100644 index 4f0a0e0ee4..0000000000 --- a/client/common_lib/cros/graphite/autotest_es.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -# This file defines helper functions for putting entries into elasticsearch. - -"""Utils for sending metadata to elasticsearch - -Elasticsearch is a key-value store NOSQL database. -Source is here: https://github.com/elasticsearch/elasticsearch -We will be using es to store our metadata. - -For example, if we wanted to store the following metadata: - -metadata = { - 'host_id': 1 - 'job_id': 20 - 'time_start': 100000 - 'time_recorded': 100006 -} - -The following call will send metadata to the default es server. - autotest_es.post(index, metadata) -We can also specify which port and host to use. - -Using for testing: Sometimes, when we choose a single index -to put entries into, we want to clear that index of all -entries before running our tests. Use clear_index function. -(see es_utils_functionaltest.py for an example) - -This file also contains methods for sending queries to es. Currently, -the query (json dict) we send to es is quite complicated (but flexible). - -For example, the below query returns job_id, host_id, and job_start -for all job_ids in [0, 99999] and host_id matching 10. - -range_eq_query = { - 'fields': ['job_id', 'host_id', 'job_start'], - 'query': { - 'filtered': { - 'query': { - 'match': { - 'host_id': 10, - } - } - 'filter': { - 'range': { - 'job_id': { - 'gte': 0, - 'lte': 99999, - } - } - } - } - } -} - -To send a query once it is created, call execute_query() to send it to the -intended elasticsearch server. The query() function can be used to construct a -query with certain parameters and execute it all in one call. - -""" - -import es_utils - -import common -from autotest_lib.client.common_lib import global_config - - -# Server and ports for elasticsearch (for metadata use only) -METADATA_ES_SERVER = global_config.global_config.get_config_value( - 'CROS', 'ES_HOST', default=None) -ES_PORT = global_config.global_config.get_config_value( - 'CROS', 'ES_PORT', type=int, default=9200) -ES_UDP_PORT = global_config.global_config.get_config_value( - 'CROS', 'ES_UDP_PORT', type=int, default=9700) -# Whether to use http. 
udp is very little overhead (around 3 ms) compared to -# using http (tcp) takes ~ 500 ms for the first connection and 50-100ms for -# subsequent connections. -ES_USE_HTTP = global_config.global_config.get_config_value( - 'CROS', 'ES_USE_HTTP', type=bool, default=False) - -# If CLIENT/metadata_index is not set, INDEX_METADATA falls back to -# autotest instance name (SERVER/hostname). -INDEX_METADATA = global_config.global_config.get_config_value( - 'CLIENT', 'metadata_index', type=str, default=None) -if not INDEX_METADATA: - INDEX_METADATA = global_config.global_config.get_config_value( - 'SERVER', 'hostname', type=str, default='localhost') - -DEFAULT_BULK_POST_RETRIES = 5 - -def post(use_http=ES_USE_HTTP, host=METADATA_ES_SERVER, port=ES_PORT, - timeout=es_utils.DEFAULT_TIMEOUT, index=INDEX_METADATA, - udp_port=ES_UDP_PORT, - *args, **kwargs): - """This function takes a series of arguments which are passed to the - es_utils.ESMetadata constructor, and any other arguments are passed to - its post() function. For an explanation of each, see those functions in - es_utils. - """ - if not host: - return - - esmd = es_utils.ESMetadata(use_http=use_http, host=host, port=port, - timeout=timeout, index=index, udp_port=udp_port) - return esmd.post(*args, **kwargs) - - -def bulk_post(data_list, host=METADATA_ES_SERVER, port=ES_PORT, - timeout=es_utils.DEFAULT_TIMEOUT, index=INDEX_METADATA, - retries=DEFAULT_BULK_POST_RETRIES, *args, **kwargs): - """This function takes a series of arguments which are passed to the - es_utils.ESMetadata constructor, and a list of metadata, then upload to - Elasticsearch server using Elasticsearch bulk API. This can greatly nhance - the performance of uploading data using HTTP. - For an explanation of each argument, see those functions in es_utils. - """ - if not host: - return True - esmd = es_utils.ESMetadata(use_http=True, host=host, port=port, - timeout=timeout, index=index, - udp_port=ES_UDP_PORT) - # bulk post may fail due to the amount of data, retry several times. - for _ in range(retries): - if esmd.bulk_post(data_list, *args, **kwargs): - return True - return False - - -def execute_query(host=METADATA_ES_SERVER, port=ES_PORT, - timeout=es_utils.DEFAULT_TIMEOUT, index=INDEX_METADATA, - *args, **kwargs): - """This function takes a series of arguments which are passed to the - es_utils.ESMetadata constructor, and any other arguments are passed to - its execute_query() function. For an explanation of each, see those - functions in es_utils. - """ - esmd = es_utils.ESMetadata(use_http=True, host=host, port=port, - timeout=timeout, index=index, udp_port=0) - return esmd.execute_query(*args, **kwargs) - - -def query(host=METADATA_ES_SERVER, port=ES_PORT, - timeout=es_utils.DEFAULT_TIMEOUT, - index=INDEX_METADATA, *args, **kwargs): - """This function takes a series of arguments which are passed to the - es_utils.ESMetadata constructor, and any other arguments are passed to - its query() function. For an explanation of each, see those functions in - es_utils. 
- """ - esmd = es_utils.ESMetadata(use_http=True, host=host, port=port, - timeout=timeout, index=index, udp_port=0) - return esmd.query(*args, **kwargs) diff --git a/client/common_lib/cros/graphite/common.py b/client/common_lib/cros/graphite/common.py deleted file mode 100644 index 849be4d0cb..0000000000 --- a/client/common_lib/cros/graphite/common.py +++ /dev/null @@ -1,8 +0,0 @@ -import os, sys -dirname = os.path.dirname(sys.modules[__name__].__file__) -client_dir = os.path.abspath(os.path.join(dirname, "..", "..", "..")) -sys.path.insert(0, client_dir) -import setup_modules -sys.path.pop(0) -setup_modules.setup(base_path=client_dir, - root_module_name="autotest_lib.client") diff --git a/client/common_lib/cros/graphite/elasticsearch_mock.py b/client/common_lib/cros/graphite/elasticsearch_mock.py deleted file mode 100644 index f4ef71ad10..0000000000 --- a/client/common_lib/cros/graphite/elasticsearch_mock.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -import stats_es_mock - - -class Elasticsearch(stats_es_mock.mock_class_base): - """mock class for es_mock""" - pass - - -class ElasticsearchException(Exception): - """Mock class for elcasticsearch.ElasticsearchException""" - pass
\ No newline at end of file diff --git a/client/common_lib/cros/graphite/es_test_utils.py b/client/common_lib/cros/graphite/es_test_utils.py deleted file mode 100644 index 6d1e2b337d..0000000000 --- a/client/common_lib/cros/graphite/es_test_utils.py +++ /dev/null @@ -1,97 +0,0 @@ -# pylint: disable=missing-docstring -# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -"""Helper functions for testing elasticsearch.""" - -import time - -import common - -import elasticsearch - - -# Maps target type to method to trigger sending of metadata. -# This differs based on what each object does. -# For example, in order for timer to send something, its stop -# method must be called. This differs for other stats objects. -TARGET_TO_METHOD = { - 'timer': 'stop', - 'gauge': 'send', - 'raw': 'send', - 'average': 'send', - 'counter': '_send', -} - -# Default maximum number of entries to return from ES query -DEFAULT_NUM_ENTRIES = 100 - -class EsTestUtilException(Exception): - """Exception raised when functions here fail. """ - pass - - -def sequential_random_insert_ints(keys, num_entries, target_type, index, - host, port, use_http, udp_port, - between_insert_secs=0, - print_interval=10): - """Inserts a bunch of random entries into the es database. - Keys are given, values are randomly generated. - - @param keys: A list of keys - @param num_entries: Number of entries to insert - @param target_type: This must be in - ['timer', 'gauge', 'raw', 'average', 'counter'] - @param between_insert_secs: Time to sleep after each insert. - defaults to no sleep time. - @param print_interval: how often to print - defaults to every 10 entries. - @param index: Index of es db to insert to - @param host: host of es db - @param port: port of es db - """ - # We are going to start the value at 0 and increment it by one per val. - for i in range(num_entries): - if print_interval == 0 or i % print_interval == 0: - print(' Inserting entry #%s with keys %s into index "%s."' - % (i, str(keys), index)) - metadata = {} - for value, key in enumerate(keys): - metadata[key] = value - - # Subname and value are not important from metadata pov. - subname = 'metadata.test' - value = 10 - time.sleep(between_insert_secs) - - -def clear_index(index, host, port, timeout, sleep_time=0.5, clear_timeout=5): - """Clears index in es db located at host:port. - - Warning: Will delete all data in es for a given index - - @param index: Index of es db to clear - @param host: elasticsearch host - @param port: elasticsearch port - @param timeout: how long to wait while connecting to es. - @param sleep_time: time between tries of clear_index - defaults to 0.5 seconds - @param clear_timeout: how long to wait for index to be cleared. - defualts to 5 seconds - Will quit and throw error if not cleared. (Number of seconds) - """ - es = elasticsearch.Elasticsearch(host=host, - port=port, - timeout=timeout) - if es.indices.exists(index=index): - print 'deleting index %s' % (index) - es.indices.delete(index=index) - time_start = time.time() - while es.indices.exists(index=index): - print 'waiting until index is deleted...' 
- time.sleep(sleep_time) - if time.time() - time_start > clear_timeout: - raise EsTestUtilException('clear_index failed.') - - print 'successfully deleted index %s' % (index) diff --git a/client/common_lib/cros/graphite/es_utils.py b/client/common_lib/cros/graphite/es_utils.py deleted file mode 100644 index 72fbc12a9c..0000000000 --- a/client/common_lib/cros/graphite/es_utils.py +++ /dev/null @@ -1,504 +0,0 @@ -# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -# This file defines helper functions for putting entries into elasticsearch. - -"""Utils for sending metadata to elasticsearch - -Elasticsearch is a key-value store NOSQL database. -Source is here: https://github.com/elasticsearch/elasticsearch -We will be using es to store our metadata. - -For example, if we wanted to store the following metadata: - -metadata = { - 'host_id': 1 - 'job_id': 20 - 'time_start': 100000 - 'time_recorded': 100006 -} - -The following call will send metadata to the default es server. - es_utils.ESMetadata().post(index, metadata) -We can also specify which port and host to use. - -Using for testing: Sometimes, when we choose a single index -to put entries into, we want to clear that index of all -entries before running our tests. Use clear_index function. -(see es_utils_functionaltest.py for an example) - -This file also contains methods for sending queries to es. Currently, -the query (json dict) we send to es is quite complicated (but flexible). -We've included several methods that composes queries that would be useful. -These methods are all named create_*_query() - -For example, the below query returns job_id, host_id, and job_start -for all job_ids in [0, 99999] and host_id matching 10. - -range_eq_query = { - 'fields': ['job_id', 'host_id', 'job_start'], - 'query': { - 'filtered': { - 'query': { - 'match': { - 'host_id': 10, - } - } - 'filter': { - 'range': { - 'job_id': { - 'gte': 0, - 'lte': 99999, - } - } - } - } - } -} - -To send a query once it is created, call execute_query() to send it to the -intended elasticsearch server. - -""" - -import collections -import json -import logging -import socket -import time - -try: - import elasticsearch - from elasticsearch import helpers as elasticsearch_helpers -except ImportError: - import elasticsearch_mock as elasticsearch - elasticsearch_helpers = elasticsearch.Elasticsearch() - - -# Global timeout for connection to esdb timeout. -DEFAULT_TIMEOUT = 30 - -# Default result size for a query. -DEFAULT_RESULT_SIZE = 10**4 -# Default result size when scrolling query results. -DEFAULT_SCROLL_SIZE = 5*10**4 - -class EsUtilException(Exception): - """Exception raised when functions here fail. """ - pass - - -QueryResult = collections.namedtuple('QueryResult', ['total', 'hits']) - - -class ESMetadata(object): - """Class handling es connection for metadata.""" - - @property - def es(self): - """Read only property, lazily initialized""" - if not self._es: - self._es = elasticsearch.Elasticsearch(host=self.host, - port=self.port, - timeout=self.timeout) - return self._es - - - def __init__(self, use_http, host, port, index, udp_port, - timeout=DEFAULT_TIMEOUT): - """Initialize ESMetadata object. - - @param use_http: Whether to send data to ES using HTTP. - @param host: Elasticsearch host. - @param port: Elasticsearch port. - @param index: What index the metadata is stored in. - @param udp_port: What port to use for UDP data. 
- @param timeout: How long to wait while connecting to es. - """ - self.use_http = use_http - self.host = host - self.port = port - self.index = index - self.udp_port = udp_port - self.timeout = timeout - self._es = None - - - def _send_data_http(self, type_str, metadata): - """Sends data to insert into elasticsearch using HTTP. - - @param type_str: sets the _type field in elasticsearch db. - @param metadata: dictionary object containing metadata - """ - try: - self.es.index(index=self.index, doc_type=type_str, body=metadata) - except elasticsearch.ElasticsearchException as e: - # Mute exceptions from metadata reporting to prevent meta data - # reporting errors from killing test. - logging.error(e) - - - def _send_data_udp(self, type_str, metadata): - """Sends data to insert into elasticsearch using UDP. - - @param type_str: sets the _type field in elasticsearch db. - @param metadata: dictionary object containing metadata - """ - try: - # Header. - message = json.dumps( - {'index': {'_index': self.index, '_type': type_str}}, - separators=(', ', ' : ')) - message += '\n' - # Metadata. - message += json.dumps(metadata, separators=(', ', ' : ')) - message += '\n' - - sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - sock.sendto(message, (self.host, self.udp_port)) - except socket.error as e: - logging.warn(e) - - - def post(self, type_str, metadata, log_time_recorded=True, **kwargs): - """Wraps call of send_data, inserts entry into elasticsearch. - - @param type_str: Sets the _type field in elasticsearch db. - @param metadata: Dictionary object containing metadata - @param log_time_recorded: Whether to automatically record the time - this metadata is recorded. Default is True. - @param kwargs: Additional metadata fields - - @return: True if post action succeeded. Otherwise return False. - - """ - if not self.host: - return True - - if not metadata: - return True - - metadata = metadata.copy() - metadata.update(kwargs) - # metadata should not contain anything with key '_type' - if '_type' in metadata: - type_str = metadata['_type'] - del metadata['_type'] - if log_time_recorded: - metadata['time_recorded'] = time.time() - try: - if self.use_http: - self._send_data_http(type_str, metadata) - else: - self._send_data_udp(type_str, metadata) - return True - except elasticsearch.ElasticsearchException as e: - logging.error(e) - return False - - - def bulk_post(self, data_list, log_time_recorded=True, **kwargs): - """Wraps call of send_data, inserts entry into elasticsearch. - - @param data_list: A list of dictionary objects containing metadata. - @param log_time_recorded: Whether to automatically record the time - this metadata is recorded. Default is True. - @param kwargs: Additional metadata fields - - @return: True if post action succeeded. Otherwise return False. - - """ - if not self.host: - return True - - if not data_list: - return True - - actions = [] - for metadata in data_list: - metadata = metadata.copy() - metadata.update(kwargs) - if log_time_recorded and not 'time_recorded' in metadata: - metadata['time_recorded'] = time.time() - metadata['_index'] = self.index - actions.append(metadata) - - try: - elasticsearch_helpers.bulk(self.es, actions) - return True - except elasticsearch.ElasticsearchException as e: - logging.error(e) - return False - - - def _compose_query(self, equality_constraints=[], fields_returned=None, - range_constraints=[], size=DEFAULT_RESULT_SIZE, - sort_specs=None, regex_constraints=[], - batch_constraints=[]): - """Creates a dict. 
representing multple range and/or equality queries. - - Example input: - _compose_query( - fields_returned = ['time_recorded', 'hostname', - 'status', 'dbg_str'], - equality_constraints = [ - ('_type', 'host_history'), - ('hostname', '172.22.169.106'), - ], - range_constraints = [ - ('time_recorded', 1405628341.904379, 1405700341.904379) - ], - size=20, - sort_specs=[ - 'hostname', - {'time_recorded': 'asc'}, - ] - ) - - Output: - { - 'fields': ['time_recorded', 'hostname', 'status', 'dbg_str'], - 'query': { - 'bool': { - 'minimum_should_match': 3, - 'should': [ - { - 'term': { - '_type': 'host_history' - } - }, - { - 'term': { - 'hostname': '172.22.169.106' - } - }, - { - 'range': { - 'time_recorded': { - 'gte': 1405628341.904379, - 'lte': 1405700341.904379 - } - } - } - ] - }, - }, - 'size': 20 - 'sort': [ - 'hostname', - { 'time_recorded': 'asc'}, - ] - } - - @param equality_constraints: list of tuples of (field, value) pairs - representing what each field should equal to in the query. - e.g. [ ('field1', 1), ('field2', 'value') ] - @param fields_returned: list of fields that we should return when - the query is executed. Set it to None to return all fields. Note - that the key/vals will be stored in _source key of the hit object, - if fields_returned is set to None. - @param range_constraints: list of tuples of (field, low, high) pairs - representing what each field should be between (inclusive). - e.g. [ ('field1', 2, 10), ('field2', -1, 20) ] - If you want one side to be unbounded, you can use None. - e.g. [ ('field1', 2, None) ] means value of field1 >= 2. - @param size: max number of entries to return. Default is 100000. - @param sort_specs: A list of fields to sort on, tiebreakers will be - broken by the next field(s). - @param regex_constraints: A list of regex constraints of tuples of - (field, value) pairs, e.g., [('filed1', '.*value.*')]. - @param batch_constraints: list of tuples of (field, list) pairs - representing each field should be equal to one of the values - in the list. - e.g., [ ('job_id', [10, 11, 12, 13]) ] - @returns: dictionary object that represents query to es. - This will return None if there are no equality constraints - and no range constraints. - """ - if not equality_constraints and not range_constraints: - raise EsUtilException('No range or equality constraints specified.') - - # Creates list of range dictionaries to put in the 'should' list. - range_list = [] - if range_constraints: - for key, low, high in range_constraints: - if low is None and high is None: - continue - temp_dict = {} - if low is not None: - temp_dict['gte'] = low - if high is not None: - temp_dict['lte'] = high - range_list.append( {'range': {key: temp_dict}}) - - # Creates the list of term dictionaries to put in the 'should' list. - eq_list = [{'term': {k: v}} for k, v in equality_constraints if k] - batch_list = [{'terms': {k: v}} for k, v in batch_constraints if k] - regex_list = [{'regexp': {k: v}} for k, v in regex_constraints if k] - constraints = eq_list + batch_list + range_list + regex_list - query = { - 'query': { - 'bool': { - 'must': constraints, - } - }, - } - if fields_returned: - query['fields'] = fields_returned - query['size'] = size - if sort_specs: - query['sort'] = sort_specs - return query - - - def execute_query(self, query): - """Makes a query on the given index. - - @param query: query dictionary (see _compose_query) - @returns: A QueryResult instance describing the result. 
- - Example output: - { - "took" : 5, - "timed_out" : false, - "_shards" : { - "total" : 16, - "successful" : 16, - "failed" : 0 - }, - "hits" : { - "total" : 4, - "max_score" : 1.0, - "hits" : [ { - "_index" : "graphite_metrics2", - "_type" : "metric", - "_id" : "rtntrjgdsafdsfdsfdsfdsfdssssssss", - "_score" : 1.0, - "_source":{"target_type": "timer", - "host_id": 1, - "job_id": 22, - "time_start": 400} - }, { - "_index" : "graphite_metrics2", - "_type" : "metric", - "_id" : "dfgfddddddddddddddddddddddhhh", - "_score" : 1.0, - "_source":{"target_type": "timer", - "host_id": 2, - "job_id": 23, - "time_start": 405} - }, { - "_index" : "graphite_metrics2", - "_type" : "metric", - "_id" : "erwerwerwewtrewgfednvfngfngfrhfd", - "_score" : 1.0, - "_source":{"target_type": "timer", - "host_id": 3, - "job_id": 24, - "time_start": 4098} - }, { - "_index" : "graphite_metrics2", - "_type" : "metric", - "_id" : "dfherjgwetfrsupbretowegoegheorgsa", - "_score" : 1.0, - "_source":{"target_type": "timer", - "host_id": 22, - "job_id": 25, - "time_start": 4200} - } ] - } - } - - """ - if not self.es.indices.exists(index=self.index): - logging.error('Index (%s) does not exist on %s:%s', - self.index, self.host, self.port) - return None - result = self.es.search(index=self.index, body=query) - # Check if all matched records are returned. It could be the size is - # set too small. Special case for size set to 1, as that means that - # the query cares about the first matched entry. - # TODO: Use pagination in Elasticsearch. This needs major change on how - # query results are iterated. - size = query.get('size', 1) - need_scroll = 'size' in query and size == DEFAULT_RESULT_SIZE - return_count = len(result['hits']['hits']) - total_match = result['hits']['total'] - if total_match > return_count and need_scroll: - logging.warn('There are %d matched records, only %d entries are ' - 'returned. Query size is set to %d. Will try to use ' - 'scroll command to get all entries.', total_match, - return_count, size) - # Try to get all results with scroll. - hits = self._get_results_by_scan(query, total_match) - else: - hits = result['hits']['hits'] - # Extract the actual results from the query. - output = QueryResult(total_match, []) - for hit in hits: - converted = {} - if 'fields' in hit: - for key, value in hit['fields'].items(): - converted[key] = value[0] if len(value)==1 else value - else: - converted = hit['_source'].copy() - output.hits.append(converted) - return output - - - def _get_results_by_scan(self, query, total_match=None): - """Get all results by using scan. - - @param query: query dictionary (see _compose_query) - @param total_match: The number of total matched results. Pass the value - in so the code doesn't need to run another query to get it. - - @returns: A list of matched results. - """ - if True or not total_match: - # Reduce the return size to make the query run faster. - query['size'] = 1 - result = self.es.search(index=self.index, body=query) - total_match = result['hits']['total'] - # Remove the sort from query so scroll method can run faster. 
- sort = None - if 'sort' in query: - sort = query['sort'] - if len(sort) > 1: - raise EsUtilException('_get_results_by_scan does not support ' - 'sort with more than one key: %s', sort) - del query['sort'] - del query['size'] - scroll = elasticsearch_helpers.scan(self.es, query=query, - index=self.index, - size=DEFAULT_SCROLL_SIZE) - hits = [] - next_mark = 0 - for hit in scroll: - hits.append(hit) - downloaded_percent = 100 * float(len(hits))/total_match - if downloaded_percent > next_mark: - logging.debug('%2.0f%% downloaded (%d)', downloaded_percent, - len(hits)) - next_mark += 5 - logging.debug('Number of hits found: %s', len(hits)) - - if sort: - logging.debug('Sort hits with rule: %s', sort) - sort_key = sort[0].keys()[0] - is_desc = sort[0].values()[0] == 'desc' - # If the query has `fields` specified, the dict of hit stores value - # in hit['fields'], otherwise, the keyvals are stored in - # hit['_source']. - key = lambda hit:(hit['_source'][sort_key] if '_source' in hit else - hit['fields'][sort_key][0]) - hits = sorted(hits, key=key, reverse=is_desc) - - return hits - - - def query(self, *args, **kwargs): - """The arguments to this function are the same as _compose_query.""" - query = self._compose_query(*args, **kwargs) - return self.execute_query(query) diff --git a/client/common_lib/cros/graphite/stats.py b/client/common_lib/cros/graphite/stats.py deleted file mode 100644 index 062b706831..0000000000 --- a/client/common_lib/cros/graphite/stats.py +++ /dev/null @@ -1,54 +0,0 @@ -# pylint: disable=missing-docstring -# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -# This is _type for all metadata logged to elasticsearch from here. -STATS_ES_TYPE = 'stats_metadata' - - -def _prepend_init(_es, _conn, _prefix): - def wrapper(original): - """Decorator to override __init__.""" - - class _Derived(original): - def __init__(self, name, connection=None, bare=False, - metadata=None): - name = self._add_prefix(name, _prefix, bare) - conn = connection if connection else _conn - super(_Derived, self).__init__(name, conn) - self.metadata = metadata - self.es = _es - - def _add_prefix(self, name, prefix, bare=False): - """ - Since many people run their own local AFE, stats from a local - setup shouldn't get mixed into stats from prod. Therefore, - this function exists to add a prefix, nominally the name of - the local server, if |name| doesn't already start with the - server name, so that each person has their own "folder" of - stats that they can look at. - - However, this functionality might not always be wanted, so we - allow one to pass in |bare=True| to force us to not prepend - the local server name. (I'm not sure when one would use this, - but I don't see why I should disallow it...) - - >>> prefix = 'potato_nyc' - >>> _add_prefix('rpc.create_job', bare=False) - 'potato_nyc.rpc.create_job' - >>> _add_prefix('rpc.create_job', bare=True) - 'rpc.create_job' - - @param name The name to append to the server name if it - doesn't start with the server name. - @param bare If True, |name| will be returned un-altered. - @return A string to use as the stat name. 
- - """ - if not bare and not name.startswith(prefix): - name = '%s.%s' % (prefix, name) - return name - - return _Derived - return wrapper diff --git a/client/common_lib/cros/graphite/stats_es_functionaltest.py b/client/common_lib/cros/graphite/stats_es_functionaltest.py deleted file mode 100644 index e32ab6ff35..0000000000 --- a/client/common_lib/cros/graphite/stats_es_functionaltest.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -"""This is a script that inserts a bunch of entries into - elasticdb by reporting stats with metadata in the stats module. - -Usage: - # runs tests on all stats objects on prod instance of es - python stats_es_functionaltest.py --all --es_host=prod - # runs tests on all stats objects on test instance of es (localhost) - python stats_es_functionaltest.py --all --es_host=test - - python stats_es_functionaltest.py --test=timer # runs tests on timer obj. -""" - -import optparse -import time - -import common -from autotest_lib.client.common_lib.cros.graphite import autotest_es -from autotest_lib.client.common_lib.cros.graphite import es_test_utils - - -TESTS_ALL = ['timer', 'gauge', 'raw', 'average', 'counter'] - - -class StatsFunctionalTest(object): - """Test stats module with metadata""" - - def __init__(self, es_host, es_port, index): - self.host = es_host - self.port = es_port - self.index = index - self.wait_time = 6 # Bulk flush is 5 seconds - if autotest_es.ES_USE_HTTP: - # No flush time for http requests. - self.wait_time = 2 - - def run_tests(self, tests=TESTS_ALL, - num_entries=10, - keys=['job_id', 'host_id', 'job_start']): - """Runs test listed in the param tests. - - @param tests: list of tests to run - @param num_entries: number of metadata entries to insert - @param keys: keys each metadata dictionary will have - - """ - for test_type in tests: - if test_type not in TESTS_ALL: - print 'Skipping test %s, it is not supported. ' % (test_type) - es_test_utils.clear_index(index=self.index, - host=self.host, - port=self.port, - timeout=10) - print 'running %s test.' % (test_type) - self._run_one_test_metadata(test_type, num_entries, keys) - - - def _run_one_test_metadata(self, test_type, num_entries, keys): - """Puts many entries into elasticdb, then query it. """ - - print ('preparing to insert %s entries with keys %s into elasticdb...' - % (num_entries, keys)) - es_test_utils.sequential_random_insert_ints( - keys=keys, - target_type=test_type, - index=self.index, - host=self.host, - port=self.port, - use_http = autotest_es.ES_USE_HTTP, - udp_port = autotest_es.ES_UDP_PORT, - num_entries=num_entries, - print_interval=num_entries/5) - # Wait a bit for es to be populated with the metadata entry. - # I set this to 6 seconds because bulk.udp.flush_interval (es server) - # is configured to be 5 seconds. - print 'waiting %s seconds...' % (self.wait_time) - time.sleep(self.wait_time) - result = autotest_es.query(host=self.host, port=self.port, - index=self.index, fields_returned=keys, - range_constraints=[('host_id', 0, None)]) - if not result: - print ('%s test error: Index %s not found.' - %(test_type, self.index)) - return - - # TODO(michaelliang): Check hits and total are valid keys at each layer. - num_entries_found = result.total - print(' Inserted %s entries, found %s entries.' - %(num_entries, num_entries_found)) - if num_entries_found != num_entries: - print '\n\n%s test failed! 
\n\n' % (test_type) - else: - print '\n\n%s test passed! \n\n' % (test_type) - - -def main(): - """main script. """ - - parser = optparse.OptionParser() - parser.add_option('--all', action='store_true', dest='run_all', - default=False, - help='set --all flag to run all tests.') - parser.add_option('--test', type=str, - help=('Enter subset of [\'timer\', \'gauge\', \'raw\',' - '\'average\', \'counter\']'), - dest='test_to_run', - default=None) - parser.add_option('--es_host', type=str, - help=('Enter "prod" or "test" or an ip'), - dest='es_host', - default='localhost') - parser.add_option('--es_port', type=int, - help=('Enter port of es instance, usually 9200'), - dest='es_port', - default=9200) - options, _ = parser.parse_args() - - - if not options.run_all and not options.test_to_run: - print ('No tests specified.' - 'For help: python stats_es_functionaltest.py -h') - if options.es_host == 'prod': - es_host = autotest_es.METADATA_ES_SERVER - es_port = autotest_es.ES_PORT - elif options.es_host == 'test': - es_host = 'http://localhost' - es_port = autotest_es.ES_PORT - else: - es_host = options.es_host - es_port = options.es_port - test_obj = StatsFunctionalTest(es_host, - es_port, - 'stats_es_functionaltest') - if options.run_all: - test_obj.run_tests() - elif options.test_to_run: - test_obj.run_tests([options.test_to_run]) - - -if __name__ == '__main__': - main() diff --git a/client/common_lib/error.py b/client/common_lib/error.py index 4268870836..f9babced40 100644 --- a/client/common_lib/error.py +++ b/client/common_lib/error.py @@ -210,6 +210,30 @@ class AutotestHostRunError(GenericHostRunError, AutotestError): pass +class AutotestHostRunCmdError(AutotestHostRunError): + """Indicates that the command run via Host.run failed. + + This is equivalent to CmdError when raised from a Host object instead of + directly on the DUT using utils.run + """ + + def __init__(self, command, result_obj, additional_text=''): + description = command + if additional_text: + description += ' (%s)' % additional_text + super(AutotestHostRunCmdError, self).__init__(description, result_obj) + self.command = command + self.additional_text = additional_text + + +class AutotestHostRunTimeoutError(AutotestHostRunCmdError): + """Indicates that a command run via Host.run timed out. 
+ + This is equivalent to CmdTimeoutError when raised from a Host object instead + of directly on the DUT using utils.run + """ + + # server-specific errors class AutoservError(Exception): diff --git a/client/common_lib/cros/graphite/stats_es_mock.py b/client/common_lib/metrics_mock_class.py index d1cedce259..d1cedce259 100644 --- a/client/common_lib/cros/graphite/stats_es_mock.py +++ b/client/common_lib/metrics_mock_class.py diff --git a/client/common_lib/utils.py b/client/common_lib/utils.py index 0ce9334530..a5da13156e 100644 --- a/client/common_lib/utils.py +++ b/client/common_lib/utils.py @@ -47,7 +47,7 @@ from autotest_lib.client.common_lib import env from autotest_lib.client.common_lib import error from autotest_lib.client.common_lib import global_config from autotest_lib.client.common_lib import logging_manager -from autotest_lib.client.common_lib.cros.graphite import stats_es_mock +from autotest_lib.client.common_lib import metrics_mock_class from autotest_lib.client.cros import constants from autotest_lib.client.common_lib.lsbrelease_utils import * @@ -2760,6 +2760,6 @@ def poll_for_condition(condition, time.sleep(sleep_interval) -class metrics_mock(stats_es_mock.mock_class_base): +class metrics_mock(metrics_mock_class.mock_class_base): """mock class for metrics in case chromite is not installed.""" pass diff --git a/client/cros/crash/crash_test.py b/client/cros/crash/crash_test.py index 02fbb9874c..cc24e57682 100644 --- a/client/cros/crash/crash_test.py +++ b/client/cros/crash/crash_test.py @@ -4,6 +4,7 @@ import contextlib import fcntl +import glob import logging import os import re @@ -85,6 +86,7 @@ class CrashTest(test.test): _SYSTEM_CRASH_DIR = '/var/spool/crash' _FALLBACK_USER_CRASH_DIR = '/home/chronos/crash' _USER_CRASH_DIRS = '/home/chronos/u-*/crash' + _USER_CRASH_DIR_REGEX = re.compile('/home/chronos/u-([a-f0-9]+)/crash') # Use the same file format as crash does normally: # <basename>.#.#.#.meta @@ -302,21 +304,29 @@ class CrashTest(test.test): utils.system('rm -f "%s"' % constants.OWNER_KEY_FILE) - def _get_crash_dir(self, username): - """Returns full path to the crash directory for a given username + def _get_crash_dir(self, username, force_user_crash_dir=False): + """Returns crash directory for process running as the given user. - This only really works (currently) when no one is logged in. That - is OK (currently) as the only test that uses this runs when no one - is actually logged in. - - @param username: username to use: - 'chronos': Returns user crash directory. - 'root': Returns system crash directory. + @param username: Unix user of the crashing process. + @param force_user_crash_dir: Regardless of |username|, return the crash + directory of the current user session, or + the fallback directory if no sessions. """ - if username == 'chronos': - return self._FALLBACK_USER_CRASH_DIR - else: + if username == 'root' and not force_user_crash_dir: return self._SYSTEM_CRASH_DIR + else: + dirs = glob.glob(self._USER_CRASH_DIRS) + return dirs[0] if dirs else self._FALLBACK_USER_CRASH_DIR + + + def _canonicalize_crash_dir(self, crash_dir): + """Converts /home/chronos crash directory to /home/user counterpart. + + @param crash_dir: A path of the form /home/chronos/u-<hash>/crash. + @returns /home/user/<hash>/crash, or |crash_dir| on form mismatch. 
+ """ + match = re.match(self._USER_CRASH_DIR_REGEX, crash_dir) + return ('/home/user/%s/crash' % match.group(1)) if match else crash_dir def _initialize_crash_reporter(self): diff --git a/client/cros/crash/crasher/crasher.cc b/client/cros/crash/crasher/crasher.cc index bcfd855ccb..f77061e090 100644 --- a/client/cros/crash/crasher/crasher.cc +++ b/client/cros/crash/crasher/crasher.cc @@ -2,12 +2,15 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> #include <unistd.h> +#include <cerrno> +#include <cstdlib> +#include <cstring> +#include <iostream> + int recbomb(int n); void PrepareBelow(int argc, char *argv[]); extern int DefeatTailOptimizationForCrasher(); @@ -20,16 +23,56 @@ int main(int argc, char *argv[]) { return recbomb(16) + DefeatTailOptimizationForCrasher(); } +bool SendPid(const char *socket_path); + +using std::cerr; + // Prepare for doing the crash, but do it below main so that main's // line numbers remain stable. void PrepareBelow(int argc, char *argv[]) { - fprintf(stderr, "pid=%lld\n", static_cast<long long>(getpid())); - fflush(stderr); + cerr << "pid=" << getpid() << '\n'; if (argc == 2 && strcmp(argv[1], "--nocrash") == 0) { - fprintf(stderr, "Doing normal exit\n"); - // Just exit with an error code if requested, to test that - // CrashDumper cleanup does not cause troubles. + cerr << "Doing normal exit\n"; exit(0); } - fprintf(stderr, "Crashing as requested.\n"); + if (argc == 3 && strcmp(argv[1], "--sendpid") == 0) { + if (!SendPid(argv[2])) + exit(0); + } + cerr << "Crashing as requested.\n"; +} + +// Used when the crasher runs in a different PID namespace than the test. A PID +// sent over a Unix domain socket to a process in a different PID namespace is +// converted to that PID namespace. +bool SendPid(const char *socket_path) { + struct Socket { + Socket(): fd(socket(AF_UNIX, SOCK_DGRAM, 0)) {} + ~Socket() { if (fd != -1) close(fd); } + int fd; + } sock; + + if (sock.fd == -1) { + cerr << "socket() failed: " << strerror(errno) << '\n'; + return false; + } + + sockaddr_un address = { AF_UNIX }; + strncpy(address.sun_path, socket_path, sizeof(address.sun_path) - 1); + sockaddr *address_ptr = reinterpret_cast<sockaddr *>(&address); + if (connect(sock.fd, address_ptr, sizeof(address)) == -1) { + cerr << "connect() failed: " << strerror(errno) << '\n'; + return false; + } + + char zero = '\0'; + iovec data = { &zero, 1 }; + msghdr msg = { NULL, 0, &data, 1 }; + + if (sendmsg(sock.fd, &msg, 0) == -1) { + cerr << "sendmsg() failed: " << strerror(errno) << '\n'; + return false; + } + + return true; } diff --git a/client/cros/crash/user_crash_test.py b/client/cros/crash/user_crash_test.py index 9687b7b30c..33e72ac078 100644 --- a/client/cros/crash/user_crash_test.py +++ b/client/cros/crash/user_crash_test.py @@ -17,6 +17,9 @@ from autotest_lib.client.bin import utils from autotest_lib.client.common_lib import error +CRASHER = 'crasher_nobreakpad' + + class UserCrashTest(crash_test.CrashTest): """ Base class for tests that verify crash reporting for user processes. 
Shared @@ -26,21 +29,44 @@ class UserCrashTest(crash_test.CrashTest): def setup(self): - crasher_dir = os.path.join(os.path.dirname(__file__), 'crasher') - shutil.copytree(crasher_dir, self.srcdir) + """Copy the crasher source code under |srcdir| and build it.""" + src = os.path.join(os.path.dirname(__file__), 'crasher') + dest = os.path.join(self.srcdir, 'crasher') + shutil.copytree(src, dest) - os.chdir(self.srcdir) + os.chdir(dest) utils.make() - def _prepare_crasher(self): + def initialize(self, expected_tag='user', expected_version=None, + force_user_crash_dir=False): + """Initialize and configure the test. + + @param expected_tag: Expected tag in crash_reporter log message. + @param expected_version: Expected version included in the crash report, + or None to use the Chrome OS version. + @param force_user_crash_dir: Always look for crash reports in the crash + directory of the current user session, or + the fallback directory if no sessions. + """ + crash_test.CrashTest.initialize(self) + self._expected_tag = expected_tag + self._expected_version = expected_version + self._force_user_crash_dir = force_user_crash_dir + + + def _prepare_crasher(self, root_path='/'): """Extract the crasher and set its permissions. crasher is only gzipped to subvert Portage stripping. + + @param root_path: Root directory of the chroot environment in which the + crasher is installed and run. """ - self._crasher_path = os.path.join(self.srcdir, 'crasher_nobreakpad') + self._root_path = root_path + self._crasher_path = os.path.join(self.srcdir, 'crasher', CRASHER) utils.system('cd %s; tar xzf crasher.tgz-unmasked' % - self.srcdir) + os.path.dirname(self._crasher_path)) # Make sure all users (specifically chronos) have access to # this directory and its decendents in order to run crasher # executable as different users. @@ -54,7 +80,8 @@ class UserCrashTest(crash_test.CrashTest): hierarchy: <symbol-root>/<module_name>/<file_id>/<module_name>.sym """ - self._symbol_dir = os.path.join(self.srcdir, 'symbols') + self._symbol_dir = os.path.join(os.path.dirname(self._crasher_path), + 'symbols') utils.system('rm -rf %s' % self._symbol_dir) os.mkdir(self._symbol_dir) @@ -141,12 +168,14 @@ class UserCrashTest(crash_test.CrashTest): # Should identify main line if not self._is_frame_in_stack(16, basename, 'main', - 'crasher.cc', 20, stack): + 'crasher.cc', 23, stack): raise error.TestFail('Did not show main on stack') def _run_crasher_process(self, username, cause_crash=True, consent=True, - crasher_path=None): + crasher_path=None, run_crasher=None, + expected_uid=None, expected_exit_code=None, + expected_reason=None): """Runs the crasher process. Will wait up to 10 seconds for crash_reporter to report the crash. @@ -154,7 +183,26 @@ class UserCrashTest(crash_test.CrashTest): notification message..." appears. While associated logs are likely to be available at this point, the function does not guarantee this. - Returns: + @param username: Unix user of the crasher process. + @param cause_crash: Whether the crasher should crash. + @param consent: Whether the user consents to crash reporting. + @param crasher_path: Path to which the crasher should be copied before + execution. Relative to |_root_path|. + @param run_crasher: A closure to override the default |crasher_command| + invocation. It should return a tuple describing the + process, where |pid| can be None if it should be + parsed from the |output|: + + def run_crasher(username, crasher_command): + ... 
+ return (exit_code, output, pid) + + @param expected_uid: + @param expected_exit_code: + @param expected_reason: + Expected information in crash_reporter log message. + + @returns: A dictionary with keys: returncode: return code of the crasher crashed: did the crasher return segv error code @@ -164,44 +212,63 @@ class UserCrashTest(crash_test.CrashTest): if crasher_path is None: crasher_path = self._crasher_path else: - utils.system('cp -a "%s" "%s"' % (self._crasher_path, crasher_path)) + dest = os.path.join(self._root_path, + crasher_path[os.path.isabs(crasher_path):]) + + utils.system('cp -a "%s" "%s"' % (self._crasher_path, dest)) self.enable_crash_filtering(os.path.basename(crasher_path)) - if username != 'root': - crasher_command = ['su', username, '-c'] - expected_result = 128 + signal.SIGSEGV + crasher_command = [] + + if username == 'root': + if expected_exit_code is None: + expected_exit_code = -signal.SIGSEGV else: - crasher_command = [] - expected_result = -signal.SIGSEGV + if expected_exit_code is None: + expected_exit_code = 128 + signal.SIGSEGV + + if not run_crasher: + crasher_command.extend(['su', username, '-c']) crasher_command.append(crasher_path) basename = os.path.basename(crasher_path) if not cause_crash: crasher_command.append('--nocrash') self._set_consent(consent) - crasher = subprocess.Popen(crasher_command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - output = crasher.communicate()[1] - logging.debug('Output from %s: %s', crasher_command, output) - - # Grab the pid from the process output. We can't just use - # crasher.pid unfortunately because that may be the PID of su. - match = re.search(r'pid=(\d+)', output) - if not match: - raise error.TestFail('Could not find pid output from crasher: %s' % - output) - pid = int(match.group(1)) - - expected_uid = pwd.getpwnam(username)[2] - if consent: - handled_string = 'handling' + + logging.debug('Running crasher: %s', crasher_command) + + if run_crasher: + (exit_code, output, pid) = run_crasher(username, crasher_command) + else: - handled_string = 'ignoring - no consent' + crasher = subprocess.Popen(crasher_command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + output = crasher.communicate()[1] + exit_code = crasher.returncode + pid = None + + logging.debug('Crasher output:\n%s', output) + + if pid is None: + # Get the PID from the output, since |crasher.pid| may be su's PID. + match = re.search(r'pid=(\d+)', output) + if not match: + raise error.TestFail('Missing PID in crasher output') + pid = int(match.group(1)) + + if expected_uid is None: + expected_uid = pwd.getpwnam(username)[2] + + if expected_reason is None: + expected_reason = 'handling' if consent else 'ignoring - no consent' + expected_message = ( - 'Received crash notification for %s[%d] sig 11, user %d (%s)' % - (basename, pid, expected_uid, handled_string)) + '[%s] Received crash notification for %s[%d] sig 11, user %d (%s)' % + (self._expected_tag, basename, pid, expected_uid, expected_reason)) # Wait until no crash_reporter is running. 
utils.poll_for_condition( @@ -222,10 +289,10 @@ class UserCrashTest(crash_test.CrashTest): except utils.TimeoutError: pass - result = {'crashed': crasher.returncode == expected_result, + result = {'crashed': exit_code == expected_exit_code, 'crash_reporter_caught': is_caught, 'output': output, - 'returncode': crasher.returncode} + 'returncode': exit_code} logging.debug('Crasher process result: %s', result) return result @@ -287,33 +354,41 @@ class UserCrashTest(crash_test.CrashTest): raise error.TestFail('Report signature mismatch: %s vs %s' % (result['sig'], expected_sig)) - # Check version matches. - lsb_release = utils.read_file('/etc/lsb-release') - version_match = re.search(r'CHROMEOS_RELEASE_VERSION=(.*)', lsb_release) - if not ('Version: %s' % version_match.group(1)) in result['output']: - raise error.TestFail('Did not find version %s in log output' % - version_match.group(1)) + version = self._expected_version + if version is None: + lsb_release = utils.read_file('/etc/lsb-release') + version = re.search( + r'CHROMEOS_RELEASE_VERSION=(.*)', lsb_release).group(1) + + if not ('Version: %s' % version) in result['output']: + raise error.TestFail('Missing version %s in log output' % version) def _run_crasher_process_and_analyze(self, username, cause_crash=True, consent=True, - crasher_path=None): + crasher_path=None, run_crasher=None, + expected_uid=None, + expected_exit_code=None): self._log_reader.set_start_by_current() - result = self._run_crasher_process(username, cause_crash=cause_crash, - consent=consent, - crasher_path=crasher_path) + result = self._run_crasher_process( + username, cause_crash=cause_crash, consent=consent, + crasher_path=crasher_path, run_crasher=run_crasher, + expected_uid=expected_uid, expected_exit_code=expected_exit_code) if not result['crashed'] or not result['crash_reporter_caught']: return result - crash_dir = self._get_crash_dir(username) + crash_dir = self._get_crash_dir(username, self._force_user_crash_dir) if not consent: if os.path.exists(crash_dir): raise error.TestFail('Crash directory should not exist') return result + if not os.path.exists(crash_dir): + raise error.TestFail('Crash directory does not exist') + crash_contents = os.listdir(crash_dir) basename = os.path.basename(crasher_path or self._crasher_path) @@ -336,7 +411,8 @@ class UserCrashTest(crash_test.CrashTest): if not crash_reporter_minidump is None: raise error.TestFail('Crash reporter wrote multiple ' 'minidumps') - crash_reporter_minidump = os.path.join(crash_dir, filename) + crash_reporter_minidump = os.path.join( + self._canonicalize_crash_dir(crash_dir), filename) elif (filename.startswith(basename) and filename.endswith('.meta')): if not crash_reporter_meta is None: @@ -379,9 +455,15 @@ class UserCrashTest(crash_test.CrashTest): raise error.TestFail('crash_reporter did not catch crash') - def _check_crashing_process(self, username, consent=True): - result = self._run_crasher_process_and_analyze(username, - consent=consent) + def _check_crashing_process(self, username, consent=True, + crasher_path=None, run_crasher=None, + expected_uid=None, expected_exit_code=None): + result = self._run_crasher_process_and_analyze( + username, consent=consent, + crasher_path=crasher_path, + run_crasher=run_crasher, + expected_uid=expected_uid, + expected_exit_code=expected_exit_code) self._check_crashed_and_caught(result) diff --git a/client/site_tests/graphics_WebGLAquarium/control b/client/site_tests/graphics_WebGLAquarium/control index 7ce783e8aa..85cba517b6 100644 --- 
a/client/site_tests/graphics_WebGLAquarium/control +++ b/client/site_tests/graphics_WebGLAquarium/control @@ -6,7 +6,7 @@ AUTHOR = 'chromeos-gfx' NAME = "graphics_WebGLAquarium" PURPOSE = "Execute the WebGL aquarium test suite." CRITERIA = "All suite tests must not crash/hang." -ATTRIBUTES = "suite:crosbolt_perf_perbuild, suite:graphics, suite:graphics_browser, suite:graphics_per-day, suite:partners" +ATTRIBUTES = "suite:crosbolt_perf_perbuild, suite:bvt-perbuild, suite:graphics, suite:graphics_browser, suite:graphics_per-day, suite:partners" TIME = "medium" TEST_CATEGORY = "Performance" TEST_CLASS = "graphics" diff --git a/client/site_tests/platform_WarningCollector/control b/client/site_tests/platform_AnomalyCollector/control index 147ec731bd..72084849f0 100644 --- a/client/site_tests/platform_WarningCollector/control +++ b/client/site_tests/platform_AnomalyCollector/control @@ -2,9 +2,9 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. -NAME = "platform_WarningCollector" +NAME = "platform_AnomalyCollector" AUTHOR = "chromeos-kernel@google.com" -PURPOSE = "Checks that the warning collector is collecting warnings" +PURPOSE = "Checks that the anomaly collector is collecting log anomalies" CRITERIA = "The collector must create a file when a test warning is produced" TIME = "SHORT" TEST_CATEGORY = "Functional" @@ -12,9 +12,9 @@ TEST_CLASS = "Platform" TEST_TYPE = "Client" DOC = """ -This checks that the warning collector is operational by generating -a test warning and verifying that the collector daemon collects it. +This checks that the anomaly collector is operational by generating +a test kernel warning and verifying that the collector daemon collects it. More functionality is tested by the compile-time test. """ -job.run_test('platform_WarningCollector') +job.run_test('platform_AnomalyCollector') diff --git a/client/site_tests/platform_WarningCollector/platform_WarningCollector.py b/client/site_tests/platform_AnomalyCollector/platform_AnomalyCollector.py index bfd0091c2f..e430b0c0b3 100644 --- a/client/site_tests/platform_WarningCollector/platform_WarningCollector.py +++ b/client/site_tests/platform_AnomalyCollector/platform_AnomalyCollector.py @@ -5,18 +5,18 @@ import os from autotest_lib.client.bin import test, utils -class platform_WarningCollector(test.test): - "Tests the warning collector daemon" +class platform_AnomalyCollector(test.test): + "Tests the anomaly collector daemon" version = 1 def run_once(self): "Runs the test once" - # Restart the warning collector daemon, trigger a test warning, and - # verify that a kwarn file is created. - utils.system("stop warn-collector") - utils.system("rm -rf /var/run/kwarn") - utils.system("start warn-collector") + # Restart the anomaly collector daemon, trigger a test kernel warning, + # and verify that a warning file is created. 
+        utils.system("stop anomaly-collector")
+        utils.system("rm -rf /var/run/anomaly-collector")
+        utils.system("start anomaly-collector")
         utils.system("sleep 0.1")
         lkdtm = "/sys/kernel/debug/provoke-crash/DIRECT"
         if os.path.exists(lkdtm):
@@ -24,4 +24,4 @@ class platform_WarningCollector(test.test):
         else:
             utils.system("echo warning > /proc/breakme")
         utils.system("sleep 0.1")
-        utils.system("test -f /var/run/kwarn/warning")
+        utils.system("test -f /var/run/anomaly-collector/warning")
diff --git a/client/site_tests/security_SandboxedServices/baseline b/client/site_tests/security_SandboxedServices/baseline
index ad9ca347ce..068df23e1b 100644
--- a/client/site_tests/security_SandboxedServices/baseline
+++ b/client/site_tests/security_SandboxedServices/baseline
@@ -35,7 +35,7 @@ thermal.sh,root,root,No,No,No,No
 daisydog,watchdog,watchdog,Yes,Yes,Yes,No
 permission_brok,devbroker,root,No,Yes,Yes,No
 netfilter-queue,nfqueue,nfqueue,No,Yes,No,Yes
-warn_collector,root,root,No,No,No,No
+anomaly_collect,root,root,No,No,No,No
 attestationd,attestation,attestation,No,No,No,No
 periodic_schedu,root,root,No,No,No,No
 esif_ufd,root,root,No,No,No,No
diff --git a/contrib/result_size_summary.py b/contrib/result_size_summary.py
deleted file mode 100644
index c4a2bd0dab..0000000000
--- a/contrib/result_size_summary.py
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2016 The Chromium OS Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-"""This script is used to calculate the daily summary of the total size of
-the test results uploaded to Google Storage per day. It can also output the
-test results with the largest size.
-"""
-
-import argparse
-import time
-
-import common
-from autotest_lib.client.common_lib import time_utils
-from autotest_lib.client.common_lib.cros.graphite import autotest_es
-
-
-def get_summary(start_time, end_time, top=None, report_stat=False):
-    """Get the summary of total size of test results for given period.
-
-    @param start_time: Start time of the test results to search for.
-    @param end_time: End time of the test results to search for.
-    @param top: Number of top hits with the largest size of test results.
-    @param report_stat: (deprecated, ignored)
-    """
-    fields_returned = ['size_KB', 'time_recorded']
-    if top > 0:
-        fields_returned.append('result_dir')
-    records = autotest_es.query(
-            fields_returned=fields_returned,
-            equality_constraints=[('_type', 'result_dir_size'),],
-            range_constraints=[('time_recorded', start_time, end_time)],
-            sort_specs=[{'time_recorded': 'asc'}])
-
-    total_GB = sum([e['size_KB'] for e in records.hits]) / 1000000.0
-    print 'Number of test result entries found: %d' % len(records.hits)
-    print 'Total size of test results uploaded: %.2f GB' % total_GB
-    if top:
-        hits = sorted(records.hits, key=lambda hit:hit['size_KB'], reverse=True)
-        for hit in hits[:top]:
-            print ('%s: \t%.2f MB' %
-                   (hit['result_dir'], hit['size_KB']/1000.0))
-
-
-def main():
-    """main script.
-    """
-    t_now = time.time()
-    t_now_minus_one_day = t_now - 3600*24
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-l', type=int, dest='last',
-                        help='last days to summarize test results across',
-                        default=None)
-    parser.add_argument('--start', type=str, dest='start',
-                        help=('Enter start time as: yyyy-mm-dd hh:mm:ss,'
-                              'defaults to 24h ago.'),
-                        default=time_utils.epoch_time_to_date_string(
-                                t_now_minus_one_day))
-    parser.add_argument('--end', type=str, dest='end',
-                        help=('Enter end time as: yyyy-mm-dd hh:mm:ss,'
-                              'defaults to current time.'),
-                        default=time_utils.epoch_time_to_date_string(t_now))
-    parser.add_argument('-t', type=int, dest='top',
-                        help='Print the top x largest result folders.',
-                        default=0)
-    parser.add_argument('-r', action='store_true', dest='report_stat',
-                        default=False,
-                        help='Deprecated, ignored.')
-    options = parser.parse_args()
-
-    if options.last:
-        start_time = t_now - 3600*24*options.last
-        end_time = t_now
-    else:
-        start_time = time_utils.to_epoch_time(options.start)
-        end_time = time_utils.to_epoch_time(options.end)
-
-    get_summary(start_time=start_time, end_time=end_time,
-                top=options.top, report_stat=options.report_stat)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/frontend/afe/models.py b/frontend/afe/models.py
index 099cbd19d4..b6f512ccb2 100644
--- a/frontend/afe/models.py
+++ b/frontend/afe/models.py
@@ -21,7 +21,6 @@ from autotest_lib.client.common_lib import global_config
 from autotest_lib.client.common_lib import host_queue_entry_states
 from autotest_lib.client.common_lib import control_data, priorities, decorators
 from autotest_lib.client.common_lib import utils
-from autotest_lib.client.common_lib.cros.graphite import autotest_es
 from autotest_lib.server import utils as server_utils
 
 # job options and user preferences
@@ -597,24 +596,6 @@ class Host(model_logic.ModelWithInvalid, rdb_model_extensions.AbstractHostModel,
         self.labels.clear()
 
 
-    def record_state(self, type_str, state, value, other_metadata=None):
-        """Record metadata in elasticsearch.
-
-        @param type_str: sets the _type field in elasticsearch db.
-        @param state: string representing what state we are recording,
-                      e.g. 'locked'
-        @param value: value of the state, e.g. True
-        @param other_metadata: Other metadata to store in metaDB.
- """ - metadata = { - state: value, - 'hostname': self.hostname, - } - if other_metadata: - metadata = dict(metadata.items() + other_metadata.items()) - autotest_es.post(use_http=True, type_str=type_str, metadata=metadata) - - def save(self, *args, **kwargs): # extra spaces in the hostname can be a sneaky source of errors self.hostname = self.hostname.strip() @@ -629,13 +610,8 @@ class Host(model_logic.ModelWithInvalid, rdb_model_extensions.AbstractHostModel, self.locked_by = User.current_user() if not self.lock_time: self.lock_time = datetime.now() - self.record_state('lock_history', 'locked', self.locked, - {'changed_by': self.locked_by.login, - 'lock_reason': self.lock_reason}) self.dirty = True elif not self.locked and self.locked_by: - self.record_state('lock_history', 'locked', self.locked, - {'changed_by': self.locked_by.login}) self.locked_by = None self.lock_time = None super(Host, self).save(*args, **kwargs) diff --git a/frontend/afe/rpc_interface.py b/frontend/afe/rpc_interface.py index 25910b7307..b52d28bba4 100644 --- a/frontend/afe/rpc_interface.py +++ b/frontend/afe/rpc_interface.py @@ -65,7 +65,6 @@ from autotest_lib.server.cros.dynamic_suite import suite as SuiteBase from autotest_lib.server.cros.dynamic_suite import tools from autotest_lib.server.cros.dynamic_suite.suite import Suite from autotest_lib.server.lib import status_history -from autotest_lib.site_utils import host_history from autotest_lib.site_utils import job_history from autotest_lib.site_utils import server_manager_utils from autotest_lib.site_utils import stable_version_utils @@ -1912,35 +1911,9 @@ def get_job_history(**filter_data): def get_host_history(start_time, end_time, hosts=None, board=None, pool=None): - """Get history of a list of host. - - The return is a JSON string of host history for each host, for example, - {'172.22.33.51': [{'status': 'Resetting' - 'start_time': '2014-08-07 10:02:16', - 'end_time': '2014-08-07 10:03:16', - 'log_url': 'http://autotest/reset-546546/debug', - 'dbg_str': 'Task: Special Task 19441991 (host ...)'}, - {'status': 'Running' - 'start_time': '2014-08-07 10:03:18', - 'end_time': '2014-08-07 10:13:00', - 'log_url': 'http://autotest/reset-546546/debug', - 'dbg_str': 'HQE: 15305005, for job: 14995562'} - ] - } - @param start_time: start time to search for history, can be string value or - epoch time. - @param end_time: end time to search for history, can be string value or - epoch time. - @param hosts: A list of hosts to search for history. Default is None. - @param board: board type of hosts. Default is None. - @param pool: pool type of hosts. Default is None. - @returns: JSON string of the host history. - """ - return rpc_utils.prepare_for_serialization( - host_history.get_history_details( - start_time=start_time, end_time=end_time, - hosts=hosts, board=board, pool=pool, - process_pool_size=4)) + """Deprecated.""" + raise ValueError('get_host_history rpc is deprecated ' + 'and no longer implemented.') def shard_heartbeat(shard_hostname, jobs=(), hqes=(), known_job_ids=(), diff --git a/frontend/health/__init__.py b/frontend/health/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 --- a/frontend/health/__init__.py +++ /dev/null diff --git a/frontend/health/check_test_health.py b/frontend/health/check_test_health.py deleted file mode 100755 index 9238f32e46..0000000000 --- a/frontend/health/check_test_health.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/python -# -# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. 
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-import argparse, logging, os, subprocess, sys
-
-THIS_DIR = os.path.dirname(__file__)
-UTILS_DIR = os.path.abspath(os.path.join(THIS_DIR, os.pardir, os.pardir,
-                                         'utils'))
-TEST_IMPORTER = os.path.join(UTILS_DIR, 'test_importer.py')
-
-# The list of scripts are passed as lists as this allows us to add arguments
-# as the list is just passed to subprocess.call(). We expect the scripts to
-# return 0 on success and a non-zero value on failure.
-
-# Scripts that need to be run first to do any preparation. E.g. update the
-# database.
-PREP_SCRIPTS = [
-        [TEST_IMPORTER]
-    ]
-
-COMPLETE_FAILURES = os.path.join(THIS_DIR, 'complete_failures.py')
-PASSING_EXPERIMENTAL = os.path.join(THIS_DIR, 'passing_experimental.py')
-# Scripts run to do the analysis.
-ANALYSIS_SCRIPTS = [
-        [COMPLETE_FAILURES],
-        [PASSING_EXPERIMENTAL]
-    ]
-
-
-def run_prep_scripts(scripts):
-    """
-    Run the scripts that are required to be run before the analysis.
-
-    This stops and returns False at the first script failure.
-
-    @param scripts: A list of lists. Where the inner list is the script name
-        and arguments to be called (as subprocess.call() expects).
-
-    @return True if all the scripts succeeded and False otherwise.
-
-    """
-
-    for script in scripts:
-        logging.info('Running %s', ' '.join(script))
-        return_code = subprocess.call(script)
-        if return_code != 0:
-            logging.error('\'%s\' failed with return code %d',
-                          (' '.join(script), return_code))
-            return False
-
-    return True
-
-
-def run_analysis_scripts(scripts):
-    """
-    Run the scripts that analyze the database.
-
-    All scripts are run even if one fails.
-
-    @param scripts: A list of lists, where the inner list is the script name
-        and arguments to be called (as subprocess.call() expects).
-
-    @return True if all the scripts succeeded and False otherwise.
-
-    """
-
-    success = True
-
-    for script in scripts:
-        logging.info('Running %s', ' '.join(script))
-        return_code = subprocess.call(script)
-        if return_code != 0:
-            logging.error('\'%s\' failed with return code %d',
-                          (' '.join(script), return_code))
-            success = False
-
-    return success
-
-
-def parse_options(args):
-    """Parse the command line options."""
-
-    description = ('Runs test health and preparation scripts.')
-    parser = argparse.ArgumentParser(description=description)
-    parser.parse_args(args)
-
-
-def main(args=None):
-    """
-    The main function.
-
-    This allows us to test this function by calling it in the unit test file.
-
-    @param args: The command line arguments being passed in.
-
-    @return 0 if everything succeeded and a non-zero integer otherwise.
-
-    """
-    args = [] if args is None else args
-    parse_options(args)
-
-    logging.getLogger().setLevel(logging.INFO)
-
-    prep_success = run_prep_scripts(PREP_SCRIPTS)
-    if not prep_success:
-        return 1
-
-    analysis_success = run_analysis_scripts(ANALYSIS_SCRIPTS)
-    if not analysis_success:
-        return 1
-
-    return 0
-
-
-if __name__ == '__main__':
-    sys.exit(main(sys.argv[1:]))
diff --git a/frontend/health/check_test_health_unittest.py b/frontend/health/check_test_health_unittest.py
deleted file mode 100755
index 4ce7d4122a..0000000000
--- a/frontend/health/check_test_health_unittest.py
+++ /dev/null
@@ -1,212 +0,0 @@
-#!/usr/bin/python
-#
-# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
- -import logging, mox, subprocess, unittest - -import common - -from autotest_lib.frontend.health import check_test_health - - -class RunPrepScriptsTests(mox.MoxTestBase): - """Test the run_prep_scripts() function.""" - - def setUp(self): - super(RunPrepScriptsTests, self).setUp() - self.mox.StubOutWithMock(subprocess, 'call') - self.mox.StubOutWithMock(logging, 'error') - self.mox.StubOutWithMock(logging, 'info') - - - def test_given_scripts_are_called(self): - """Test that all the scripts passed in are called when they pass.""" - scripts = [['script1.sh', 'arg'], ['script2.sh']] - - logging.info(mox.IgnoreArg(), mox.IgnoreArg()) - subprocess.call(['script1.sh', 'arg']).AndReturn(0) - logging.info(mox.IgnoreArg(), mox.IgnoreArg()) - subprocess.call(['script2.sh']).AndReturn(0) - - self.mox.ReplayAll() - check_test_health.run_prep_scripts(scripts) - - - def test_return_true_if_all_scripts_suceed(self): - """Test that True is returned when all the scripts succeed.""" - scripts = [['script.sh']] - - logging.info(mox.IgnoreArg(), mox.IgnoreArg()) - subprocess.call(mox.IgnoreArg()).AndReturn(0) - - self.mox.ReplayAll() - self.assertTrue(check_test_health.run_prep_scripts(scripts)) - - - def test_script_information_logging(self): - """Test that we log prep running and failure.""" - scripts = [['pass.py'], ['fail.sh', 'arg']] - - logging.info('Running %s', 'pass.py') - subprocess.call(['pass.py']).AndReturn(0) - logging.info('Running %s', 'fail.sh arg') - subprocess.call(['fail.sh', 'arg']).AndReturn(1) - logging.error('\'%s\' failed with return code %d', - ('fail.sh arg', 1)) - - self.mox.ReplayAll() - check_test_health.run_prep_scripts(scripts) - - - def test_return_false_if_script_fails(self): - """Test that False is returned if a preparation step fails.""" - scripts = [['script.sh']] - - logging.info(mox.IgnoreArg(), mox.IgnoreArg()) - subprocess.call(mox.IgnoreArg()).AndReturn(1) - logging.error(mox.IgnoreArg(), mox.IgnoreArg()) - - self.mox.ReplayAll() - self.assertFalse(check_test_health.run_prep_scripts(scripts)) - - - def test_do_not_run_other_scripts_after_one_fails(self): - """Test that the other prep scripts are not ran if one fails.""" - scripts = [['script1.sh', 'arg'], ['script2.sh']] - - logging.info(mox.IgnoreArg(), mox.IgnoreArg()) - subprocess.call(['script1.sh', 'arg']).AndReturn(1) - logging.error(mox.IgnoreArg(), mox.IgnoreArg()) - - self.mox.ReplayAll() - check_test_health.run_prep_scripts(scripts) - - - -class RunAnalysisScripts(mox.MoxTestBase): - """Test the run_analysis_scripts() function.""" - - def setUp(self): - super(RunAnalysisScripts, self).setUp() - self.mox.StubOutWithMock(subprocess, 'call') - self.mox.StubOutWithMock(logging, 'error') - self.mox.StubOutWithMock(logging, 'info') - - - def test_given_scripts_are_called(self): - """Test that all the scripts passed in are called when they pass.""" - scripts = [['script1.sh', 'arg'], ['script2.sh']] - - logging.info(mox.IgnoreArg(), mox.IgnoreArg()) - subprocess.call(['script1.sh', 'arg']).AndReturn(0) - logging.info(mox.IgnoreArg(), mox.IgnoreArg()) - subprocess.call(['script2.sh']).AndReturn(0) - - self.mox.ReplayAll() - check_test_health.run_analysis_scripts(scripts) - - - def test_return_true_if_all_scripts_suceed(self): - """Test that True is returned when all the scripts succeed.""" - scripts = [['script.sh']] - - logging.info(mox.IgnoreArg(), mox.IgnoreArg()) - subprocess.call(mox.IgnoreArg()).AndReturn(0) - - self.mox.ReplayAll() - self.assertTrue(check_test_health.run_analysis_scripts(scripts)) - - - def 
test_script_information_logging(self): - """Test that we log prep running and failure.""" - scripts = [['pass.py'], ['fail.sh', 'arg']] - - logging.info('Running %s', 'pass.py') - subprocess.call(['pass.py']).AndReturn(0) - logging.info('Running %s', 'fail.sh arg') - subprocess.call(['fail.sh', 'arg']).AndReturn(1) - logging.error('\'%s\' failed with return code %d', - ('fail.sh arg', 1)) - - self.mox.ReplayAll() - check_test_health.run_analysis_scripts(scripts) - - - def test_return_false_if_script_fails(self): - """"Test that False is returned when at least one script fails.""" - scripts = [['script.sh']] - - logging.info(mox.IgnoreArg(), mox.IgnoreArg()) - subprocess.call(mox.IgnoreArg()).AndReturn(1) - logging.error(mox.IgnoreArg(), mox.IgnoreArg()) - - self.mox.ReplayAll() - self.assertFalse(check_test_health.run_analysis_scripts(scripts)) - - - def test_run_other_scripts_after_one_fails(self): - """Test that the other analysis scripts are ran even if one fails.""" - scripts = [['script1.sh', 'arg'], ['script2.sh']] - - logging.info(mox.IgnoreArg(), mox.IgnoreArg()) - subprocess.call(['script1.sh', 'arg']).AndReturn(1) - logging.error(mox.IgnoreArg(), mox.IgnoreArg()) - logging.info(mox.IgnoreArg(), mox.IgnoreArg()) - subprocess.call(['script2.sh']).AndReturn(0) - - self.mox.ReplayAll() - check_test_health.run_analysis_scripts(scripts) - - -class MainTests(mox.MoxTestBase): - """Tests the main function.""" - - def setUp(self): - super(MainTests, self).setUp() - self.mox.StubOutWithMock(check_test_health, 'run_prep_scripts') - self.mox.StubOutWithMock(check_test_health, 'run_analysis_scripts') - self._orig_prep = check_test_health.PREP_SCRIPTS - self._orig_analysis = check_test_health.ANALYSIS_SCRIPTS - - - def tearDown(self): - super(MainTests, self).tearDown() - check_test_health.PREP_SCRIPTS = self._orig_prep - check_test_health.ANALYSIS_SCRIPTS = self._orig_analysis - - - def test_all_functions_called_if_there_are_no_errors(self): - """Test that all the script calling functions are called by default.""" - check_test_health.PREP_SCRIPTS = [['test_prep']] - check_test_health.ANALYSIS_SCRIPTS = [['test_analysis']] - - check_test_health.run_prep_scripts( - check_test_health.PREP_SCRIPTS).AndReturn(True) - check_test_health.run_analysis_scripts( - check_test_health.ANALYSIS_SCRIPTS).AndReturn(True) - - self.mox.ReplayAll() - self.assertEqual(check_test_health.main(), 0) - - - def test_handle_prep_failure(self): - """Test that we properly handle a prep script failing.""" - check_test_health.run_prep_scripts(mox.IgnoreArg()).AndReturn(False) - - self.mox.ReplayAll() - self.assertEqual(check_test_health.main(), 1) - - - def test_handle_analysis_failure(self): - """Test that we properly handle an analysis script failing.""" - check_test_health.run_prep_scripts(mox.IgnoreArg()).AndReturn(True) - check_test_health.run_analysis_scripts(mox.IgnoreArg()).AndReturn(False) - - self.mox.ReplayAll() - self.assertEqual(check_test_health.main(), 1) - - -if __name__ == '__main__': - unittest.main() diff --git a/frontend/health/common.py b/frontend/health/common.py deleted file mode 100644 index a9ca715556..0000000000 --- a/frontend/health/common.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. 
- -"""Setup autotest_lib convenience imports.""" - -import os, sys -dirname = os.path.dirname(sys.modules[__name__].__file__) -autotest_dir = os.path.abspath(os.path.join(dirname, os.pardir, os.pardir)) -client_dir = os.path.join(autotest_dir, 'client') -sys.path.insert(0, client_dir) -import setup_modules -sys.path.pop(0) -setup_modules.setup(base_path=autotest_dir, root_module_name='autotest_lib') diff --git a/frontend/health/complete_failures.py b/frontend/health/complete_failures.py deleted file mode 100755 index 5d6b8dcd60..0000000000 --- a/frontend/health/complete_failures.py +++ /dev/null @@ -1,174 +0,0 @@ -#!/usr/bin/python -# -# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - - -import argparse, datetime, sys - -import common -from autotest_lib.client.common_lib import mail -from autotest_lib.frontend import setup_django_readonly_environment - -# Django and the models are only setup after -# the setup_django_readonly_environment module is imported. -from autotest_lib.frontend.tko import models as tko_models -from autotest_lib.frontend.health import utils - - -# Mark a test as failing too long if it has not passed in this many days -_DAYS_TO_BE_FAILING_TOO_LONG = 60 -# Ignore any tests that have not ran in this many days -_DAYS_NOT_RUNNING_CUTOFF = 60 -_MAIL_RESULTS_FROM = 'chromeos-test-health@google.com' -_MAIL_RESULTS_TO = 'chromeos-lab-infrastructure@google.com' - - -def is_valid_test_name(name): - """ - Returns if a test name is valid or not. - - There is a bunch of entries in the tko_test table that are not actually - test names. They are there as a side effect of how Autotest uses this - table. - - Two examples of bad tests names are as follows: - link-release/R29-4228.0.0/faft_ec/firmware_ECPowerG3_SERVER_JOB - try_new_image-chormeos1-rack2-host2 - - @param name: The candidate test names to check. - @return True if name is a valid test name and false otherwise. - - """ - return not '/' in name and not name.startswith('try_new_image') - - -def prepare_last_passes(last_passes): - """ - Fix up the last passes so they can be used by the system. - - This filters out invalid test names and converts the test names to utf8 - encoding. - - @param last_passes: The dictionary of test_name:last_pass pairs. - - @return: Valid entries in encoded as utf8 strings. - """ - valid_test_names = filter(is_valid_test_name, last_passes) - # The shelve module does not accept Unicode objects as keys but does - # accept utf-8 strings. - return {name.encode('utf8'): last_passes[name] - for name in valid_test_names} - - -def get_recently_ran_test_names(): - """ - Get all the test names from the database that have been recently ran. - - @return a set of the recently ran tests. - - """ - cutoff_delta = datetime.timedelta(_DAYS_NOT_RUNNING_CUTOFF) - cutoff_date = datetime.datetime.today() - cutoff_delta - results = tko_models.Test.objects.filter( - started_time__gte=cutoff_date).values('test').distinct() - test_names = [test['test'] for test in results] - valid_test_names = filter(is_valid_test_name, test_names) - return {test.encode('utf8') for test in valid_test_names} - - -def get_tests_to_analyze(recent_test_names, last_pass_times): - """ - Get all the recently ran tests as well as the last time they have passed. - - The minimum datetime is given as last pass time for tests that have never - passed. 
- - @param recent_test_names: The set of the names of tests that have been - recently ran. - @param last_pass_times: The dictionary of test_name:last_pass_time pairs. - - @return the dict of test_name:last_finish_time pairs. - - """ - prepared_passes = prepare_last_passes(last_pass_times) - - running_passes = {} - for test, pass_time in prepared_passes.items(): - if test in recent_test_names: - running_passes[test] = pass_time - - failures_names = recent_test_names.difference(running_passes) - always_failed = {test: datetime.datetime.min for test in failures_names} - return dict(always_failed.items() + running_passes.items()) - - -def email_about_test_failure(failed_tests, all_tests): - """ - Send an email about all the tests that have failed if there are any. - - @param failed_tests: The list of failed tests. This will be sorted in this - function. - @param all_tests: All the names of tests that have been recently ran. - - """ - if failed_tests: - failed_tests.sort() - mail.send(_MAIL_RESULTS_FROM, - [_MAIL_RESULTS_TO], - [], - 'Long Failing Tests', - '%d/%d tests have been failing for at least %d days.\n' - 'They are the following:\n\n%s' - % (len(failed_tests), len(all_tests), - _DAYS_TO_BE_FAILING_TOO_LONG, - '\n'.join(failed_tests))) - - -def filter_out_good_tests(tests): - """ - Remove all tests that have passed recently enough to be good. - - @param tests: The tests to filter on. - - @return: A list of tests that have not passed for a long time. - - """ - cutoff = (datetime.datetime.today() - - datetime.timedelta(_DAYS_TO_BE_FAILING_TOO_LONG)) - return [name for name, last_pass in tests.items() if last_pass < cutoff] - - -def parse_options(args): - """Parse the command line options.""" - - description = ('Collects information about which tests have been ' - 'failing for a long time and creates an email summarizing ' - 'the results.') - parser = argparse.ArgumentParser(description=description) - parser.parse_args(args) - - -def main(args=None): - """ - The script code. - - Allows other python code to import and run this code. This will be more - important if a nice way to test this code can be determined. - - @param args: The command line arguments being passed in. - - """ - args = [] if args is None else args - parse_options(args) - all_test_names = get_recently_ran_test_names() - last_passes = utils.get_last_pass_times() - tests = get_tests_to_analyze(all_test_names, last_passes) - failures = filter_out_good_tests(tests) - email_about_test_failure(failures, all_test_names) - - - -if __name__ == '__main__': - sys.exit(main(sys.argv[1:])) diff --git a/frontend/health/complete_failures_functional_test.py b/frontend/health/complete_failures_functional_test.py deleted file mode 100755 index 0685863740..0000000000 --- a/frontend/health/complete_failures_functional_test.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/python -# -# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -#!/usr/bin/python -# -# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -import datetime, unittest - -import mox - -import common -# This must come before the import of complete_failures in order to use the -# in memory database. 
-from autotest_lib.frontend import setup_django_readonly_environment -from autotest_lib.frontend import setup_test_environment -import complete_failures -from autotest_lib.client.common_lib import mail -from autotest_lib.frontend.tko import models -from django import test - - -GOOD_STATUS_IDX = 6 -FAIL_STATUS_IDX = 4 - -ERROR_STATUS = models.Status(status_idx=2, word='ERROR') -ABORT_STATUS = models.Status(status_idx=3, word='ABORT') -FAIL_STATUS = models.Status(status_idx=4, word='FAIL') -WARN_STATUS = models.Status(status_idx=5, word='WARN') -GOOD_STATUS = models.Status(status_idx=6, word='GOOD') -ALERT_STATUS = models.Status(status_idx=7, word='ALERT') - - -def add_statuses(): - """ - Save the statuses to the in-memory database. - - These normally exist in the database and the code expects them. However, the - normal test database setup does not do this for us. - """ - ERROR_STATUS.save() - ABORT_STATUS.save() - FAIL_STATUS.save() - WARN_STATUS.save() - GOOD_STATUS.save() - ALERT_STATUS.save() - - -# During the tests there is a point where Django does a type check on -# datetime.datetime. Unfortunately this means when datetime is mocked out, -# horrible failures happen when Django tries to do this check. The solution -# chosen is to create a pure Python class that inheirits from datetime.datetime -# so that the today class method can be directly mocked out. It is necesarry -# to mock out datetime.datetime completely as it a C class and so cannot have -# parts of itself mocked out. -class MockDatetime(datetime.datetime): - """Used to mock out parts of datetime.datetime.""" - pass - - -class CompleteFailuresFunctionalTests(mox.MoxTestBase, test.TestCase): - """ - Does a functional test of the complete_failures script. - - This uses an in-memory database but everything else is a full run. - - """ - - def setUp(self): - super(CompleteFailuresFunctionalTests, self).setUp() - setup_test_environment.set_up() - add_statuses() - # All of our tests will involve mocking out the datetime.today() class - # method. - self.mox.StubOutWithMock(MockDatetime, 'today') - self.datetime = datetime.datetime - datetime.datetime = MockDatetime - # We need to mock out the send function in all tests or else the - # emails will be sent out during tests. 
- self.mox.StubOutWithMock(mail, 'send') - - self._orignal_too_late = complete_failures._DAYS_TO_BE_FAILING_TOO_LONG - - - def tearDown(self): - complete_failures._DAYS_TO_BE_FAILING_TOO_LONG = self._orignal_too_late - datetime.datetime = self.datetime - setup_test_environment.tear_down() - super(CompleteFailuresFunctionalTests, self).tearDown() - - - def test(self): - """Does a basic test of as much of the system as possible.""" - job = models.Job(job_idx=1) - kernel = models.Kernel(kernel_idx=1) - machine = models.Machine(machine_idx=1) - success_status = models.Status(status_idx=GOOD_STATUS_IDX) - fail_status = models.Status(status_idx=FAIL_STATUS_IDX) - - old_passing_test = models.Test(job=job, status=success_status, - kernel=kernel, machine=machine, - test='test1', - started_time=self.datetime(2012, 1, 1)) - old_passing_test.save() - failing_test = models.Test(job=job, status=fail_status, - kernel=kernel, machine=machine, - test='test2', - started_time=self.datetime(2012,1,1)) - failing_test.save() - good_test = models.Test(job=job, status=success_status, - kernel=kernel, machine=machine, - test='test3', - started_time=self.datetime(2012, 1, 20)) - good_test.save() - - complete_failures._DAYS_TO_BE_FAILING_TOO_LONG = 10 - MockDatetime.today().AndReturn(self.datetime(2012, 1, 21)) - MockDatetime.today().AndReturn(self.datetime(2012, 1, 21)) - mail.send('chromeos-test-health@google.com', - ['chromeos-lab-infrastructure@google.com'], - [], - 'Long Failing Tests', - '2/3 tests have been failing for at least %d days.\n' - 'They are the following:\n\ntest1\ntest2' - % complete_failures._DAYS_TO_BE_FAILING_TOO_LONG) - - self.mox.ReplayAll() - complete_failures.main() - - -if __name__ == '__main__': - unittest.main() diff --git a/frontend/health/complete_failures_unittest.py b/frontend/health/complete_failures_unittest.py deleted file mode 100755 index b49e0d8306..0000000000 --- a/frontend/health/complete_failures_unittest.py +++ /dev/null @@ -1,296 +0,0 @@ -#!/usr/bin/python -# -# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -import datetime, unittest - -import mox - -import common -# This must come before the import of complete_failures in order to use the -# in memory database. -from autotest_lib.frontend import setup_django_readonly_environment -from autotest_lib.frontend import setup_test_environment -import complete_failures -from autotest_lib.client.common_lib import mail -from autotest_lib.frontend.tko import models -from django import test - - -GOOD_STATUS_IDX = 6 - -# See complte_failurs_functional_tests.py for why we need this. -class MockDatetime(datetime.datetime): - """Used to mock out parts of datetime.datetime.""" - pass - - -class EmailAboutTestFailureTests(mox.MoxTestBase): - """ - Tests that emails are sent about failed tests. - """ - def setUp(self): - super(EmailAboutTestFailureTests, self).setUp() - - # We need to mock out the send function in all tests or else the - # emails will be sent out during tests. 
- self.mox.StubOutWithMock(mail, 'send') - - self._orig_too_long = complete_failures._DAYS_TO_BE_FAILING_TOO_LONG - - - def tearDown(self): - complete_failures._DAYS_TO_BE_FAILING_TOO_LONG = self._orig_too_long - super(EmailAboutTestFailureTests, self).tearDown() - - - def test_email_sent_about_all_failed_tests(self): - """Test that the email report mentions all the failed_tests.""" - complete_failures._DAYS_TO_BE_FAILING_TOO_LONG = 60 - - mail.send( - 'chromeos-test-health@google.com', - ['chromeos-lab-infrastructure@google.com'], - [], - 'Long Failing Tests', - '1/1 tests have been failing for at least %d days.\n' - 'They are the following:\n\ntest' - % complete_failures._DAYS_TO_BE_FAILING_TOO_LONG) - - failures = ['test'] - all_tests = set(failures) - - self.mox.ReplayAll() - complete_failures.email_about_test_failure(failures, all_tests) - - - def test_email_has_test_names_sorted_alphabetically(self): - """Test that the email report has entries sorted alphabetically.""" - complete_failures._DAYS_TO_BE_FAILING_TOO_LONG = 60 - - mail.send( - 'chromeos-test-health@google.com', - ['chromeos-lab-infrastructure@google.com'], - [], - 'Long Failing Tests', - '2/2 tests have been failing for at least %d days.\n' - 'They are the following:\n\ntest1\ntest2' - % complete_failures._DAYS_TO_BE_FAILING_TOO_LONG) - - # We use an OrderedDict to gurantee that the elements would be out of - # order if we did a simple traversal. - failures = ['test2', 'test1'] - all_tests = set(failures) - - self.mox.ReplayAll() - complete_failures.email_about_test_failure(failures, all_tests) - - - def test_email_count_of_total_number_of_tests(self): - """Test that the email report displays total number of tests.""" - complete_failures._DAYS_TO_BE_FAILING_TOO_LONG = 60 - - mail.send( - 'chromeos-test-health@google.com', - ['chromeos-lab-infrastructure@google.com'], - [], - 'Long Failing Tests', - '1/2 tests have been failing for at least %d days.\n' - 'They are the following:\n\ntest' - % complete_failures._DAYS_TO_BE_FAILING_TOO_LONG) - - failures = ['test'] - all_tests = set(failures) | {'not_failure'} - - self.mox.ReplayAll() - complete_failures.email_about_test_failure(failures, all_tests) - - -class IsValidTestNameTests(test.TestCase): - """Tests the is_valid_test_name function.""" - - def test_returns_true_for_valid_test_name(self): - """Test that a valid test name returns True.""" - name = 'TestName.TestName' - self.assertTrue(complete_failures.is_valid_test_name(name)) - - - def test_returns_false_if_name_has_slash_in_it(self): - """Test that a name with a slash in it returns False.""" - name = 'path/to/test' - self.assertFalse(complete_failures.is_valid_test_name(name)) - - - def test_returns_false_for_try_new_image_entries(self): - """Test that a name that starts with try_new_image returns False.""" - name = 'try_new_image-blah' - self.assertFalse(complete_failures.is_valid_test_name(name)) - - -class PrepareLastPassesTests(test.TestCase): - """Tests the prepare_last_passes function.""" - - def setUp(self): - super(PrepareLastPassesTests, self).setUp() - - def tearDown(self): - super(PrepareLastPassesTests, self).tearDown() - - def test_does_not_return_invalid_test_names(self): - """Tests that tests with invalid test names are not returned.""" - results = complete_failures.prepare_last_passes(['invalid_test/name']) - - self.assertEqual(results, {}) - - -class GetRecentlyRanTestNamesTests(mox.MoxTestBase, test.TestCase): - """Tests the get_recently_ran_test_names function.""" - - def setUp(self): - 
super(GetRecentlyRanTestNamesTests, self).setUp() - self.mox.StubOutWithMock(MockDatetime, 'today') - self.datetime = datetime.datetime - datetime.datetime = MockDatetime - setup_test_environment.set_up() - self._orig_cutoff = complete_failures._DAYS_NOT_RUNNING_CUTOFF - - - def tearDown(self): - datetime.datetime = self.datetime - complete_failures._DAYS_NOT_RUNNING_CUTOFF = self._orig_cutoff - setup_test_environment.tear_down() - super(GetRecentlyRanTestNamesTests, self).tearDown() - - - def test_return_all_recently_ran_tests(self): - """Test that the function does as it says it does.""" - job = models.Job(job_idx=1) - kernel = models.Kernel(kernel_idx=1) - machine = models.Machine(machine_idx=1) - success_status = models.Status(status_idx=GOOD_STATUS_IDX) - - recent = models.Test(job=job, status=success_status, - kernel=kernel, machine=machine, - test='recent', - started_time=self.datetime(2012, 1, 1)) - recent.save() - old = models.Test(job=job, status=success_status, - kernel=kernel, machine=machine, - test='old', - started_time=self.datetime(2011, 1, 2)) - old.save() - - datetime.datetime.today().AndReturn(self.datetime(2012, 1, 4)) - complete_failures._DAYS_NOT_RUNNING_CUTOFF = 60 - - self.mox.ReplayAll() - results = complete_failures.get_recently_ran_test_names() - - self.assertEqual(set(results), set(['recent'])) - - - def test_returns_no_duplicate_names(self): - """Test that each test name appears only once.""" - job = models.Job(job_idx=1) - kernel = models.Kernel(kernel_idx=1) - machine = models.Machine(machine_idx=1) - success_status = models.Status(status_idx=GOOD_STATUS_IDX) - - test = models.Test(job=job, status=success_status, - kernel=kernel, machine=machine, - test='test', - started_time=self.datetime(2012, 1, 1)) - test.save() - duplicate = models.Test(job=job, status=success_status, - kernel=kernel, machine=machine, - test='test', - started_time=self.datetime(2012, 1, 2)) - duplicate.save() - - datetime.datetime.today().AndReturn(self.datetime(2012, 1, 3)) - complete_failures._DAYS_NOT_RUNNING_CUTOFF = 60 - - self.mox.ReplayAll() - results = complete_failures.get_recently_ran_test_names() - - self.assertEqual(len(results), 1) - - -class GetTestsToAnalyzeTests(mox.MoxTestBase): - """Tests the get_tests_to_analyze function.""" - - def test_returns_recent_test_names(self): - """Test should return all the test names in the database.""" - - recent_tests = {'passing_test', 'failing_test'} - last_passes = {'passing_test': datetime.datetime(2012, 1 ,1), - 'old_passing_test': datetime.datetime(2011, 1, 1)} - - results = complete_failures.get_tests_to_analyze(recent_tests, - last_passes) - - self.assertEqual(results, - {'passing_test': datetime.datetime(2012, 1, 1), - 'failing_test': datetime.datetime.min}) - - - def test_returns_failing_tests_with_min_datetime(self): - """Test that never-passed tests are paired with datetime.min.""" - - recent_tests = {'test'} - last_passes = {} - - self.mox.ReplayAll() - results = complete_failures.get_tests_to_analyze(recent_tests, - last_passes) - - self.assertEqual(results, {'test': datetime.datetime.min}) - - -class FilterOutGoodTestsTests(mox.MoxTestBase): - """Tests the filter_our_good_tests function.""" - - def setUp(self): - super(FilterOutGoodTestsTests, self).setUp() - self.mox.StubOutWithMock(MockDatetime, 'today') - self.datetime = datetime.datetime - datetime.datetime = MockDatetime - self._orig_too_long = complete_failures._DAYS_TO_BE_FAILING_TOO_LONG - - - def tearDown(self): - datetime.datetime = self.datetime - 
complete_failures._DAYS_TO_BE_FAILING_TOO_LONG = self._orig_too_long - super(FilterOutGoodTestsTests, self).tearDown() - - - def test_remove_all_tests_that_have_passed_recently_enough(self): - """Test that all recently passing tests are not returned.""" - - complete_failures._DAYS_TO_BE_FAILING_TOO_LONG = 10 - datetime.datetime.today().AndReturn(self.datetime(2012, 1, 21)) - - self.mox.ReplayAll() - result = complete_failures.filter_out_good_tests( - {'test': self.datetime(2012, 1, 20)}) - - self.assertEqual(result, []) - - - def test_keep_all_tests_that_have_not_passed_recently_enough(self): - """Test that the tests that have not recently passed are returned.""" - - complete_failures._DAYS_TO_BE_FAILING_TOO_LONG = 10 - datetime.datetime.today().AndReturn(self.datetime(2012, 1, 21)) - - self.mox.ReplayAll() - result = complete_failures.filter_out_good_tests( - {'test': self.datetime(2012, 1, 10)}) - - self.assertEqual(result, ['test']) - - -if __name__ == '__main__': - unittest.main() diff --git a/frontend/health/manual_check_passing_experimental.py b/frontend/health/manual_check_passing_experimental.py deleted file mode 100755 index 62759de251..0000000000 --- a/frontend/health/manual_check_passing_experimental.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/python -# -# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -import argparse, datetime, sys - -import common -# This must come before the import of complete_failures in order to use the -# in memory database. -from autotest_lib.frontend import setup_django_readonly_environment -from autotest_lib.frontend import setup_test_environment -from autotest_lib.frontend.afe import models as afe_models -from autotest_lib.frontend.health import passing_experimental -from autotest_lib.frontend.tko import models as tko_models - -GOOD_STATUS_IDX = 6 - - -def parse_options(args): - """Parse the command line options.""" - description = ('Sets up a fake database and then runs ' - 'passing_experimental.py main() function to simulate ' - 'running the script to test bug filing. Manually checking ' - 'will be required to verify that bugs have been submitted ' - 'correctly. Remember to set up the shadow_config.ini file ' - 'to point to the autotest-bug-filing-test dummy project.') - parser = argparse.ArgumentParser(description=description) - parser.parse_args(args) - - -def main(args): - """ - Run passing_experimental.py to check bug filing for it. - - This sets the fake database up so a bug is guranteed to be filed. However, - it requires manually verifying that the bug was filed and deduped. - - @param args: The arguments passed in from the commandline. - - """ - args = [] if args is None else args - parse_options(args) - - setup_test_environment.set_up() - - afe_models.Test(name='test_dedupe', test_type=0, path='test_dedupe', - experimental=True).save() - - tko_models.Status(status_idx=6, word='GOOD').save() - - job = tko_models.Job(job_idx=1) - kernel = tko_models.Kernel(kernel_idx=1) - machine = tko_models.Machine(machine_idx=1) - success_status = tko_models.Status(status_idx=GOOD_STATUS_IDX) - - tko_dedupe = tko_models.Test(job=job, status=success_status, - kernel=kernel, machine=machine, - test='test_dedupe', - started_time=datetime.datetime.today()) - tko_dedupe.save() - - passing_experimental.main() - - # We assume that the user is using the dummy tracker when using this script. 
- print ('Now check the bug tracker to make sure this was properly deduped.\n' - 'https://code.google.com/p/autotest-bug-filing-test/issues/list?' - 'q=PassingExperimental') - - -if __name__ == '__main__': - sys.exit(main(sys.argv[1:])) diff --git a/frontend/health/passing_experimental.py b/frontend/health/passing_experimental.py deleted file mode 100755 index 2cca4d8232..0000000000 --- a/frontend/health/passing_experimental.py +++ /dev/null @@ -1,125 +0,0 @@ -#!/usr/bin/python -# -# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - - -import argparse, datetime, sys - -import common -from autotest_lib.frontend import setup_django_readonly_environment -from autotest_lib.server.cros.dynamic_suite import reporting - -# Django and the models are only setup after -# the setup_django_readonly_environment module is imported. -from autotest_lib.frontend.afe import models as afe_models -from autotest_lib.frontend.health import utils as test_health_utils - - -# Keep tests that have not failed for at least this many days. -_MIN_DAYS_SINCE_FAILURE = 30 -# Ignore any tests that have not passed in this many days. -_MAX_DAYS_SINCE_LAST_PASS = 30 - - -def get_experimental_tests(): - """ - Get all the tests marked experimental from the afe_autotests table. - - @return the set of experimental test names. - - """ - entries = afe_models.Test.objects.values('name').filter(experimental=True) - return {entry['name'] for entry in entries} - - -def find_long_passing_tests(pass_times, fail_times, valid_names): - """ - Determine the experimental tests that have been passsing for a long time. - - @param pass_times: The dictionary of test_name:pass_time pairs. - @param fail_times: The dictionary of test_name:fail_time pairs. - @param valid_names: An iterable of experimental test names. - - @return the list of experimental test names that have been passing for a - long time. - - """ - failure_cutoff_date = (datetime.datetime.today() - - datetime.timedelta(_MIN_DAYS_SINCE_FAILURE)) - pass_cutoff_date = (datetime.datetime.today() - - datetime.timedelta(_MAX_DAYS_SINCE_LAST_PASS)) - - valid_passes = {test for test in valid_names if test in pass_times} - valid_failures = {test for test in valid_names if test in fail_times} - - recent_passes = {test for test in valid_passes - if (pass_times[test] > pass_cutoff_date)} - recent_fails = {test for test in valid_failures - if (fail_times[test] > failure_cutoff_date)} - - return recent_passes - recent_fails - - -def parse_options(args): - """Parse the command line options.""" - - description = ('Collects information about which experimental tests ' - 'have been passing for a long time and creates a bug ' - 'report for each one.') - parser = argparse.ArgumentParser(description=description) - parser.parse_args(args) - - -def submit_bug_reports(tests): - """ - Submits bug reports to make the long passing tests as not experimental. - - @param tests: The tests that need to be marked as not experimental. - """ - - for test in tests: - title = '%s should be promoted to non-experimental.' 
% test - summary = ('This bug has been automatically filed to track the ' - 'following issue:\n\n' - 'Test: %s\n' - 'Issue: Promote to non-experimental as it has been passing ' - 'for at least %d days.\n' - 'Suggested Actions: Navigate to the test\'s control file ' - 'and remove the EXPERIMENTAL flag.\n' - '\tSee http://www.chromium.org/chromium-os/testing/' - 'autotest-best-practices#TOC-Control-files' % - (test, _MIN_DAYS_SINCE_FAILURE)) - search_marker = 'PassingExperimental(%s)' % test - reporting.submit_generic_bug_report(title=title, summary=summary, - search_marker=search_marker) - - -def main(args=None): - """ - The script code. - - Allows other python code to import and run this code. This will be more - important if a nice way to test this code can be determined. - - @param args: The command line arguments being passed in. - - """ - args = [] if args is None else args - parse_options(args) - - experimental_tests = get_experimental_tests() - pass_times = test_health_utils.get_last_pass_times() - fail_times = test_health_utils.get_last_fail_times() - - long_passers = find_long_passing_tests(pass_times, fail_times, - experimental_tests) - - submit_bug_reports(long_passers) - - return 0 - - -if __name__ == '__main__': - sys.exit(main(sys.argv[1:])) diff --git a/frontend/health/passing_experimental_functional_test.py b/frontend/health/passing_experimental_functional_test.py deleted file mode 100755 index 1b2784fdad..0000000000 --- a/frontend/health/passing_experimental_functional_test.py +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/python -# -# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -import datetime, subprocess, unittest - -import mox - -import common -# This must come before the import of complete_failures in order to use the -# in-memory database. -from autotest_lib.frontend import setup_django_readonly_environment -from autotest_lib.frontend import setup_test_environment -from autotest_lib.frontend.health import passing_experimental -from autotest_lib.frontend.afe import models as afe_models -from autotest_lib.frontend.tko import models as tko_models -from autotest_lib.server.cros.dynamic_suite import reporting -from django import test - - -GOOD_STATUS_IDX = 6 -FAIL_STATUS_IDX = 4 - -# During the tests there is a point where Django does a type check on -# datetime.datetime. Unfortunately this means when datetime is mocked out, -# horrible failures happen when Django tries to do this check. The solution -# chosen is to create a pure Python class that inheirits from datetime.datetime -# so that the today class method can be directly mocked out. It is necesarry -# to mock out datetime.datetime completely as it a C class and so cannot have -# parts of itself mocked out. -class MockDatetime(datetime.datetime): - """Used to mock out parts of datetime.datetime.""" - pass - - -class PassingExperimentalFunctionalTests(mox.MoxTestBase, test.TestCase): - """ - Does a functional test of the passing_experimental.py script. - - It uses an in-memory database, mocks out the saving and loading of the - storage object and mocks out the sending of the bugs. Everything else - is a full run. - - """ - - def setUp(self): - super(PassingExperimentalFunctionalTests, self).setUp() - setup_test_environment.set_up() - # All of our tests will involve mocking out the datetime.today() class - # method. 
- self.mox.StubOutWithMock(MockDatetime, 'today') - self.datetime = datetime.datetime - datetime.datetime = MockDatetime - # We really do not want a script that modifies the DB to run during - # testing. So we will mock this out even though we will mock out the - # function that calls it in case of refactoring. - self.mox.StubOutWithMock(subprocess, 'call') - # We need to mock out this function so bugs are not filed. - self.mox.StubOutClassWithMocks(reporting, 'Bug') - self.mox.StubOutClassWithMocks(reporting, 'Reporter') - self._orig_since_failure = passing_experimental._MIN_DAYS_SINCE_FAILURE - self._orig_since_pass = passing_experimental._MAX_DAYS_SINCE_LAST_PASS - - - def tearDown(self): - passing_experimental._MAX_DAYS_SINCE_LAST_PASS = self._orig_since_pass - passing_experimental._MIN_DAYS_SINCE_FAILURE = self._orig_since_failure - datetime.datetime = self.datetime - setup_test_environment.tear_down() - super(PassingExperimentalFunctionalTests, self).tearDown() - - - def test(self): - """Does a basic test of as much of the system as possible.""" - afe_models.Test(name='test1', test_type=0, path='test1', - experimental=True).save() - afe_models.Test(name='test2', test_type=0, path='test2', - experimental=True).save() - - tko_models.Status(status_idx=6, word='GOOD').save() - - job = tko_models.Job(job_idx=1) - kernel = tko_models.Kernel(kernel_idx=1) - machine = tko_models.Machine(machine_idx=1) - success_status = tko_models.Status(status_idx=GOOD_STATUS_IDX) - fail_status = tko_models.Status(status_idx=FAIL_STATUS_IDX) - - tko_test1 = tko_models.Test(job=job, status=success_status, - kernel=kernel, machine=machine, - test='test1', - started_time=self.datetime(2012, 1, 20)) - tko_test1.save() - tko_test2 = tko_models.Test(job=job, status=success_status, - kernel=kernel, machine=machine, - test='test2', - started_time=self.datetime(2012, 1, 20)) - tko_test2.save() - - passing_experimental._MAX_DAYS_SINCE_LAST_PASS = 10 - passing_experimental._MIN_DAYS_SINCE_FAILURE = 10 - - MockDatetime.today().AndReturn(self.datetime(2012, 1, 21)) - MockDatetime.today().AndReturn(self.datetime(2012, 1, 21)) - reporter1 = reporting.Reporter() - bug1 = reporting.Bug( - title=u'test1 should be promoted to non-experimental.', - summary=mox.IgnoreArg(), - search_marker=u'PassingExperimental(test1)') - reporter1.report(bug1).AndReturn((11, 1)) - reporter2 = reporting.Reporter() - bug2 = reporting.Bug( - title=u'test2 should be promoted to non-experimental.', - summary=mox.IgnoreArg(), - search_marker=u'PassingExperimental(test2)') - reporter2.report(bug2).AndReturn((11, 1)) - - self.mox.ReplayAll() - passing_experimental.main() - - -if __name__ == '__main__': - unittest.main() diff --git a/frontend/health/passing_experimental_unittest.py b/frontend/health/passing_experimental_unittest.py deleted file mode 100755 index fe1aa15a62..0000000000 --- a/frontend/health/passing_experimental_unittest.py +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/python -# -# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -import datetime, unittest - -import mox - -import common -# This must come before the import of complete_failures in order to use the -# in memory database. 
-from autotest_lib.frontend import setup_django_readonly_environment -from autotest_lib.frontend import setup_test_environment -from autotest_lib.frontend.health import passing_experimental -from autotest_lib.frontend.afe import models as afe_models -from django import test - - -# datetime.datetime is all C code and so cannot be mocked out in a normal -# fashion. -class MockDatetime(datetime.datetime): - """Used to mock out parts of datetime.datetime.""" - pass - - -class GetExperimentalTestsTests(test.TestCase): - """Tests the get_experimetnal_tests function.""" - - def setUp(self): - super(GetExperimentalTestsTests, self).setUp() - setup_test_environment.set_up() - - - def tearDown(self): - setup_test_environment.tear_down() - super(GetExperimentalTestsTests, self).tearDown() - - - def test_returns_tests_marked_experimental(self): - """Test that tests marked as experimental are returned.""" - test = afe_models.Test(name='test', test_type=0, - experimental=True) - test.save() - - result = passing_experimental.get_experimental_tests() - - self.assertEqual(result, set(['test'])) - - - def test_does_not_return_tests_not_marked_experimental(self): - """Test that tests not marked as experimetnal are not returned.""" - test = afe_models.Test(name='test', test_type=0, - experimental=False) - test.save() - - result = passing_experimental.get_experimental_tests() - - self.assertEqual(result, set()) - - -class FindLongPassingTestsTests(mox.MoxTestBase, test.TestCase): - """Tests the find_long_passing_tests function.""" - def setUp(self): - super(FindLongPassingTestsTests, self).setUp() - self.mox.StubOutWithMock(MockDatetime, 'today') - self._datetime = datetime.datetime - datetime.datetime = MockDatetime - self._orig_since_failure = passing_experimental._MIN_DAYS_SINCE_FAILURE - self._orig_since_pass = passing_experimental._MAX_DAYS_SINCE_LAST_PASS - - - def tearDown(self): - passing_experimental._MAX_DAYS_SINCE_LAST_PASS = self._orig_since_pass - passing_experimental._MIN_DAYS_SINCE_FAILURE = self._orig_since_failure - datetime.datetime = self._datetime - super(FindLongPassingTestsTests, self).tearDown() - - - def test_do_not_return_tests_that_have_failed_recently(self): - """Test that tests that have failed recently are not returned.""" - passing_experimental._MIN_DAYS_SINCE_FAILURE = 10 - datetime.datetime.today().AndReturn(self._datetime(2013, 3, 20)) - datetime.datetime.today().AndReturn(self._datetime(2013, 3, 20)) - - pass_times = {'test': self._datetime(2013, 3, 12)} - fail_times = {'test': self._datetime(2013, 3, 13)} - valid_tests = {'test'} - - self.mox.ReplayAll() - results = passing_experimental.find_long_passing_tests(pass_times, - fail_times, - valid_tests) - - self.assertEqual(results, set([])) - - - def test_return_tests_that_have_recent_pass_but_not_recent_failure(self): - """Test returning tests that have recently passed but not failed.""" - passing_experimental._MIN_DAYS_SINCE_FAILURE = 10 - passing_experimental._MAX_DAYS_SINCE_LAST_PASS = 10 - datetime.datetime.today().AndReturn(self._datetime(2013, 3, 20)) - datetime.datetime.today().AndReturn(self._datetime(2013, 3, 20)) - - pass_times = {'test': self._datetime(2013, 3, 12)} - fail_times = {'test': self._datetime(2013, 3, 1)} - valid_tests = {'test'} - - self.mox.ReplayAll() - results = passing_experimental.find_long_passing_tests(pass_times, - fail_times, - valid_tests) - - self.assertEqual(results, set(['test'])) - - - def test_filter_out_tests_that_have_not_passed_recently(self): - """Test that tests that have not 
recently passed are not returned.""" - passing_experimental._MIN_DAYS_SINCE_FAILURE = 10 - passing_experimental._MAX_DAYS_SINCE_LAST_PASS = 10 - datetime.datetime.today().AndReturn(self._datetime(2013, 3, 20)) - datetime.datetime.today().AndReturn(self._datetime(2013, 3, 20)) - - pass_times = {'test': self._datetime(2013, 3, 1)} - fail_times = {'test': self._datetime(2013, 3, 1)} - valid_tests = {'test'} - - self.mox.ReplayAll() - results = passing_experimental.find_long_passing_tests(pass_times, - fail_times, - valid_tests) - - self.assertEqual(results, set([])) - - - def test_filter_out_tests_that_are_not_valid(self): - """Test that tests that are not valid are not returned.""" - passing_experimental._MIN_DAYS_SINCE_FAILURE = 10 - passing_experimental._MAX_DAYS_SINCE_LAST_PASS = 10 - datetime.datetime.today().AndReturn(self._datetime(2013, 3, 20)) - datetime.datetime.today().AndReturn(self._datetime(2013, 3, 20)) - - pass_times = {'test2': self._datetime(2013, 3, 1)} - fail_times = {'test2': self._datetime(2013, 3, 1)} - valid_tests = {'test'} - - self.mox.ReplayAll() - results = passing_experimental.find_long_passing_tests(pass_times, - fail_times, - valid_tests) - - self.assertEqual(results, set([])) - - - def test_return_tests_that_have_recently_passed_and_never_failed(self): - """Test that we can handle tests that have never failed.""" - passing_experimental._MIN_DAYS_SINCE_FAILURE = 10 - passing_experimental._MAX_DAYS_SINCE_LAST_PASS = 10 - datetime.datetime.today().AndReturn(self._datetime(2013, 3, 20)) - datetime.datetime.today().AndReturn(self._datetime(2013, 3, 20)) - - pass_times = {'test': self._datetime(2013, 3, 11)} - fail_times = {} - valid_tests = {'test'} - - self.mox.ReplayAll() - results = passing_experimental.find_long_passing_tests(pass_times, - fail_times, - valid_tests) - - self.assertEqual(results, set(['test'])) - - - def test_handle_tests_that_have_never_passed(self): - """Test that we can handle tests that have never passed.""" - passing_experimental._MIN_DAYS_SINCE_FAILURE = 10 - passing_experimental._MAX_DAYS_SINCE_LAST_PASS = 10 - datetime.datetime.today().AndReturn(self._datetime(2013, 3, 20)) - datetime.datetime.today().AndReturn(self._datetime(2013, 3, 20)) - - pass_times = {} - fail_times = {'test': self._datetime(2013, 3, 11)} - valid_tests = {'test'} - - self.mox.ReplayAll() - results = passing_experimental.find_long_passing_tests(pass_times, - fail_times, - valid_tests) - - self.assertEqual(results, set([])) - - -if __name__ == '__main__': - unittest.main() diff --git a/frontend/health/utils.py b/frontend/health/utils.py deleted file mode 100644 index 11186c953f..0000000000 --- a/frontend/health/utils.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -import common -from autotest_lib.frontend import setup_django_readonly_environment - -# Django and the models are only setup after -# the setup_django_readonly_environment module is imported. -from autotest_lib.frontend.tko import models as tko_models -from django.db import models as django_models - -_TEST_ERROR_STATUS = 'ERROR' -_TEST_ABORT_STATUS = 'ABORT' -_TEST_FAIL_STATUS = 'FAIL' -_TEST_WARN_STATUS = 'WARN' -_TEST_PASS_STATUS = 'GOOD' -_TEST_ALERT_STATUS = 'ALERT' - - -def get_last_pass_times(): - """ - Get all the tests that have passed and the time they last passed. 
- - @return the dict of test_name:last_finish_time pairs for tests that have - passed. - - """ - results = tko_models.Test.objects.values('test').filter( - status__word=_TEST_PASS_STATUS).annotate( - last_pass=django_models.Max('started_time')) - return {result['test']: result['last_pass'] for result in results} - - -def get_last_fail_times(): - """ - Get all the tests that have failed and the time they last failed. - - @return the dict of test_name:last_finish_time pairs for tests that have - failed. - - """ - - failure_clauses = (django_models.Q(status__word=_TEST_FAIL_STATUS) | - django_models.Q(status__word=_TEST_ERROR_STATUS) | - django_models.Q(status__word=_TEST_ABORT_STATUS) | - django_models.Q(status__word=_TEST_WARN_STATUS) | - django_models.Q(status__word=_TEST_ALERT_STATUS)) - - results = tko_models.Test.objects.values('test').filter( - failure_clauses).annotate( - last_pass=django_models.Max('started_time')) - - return {result['test']: result['last_pass'] for result in results} diff --git a/frontend/health/utils_unittest.py b/frontend/health/utils_unittest.py deleted file mode 100755 index 5f7620bb88..0000000000 --- a/frontend/health/utils_unittest.py +++ /dev/null @@ -1,287 +0,0 @@ -#!/usr/bin/python -# -# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -import datetime, unittest - -import mox - -import common -# This must come before the import of utils in order to use the in memory -# database. -from autotest_lib.frontend import setup_django_readonly_environment -from autotest_lib.frontend import setup_test_environment -from autotest_lib.frontend.health import utils -from autotest_lib.frontend.tko import models -from django import test - -ERROR_STATUS = models.Status(status_idx=2, word='ERROR') -ABORT_STATUS = models.Status(status_idx=3, word='ABORT') -FAIL_STATUS = models.Status(status_idx=4, word='FAIL') -WARN_STATUS = models.Status(status_idx=5, word='WARN') -GOOD_STATUS = models.Status(status_idx=6, word='GOOD') -ALERT_STATUS = models.Status(status_idx=7, word='ALERT') - - -def add_statuses(): - """ - Save the statuses to the in-memory database. - - These normally exist in the database and the code expects them. However, the - normal test database setup does not do this for us. - """ - ERROR_STATUS.save() - ABORT_STATUS.save() - FAIL_STATUS.save() - WARN_STATUS.save() - GOOD_STATUS.save() - ALERT_STATUS.save() - - -class GetLastPassTimesTests(mox.MoxTestBase, test.TestCase): - """Tests the get_last_pass_times function.""" - - def setUp(self): - super(GetLastPassTimesTests, self).setUp() - setup_test_environment.set_up() - add_statuses() - - - def tearDown(self): - setup_test_environment.tear_down() - super(GetLastPassTimesTests, self).tearDown() - - - def test_return_most_recent_pass(self): - """The last time a test passed should be returned.""" - # To add a test entry to the database, the test object has to - # be instantiated with various other model instances. We give these - # instances dummy id values. 
- job = models.Job(job_idx=1) - kernel = models.Kernel(kernel_idx=1) - machine = models.Machine(machine_idx=1) - - early_pass = models.Test(job=job, status=GOOD_STATUS, - kernel=kernel, machine=machine, - test='test', - started_time=datetime.datetime(2012, 1, 1)) - early_pass.save() - late_pass = models.Test(job=job, status=GOOD_STATUS, - kernel=kernel, machine=machine, - test='test', - started_time=datetime.datetime(2012, 1, 2)) - late_pass.save() - - results = utils.get_last_pass_times() - - self.assertEquals(results, {'test': datetime.datetime(2012, 1, 2)}) - - - def test_only_return_passing_tests(self): - """Tests that only tests that have passed at some point are returned.""" - job = models.Job(job_idx=1) - kernel = models.Kernel(kernel_idx=1) - machine = models.Machine(machine_idx=1) - - passing_test = models.Test(job=job, status=GOOD_STATUS, - kernel=kernel, machine=machine, - test='passing_test', - started_time=datetime.datetime(2012, 1, 1)) - passing_test.save() - failing_test = models.Test(job=job, status=FAIL_STATUS, - kernel=kernel, machine=machine, - test='failing_test', - started_time=datetime.datetime(2012, 1, 1)) - failing_test.save() - - results = utils.get_last_pass_times() - - self.assertEquals(results, - {'passing_test': datetime.datetime(2012, 1, 1)}) - - - def test_return_all_passing_tests(self): - """This function returns all tests that passed at least once.""" - job = models.Job(job_idx=1) - kernel = models.Kernel(kernel_idx=1) - machine = models.Machine(machine_idx=1) - - test1 = models.Test(job=job, status=GOOD_STATUS, - kernel=kernel, machine=machine, - test='test1', - started_time=datetime.datetime(2012, 1, 1)) - test1.save() - test2 = models.Test(job=job, status=GOOD_STATUS, - kernel=kernel, machine=machine, - test='test2', - started_time=datetime.datetime(2012, 1, 2)) - test2.save() - - results = utils.get_last_pass_times() - - self.assertEquals(results, {'test1': datetime.datetime(2012, 1, 1), - 'test2': datetime.datetime(2012, 1, 2)}) - - -class GetLastFailTimesTests(mox.MoxTestBase, test.TestCase): - """Tests the get_last_fail_times function.""" - - def setUp(self): - super(GetLastFailTimesTests, self).setUp() - setup_test_environment.set_up() - add_statuses() - - - def tearDown(self): - setup_test_environment.tear_down() - super(GetLastFailTimesTests, self).tearDown() - - - def test_return_most_recent_fail(self): - """The last time a test failed should be returned.""" - # To add a test entry to the database, the test object has to - # be instantiated with various other model instances. We give these - # instances dummy id values. 
- job = models.Job(job_idx=1) - kernel = models.Kernel(kernel_idx=1) - machine = models.Machine(machine_idx=1) - - early_fail = models.Test(job=job, status=FAIL_STATUS, - kernel=kernel, machine=machine, - test='test', - started_time=datetime.datetime(2012, 1, 1)) - early_fail.save() - late_fail = models.Test(job=job, status=FAIL_STATUS, - kernel=kernel, machine=machine, - test='test', - started_time=datetime.datetime(2012, 1, 2)) - late_fail.save() - - results = utils.get_last_fail_times() - - self.assertEquals(results, {'test': datetime.datetime(2012, 1, 2)}) - - - def test_does_not_return_passing_tests(self): - """Test that passing test entries are not included.""" - job = models.Job(job_idx=1) - kernel = models.Kernel(kernel_idx=1) - machine = models.Machine(machine_idx=1) - - passing_test = models.Test(job=job, status=GOOD_STATUS, - kernel=kernel, machine=machine, - test='passing_test', - started_time=datetime.datetime(2012, 1, 1)) - passing_test.save() - failing_test = models.Test(job=job, status=FAIL_STATUS, - kernel=kernel, machine=machine, - test='failing_test', - started_time=datetime.datetime(2012, 1, 1)) - failing_test.save() - - results = utils.get_last_fail_times() - - self.assertEquals(results, - {'failing_test': datetime.datetime(2012, 1, 1)}) - - - def test_return_all_failing_tests(self): - """This function returns all tests that failed at least once.""" - job = models.Job(job_idx=1) - kernel = models.Kernel(kernel_idx=1) - machine = models.Machine(machine_idx=1) - - test1 = models.Test(job=job, status=FAIL_STATUS, - kernel=kernel, machine=machine, - test='test1', - started_time=datetime.datetime(2012, 1, 1)) - test1.save() - test2 = models.Test(job=job, status=FAIL_STATUS, - kernel=kernel, machine=machine, - test='test2', - started_time=datetime.datetime(2012, 1, 2)) - test2.save() - - results = utils.get_last_fail_times() - - self.assertEquals(results, {'test1': datetime.datetime(2012, 1, 1), - 'test2': datetime.datetime(2012, 1, 2)}) - - - def test_returns_treats_error_status_as_failure(self): - """Error statuses should count as a failure.""" - job = models.Job(job_idx=1) - kernel = models.Kernel(kernel_idx=1) - machine = models.Machine(machine_idx=1) - - test = models.Test(job=job, status=ERROR_STATUS, - kernel=kernel, machine=machine, - test='error', - started_time=datetime.datetime(2012, 1, 1)) - test.save() - - results = utils.get_last_fail_times() - - self.assertEquals(results, {'error': datetime.datetime(2012, 1, 1)}) - - - def test_returns_treats_abort_status_as_failure(self): - """ - Abort statuses should count as failures. - - This should be changed once Abort only represents user caused aborts. - See issue crbug.com/188217. 
- """ - job = models.Job(job_idx=1) - kernel = models.Kernel(kernel_idx=1) - machine = models.Machine(machine_idx=1) - - test = models.Test(job=job, status=ABORT_STATUS, - kernel=kernel, machine=machine, - test='abort', - started_time=datetime.datetime(2012, 1, 1)) - test.save() - - results = utils.get_last_fail_times() - - self.assertEquals(results, {'abort': datetime.datetime(2012, 1, 1)}) - - - def test_returns_treats_warn_status_as_failure(self): - """Warn statuses should count as failures.""" - job = models.Job(job_idx=1) - kernel = models.Kernel(kernel_idx=1) - machine = models.Machine(machine_idx=1) - - test = models.Test(job=job, status=WARN_STATUS, - kernel=kernel, machine=machine, - test='warn', - started_time=datetime.datetime(2012, 1, 1)) - test.save() - - results = utils.get_last_fail_times() - - self.assertEquals(results, {'warn': datetime.datetime(2012, 1, 1)}) - - - def test_returns_treats_alert_status_as_failure(self): - """Alert statuses should count as failures.""" - job = models.Job(job_idx=1) - kernel = models.Kernel(kernel_idx=1) - machine = models.Machine(machine_idx=1) - - test = models.Test(job=job, status=ALERT_STATUS, - kernel=kernel, machine=machine, - test='alert', - started_time=datetime.datetime(2012, 1, 1)) - test.save() - - results = utils.get_last_fail_times() - - self.assertEquals(results, {'alert': datetime.datetime(2012, 1, 1)}) - - -if __name__ == '__main__': - unittest.main() diff --git a/global_config.ini b/global_config.ini index 9acdf2175d..5c0420cd06 100644 --- a/global_config.ini +++ b/global_config.ini @@ -356,12 +356,6 @@ sam_instances: cautotest test_instance: chromeos-autotest.cbf extra_servers: chromeos-mcp -# Settings for metadb server. -ES_HOST: -ES_PORT: 9200 -ES_UDP_PORT: 9700 -ES_USE_HTTP: False - skip_devserver_health_check: True # The swarming instance that will be used for golo proxy diff --git a/scheduler/monitor_db_cleanup.py b/scheduler/monitor_db_cleanup.py index 3935fa9167..1cb816960a 100644 --- a/scheduler/monitor_db_cleanup.py +++ b/scheduler/monitor_db_cleanup.py @@ -1,15 +1,12 @@ -""" -Autotest AFE Cleanup used by the scheduler -""" - +"""Autotest AFE Cleanup used by the scheduler""" +import contextlib import logging import random import time from autotest_lib.client.common_lib import utils from autotest_lib.frontend.afe import models -from autotest_lib.scheduler import email_manager from autotest_lib.scheduler import scheduler_config from autotest_lib.client.common_lib import global_config from autotest_lib.client.common_lib import host_protections @@ -20,6 +17,9 @@ except ImportError: metrics = utils.metrics_mock +_METRICS_PREFIX = 'chromeos/autotest/scheduler/cleanup' + + class PeriodicCleanup(object): """Base class to schedule periodical cleanup work. 
""" @@ -64,8 +64,7 @@ class UserCleanup(PeriodicCleanup): self._last_reverify_time = time.time() - @metrics.SecondsTimerDecorator( - 'chromeos/autotest/scheduler/cleanup/user/durations') + @metrics.SecondsTimerDecorator(_METRICS_PREFIX + '/user/durations') def _cleanup(self): logging.info('Running periodic cleanup') self._abort_timed_out_jobs() @@ -77,13 +76,19 @@ class UserCleanup(PeriodicCleanup): def _abort_timed_out_jobs(self): - msg = 'Aborting all jobs that have timed out and are not complete' - logging.info(msg) + logging.info( + 'Aborting all jobs that have timed out and are not complete') query = models.Job.objects.filter(hostqueueentry__complete=False).extra( where=['created_on + INTERVAL timeout_mins MINUTE < NOW()']) - for job in query.distinct(): - logging.warning('Aborting job %d due to job timeout', job.id) - job.abort() + jobs = query.distinct() + if not jobs: + return + + with _cleanup_warning_banner('timed out jobs', len(jobs)): + for job in jobs: + logging.warning('Aborting job %d due to job timeout', job.id) + job.abort() + _report_detected_errors('jobs_timed_out', len(jobs)) def _abort_jobs_past_max_runtime(self): @@ -99,9 +104,16 @@ class UserCleanup(PeriodicCleanup): """) query = models.HostQueueEntry.objects.filter( id__in=[row[0] for row in rows]) - for queue_entry in query.distinct(): - logging.warning('Aborting entry %s due to max runtime', queue_entry) - queue_entry.abort() + hqes = query.distinct() + if not hqes: + return + + with _cleanup_warning_banner('hqes past max runtime', len(hqes)): + for queue_entry in hqes: + logging.warning('Aborting entry %s due to max runtime', + queue_entry) + queue_entry.abort() + _report_detected_errors('hqes_past_max_runtime', len(hqes)) def _check_for_db_inconsistencies(self): @@ -109,34 +121,64 @@ class UserCleanup(PeriodicCleanup): self._check_all_invalid_related_objects() - def _check_invalid_related_objects_one_way(self, first_model, - relation_field, second_model): - if 'invalid' not in first_model.get_field_dict(): - return [] - invalid_objects = list(first_model.objects.filter(invalid=True)) - first_model.objects.populate_relationships(invalid_objects, - second_model, - 'related_objects') - error_lines = [] + def _check_invalid_related_objects_one_way(self, invalid_model, + relation_field, valid_model): + if 'invalid' not in invalid_model.get_field_dict(): + return + + invalid_objects = list(invalid_model.objects.filter(invalid=True)) + invalid_model.objects.populate_relationships( + invalid_objects, valid_model, 'related_objects') + if not invalid_objects: + return + + num_objects_with_invalid_relations = 0 + errors = [] for invalid_object in invalid_objects: if invalid_object.related_objects: - related_list = ', '.join(str(related_object) for related_object - in invalid_object.related_objects) - error_lines.append('Invalid %s %s is related to %ss: %s' - % (first_model.__name__, invalid_object, - second_model.__name__, related_list)) + related_objects = invalid_object.related_objects + related_list = ', '.join(str(x) for x in related_objects) + num_objects_with_invalid_relations += 1 + errors.append('Invalid %s is related to: %s' % + (invalid_object, related_list)) related_manager = getattr(invalid_object, relation_field) related_manager.clear() - return error_lines + + # Only log warnings after we're sure we've seen at least one invalid + # model with some valid relations to avoid empty banners from getting + # printed. 
+ if errors: + invalid_model_name = invalid_model.__name__ + valid_model_name = valid_model.__name__ + banner = 'invalid %s related to valid %s' % (invalid_model_name, + valid_model_name) + with _cleanup_warning_banner(banner, len(errors)): + for error in errors: + logging.warning(error) + _report_detected_errors( + 'invalid_related_objects', + num_objects_with_invalid_relations, + fields={'invalid_model': invalid_model_name, + 'valid_model': valid_model_name}) + _report_detected_errors( + 'invalid_related_objects_relations', + len(errors), + fields={'invalid_model': invalid_model_name, + 'valid_model': valid_model_name}) def _check_invalid_related_objects(self, first_model, first_field, second_model, second_field): - errors = self._check_invalid_related_objects_one_way( - first_model, first_field, second_model) - errors.extend(self._check_invalid_related_objects_one_way( - second_model, second_field, first_model)) - return errors + self._check_invalid_related_objects_one_way( + first_model, + first_field, + second_model, + ) + self._check_invalid_related_objects_one_way( + second_model, + second_field, + first_model, + ) def _check_all_invalid_related_objects(self): @@ -145,20 +187,17 @@ class UserCleanup(PeriodicCleanup): (models.AclGroup, 'users', models.User, 'aclgroup_set'), (models.Test, 'dependency_labels', models.Label, 'test_set')) - errors = [] for first_model, first_field, second_model, second_field in model_pairs: - errors.extend(self._check_invalid_related_objects( - first_model, first_field, second_model, second_field)) + self._check_invalid_related_objects( + first_model, + first_field, + second_model, + second_field, + ) - if errors: - m = 'chromeos/autotest/scheduler/cleanup/invalid_models_cleaned' - metrics.Counter(m).increment_by(len(errors)) - logging.warn('Cleaned invalid models due to errors: %s' - % ('\n'.join(errors))) def _clear_inactive_blocks(self): - msg = 'Clear out blocks for all completed jobs.' 
- logging.info(msg) + logging.info('Clear out blocks for all completed jobs.') # this would be simpler using NOT IN (subquery), but MySQL # treats all IN subqueries as dependent, so this optimizes much # better @@ -203,8 +242,13 @@ class UserCleanup(PeriodicCleanup): hosts = list(hosts) total_hosts = len(hosts) hosts = self._choose_subset_of_hosts_to_reverify(hosts) - logging.info('Reverifying dead hosts (%d of %d) %s', len(hosts), - total_hosts, ', '.join(host.hostname for host in hosts)) + logging.info('Reverifying dead hosts (%d of %d)', len(hosts), + total_hosts) + with _cleanup_warning_banner('reverify dead hosts', len(hosts)): + for host in hosts: + logging.warning(host.hostname) + _report_detected_errors('dead_hosts_triggered_reverify', len(hosts)) + _report_detected_errors('dead_hosts_require_reverify', total_hosts) for host in hosts: models.SpecialTask.schedule_special_task( host=host, task=models.SpecialTask.Task.VERIFY) @@ -240,8 +284,7 @@ class TwentyFourHourUpkeep(PeriodicCleanup): db, clean_interval_minutes, run_at_initialize=run_at_initialize) - @metrics.SecondsTimerDecorator( - 'chromeos/autotest/scheduler/cleanup/daily/durations') + @metrics.SecondsTimerDecorator(_METRICS_PREFIX + '/daily/durations') def _cleanup(self): logging.info('Running 24 hour clean up') self._check_for_uncleanable_db_inconsistencies() @@ -257,19 +300,26 @@ class TwentyFourHourUpkeep(PeriodicCleanup): def _check_for_active_and_complete_queue_entries(self): query = models.HostQueueEntry.objects.filter(active=True, complete=True) - if query.count() != 0: - subject = ('%d queue entries found with active=complete=1' - % query.count()) - lines = [] + num_bad_hqes = query.count() + if num_bad_hqes == 0: + return + + num_aborted = 0 + logging.warning('%d queue entries found with active=complete=1', + num_bad_hqes) + with _cleanup_warning_banner('active and complete hqes', num_bad_hqes): for entry in query: - lines.append(str(entry.get_object_dict())) if entry.status == 'Aborted': - logging.error('Aborted entry: %s is both active and ' - 'complete. 
Setting active value to False.',
-                                  str(entry))
                     entry.active = False
                     entry.save()
-        self._send_inconsistency_message(subject, lines)
+                    recovery_path = 'was also aborted, set active to False'
+                    num_aborted += 1
+                else:
+                    recovery_path = 'can not recover'
+                logging.warning('%s (recovery: %s)', entry.get_object_dict(),
+                                recovery_path)
+        _report_detected_errors('hqes_active_and_complete', num_bad_hqes)
+        _report_detected_errors('hqes_aborted_set_to_inactive', num_aborted)
 
 
     def _check_for_multiple_platform_hosts(self):
@@ -284,11 +334,14 @@ class TwentyFourHourUpkeep(PeriodicCleanup):
                 GROUP BY afe_hosts.id
                 HAVING platform_count > 1
                 ORDER BY hostname""")
+
         if rows:
-            subject = '%s hosts with multiple platforms' % self._db.rowcount
-            lines = [' '.join(str(item) for item in row)
-                     for row in rows]
-            self._send_inconsistency_message(subject, lines)
+            logging.warning('Cleanup found hosts with multiple platforms')
+            with _cleanup_warning_banner('hosts with multiple platforms',
+                                         len(rows)):
+                for row in rows:
+                    logging.warning(' '.join(str(item) for item in row))
+            _report_detected_errors('hosts_with_multiple_platforms', len(rows))
 
 
     def _check_for_no_platform_hosts(self):
@@ -301,16 +354,10 @@ class TwentyFourHourUpkeep(PeriodicCleanup):
                           WHERE platform)
             WHERE NOT afe_hosts.invalid AND afe_hosts_labels.host_id IS NULL""")
         if rows:
-            logging.warning('%s hosts with no platform\n%s', self._db.rowcount,
-                            ', '.join(row[0] for row in rows))
-
-
-    def _send_inconsistency_message(self, subject, lines):
-        logging.error(subject)
-        message = '\n'.join(lines)
-        if len(message) > 5000:
-            message = message[:5000] + '\n(truncated)\n'
-        email_manager.manager.enqueue_notify_email(subject, message)
+            with _cleanup_warning_banner('hosts with no platform', len(rows)):
+                for row in rows:
+                    logging.warning(row[0])
+            _report_detected_errors('hosts_with_no_platform', len(rows))
 
 
     def _cleanup_orphaned_containers(self):
@@ -324,7 +371,46 @@ class TwentyFourHourUpkeep(PeriodicCleanup):
         ssp_enabled = global_config.global_config.get_config_value(
                 'AUTOSERV', 'enable_ssp_container')
         if not ssp_enabled:
-            logging.info('Server-side packaging is not enabled, no need to clean'
-                         ' up orphaned containers.')
+            logging.info(
+                    'Server-side packaging is not enabled, no need to clean '
+                    'up orphaned containers.')
             return
         self.drone_manager.cleanup_orphaned_containers()
+
+
+def _report_recovered_errors(metric_name, count, fields={}):
+    """Reports a counter metric for errors recovered during cleanup.
+
+    @param metric_name: Name of the metric to report about.
+    @param count: How many "errors" were fixed this cycle.
+    @param fields: Optional fields to include with the metric.
+    """
+    m = '%s/errors_recovered/%s' % (_METRICS_PREFIX, metric_name)
+    metrics.Counter(m).increment_by(count, fields=fields)
+
+
+def _report_detected_errors(metric_name, gauge, fields={}):
+    """Reports a gauge metric for errors detected during cleanup.
+
+    @param metric_name: Name of the metric to report about.
+    @param gauge: Outstanding number of unrecoverable errors of this type.
+    @param fields: Optional fields to include with the metric.
+    """
+    m = '%s/errors_detected/%s' % (_METRICS_PREFIX, metric_name)
+    metrics.Gauge(m).set(gauge, fields=fields)
+
+
+@contextlib.contextmanager
+def _cleanup_warning_banner(banner, error_count=None):
+    """Put a clear context in the logs around a list of errors.
+
+    @param banner: The identifying header to print for context.
+    @param error_count: If not None, the number of errors detected.
+    """
+    if error_count is not None:
+        banner += ' (total: %d)' % error_count
+    logging.warning('#### START: %s ####', banner)
+    try:
+        yield
+    finally:
+        logging.warning('#### END: %s ####', banner)
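The `_cleanup_warning_banner` context manager above is what replaces the old email alerts with grouped log output: because the closing line is emitted from a `finally` block, the `#### END ####` marker appears even if the loop inside the `with` block raises. A minimal standalone sketch of the same pattern, with illustrative names and sample data that are not part of this change:

    import contextlib
    import logging

    @contextlib.contextmanager
    def warning_banner(banner, error_count=None):
        """Bracket a batch of warning lines with START/END markers."""
        if error_count is not None:
            banner += ' (total: %d)' % error_count
        logging.warning('#### START: %s ####', banner)
        try:
            yield
        finally:
            # Runs on both normal exit and exception, so the banner always
            # closes and related log lines stay grouped together.
            logging.warning('#### END: %s ####', banner)

    # Hypothetical usage mirroring the cleanup loops above.
    stale_jobs = ['job-11', 'job-42']
    with warning_banner('timed out jobs', len(stale_jobs)):
        for job in stale_jobs:
            logging.warning('Aborting %s due to job timeout', job)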
+ """ + if error_count is not None: + banner += ' (total: %d)' % error_count + logging.warning('#### START: %s ####', banner) + try: + yield + finally: + logging.warning('#### END: %s ####', banner) diff --git a/scheduler/rdb_utils.py b/scheduler/rdb_utils.py index 342634ac5b..7640312bf6 100644 --- a/scheduler/rdb_utils.py +++ b/scheduler/rdb_utils.py @@ -169,9 +169,11 @@ class RequestAccountant(object): return self.requests_to_counts[host_request] + # TODO(akeshet): Possibly this code is dead, see crbug.com/738508 for + # context. def record_acquire_min_duts(cls, host_request, hosts_required, acquired_host_count): - """Send stats to graphite about host acquisition. + """Send stats about host acquisition. @param host_request: A request. @param hosts_required: Number of hosts required to satisfy request. diff --git a/scheduler/scheduler_models.py b/scheduler/scheduler_models.py index 681a076dab..866cbb7b32 100644 --- a/scheduler/scheduler_models.py +++ b/scheduler/scheduler_models.py @@ -28,7 +28,6 @@ import weakref from autotest_lib.client.common_lib import global_config, host_protections from autotest_lib.client.common_lib import time_utils from autotest_lib.client.common_lib import utils -from autotest_lib.client.common_lib.cros.graphite import autotest_es from autotest_lib.frontend.afe import models, model_attributes from autotest_lib.scheduler import drone_manager, email_manager from autotest_lib.scheduler import rdb_lib @@ -631,27 +630,6 @@ class HostQueueEntry(DBObject): flags_str)) - def record_state(self, type_str, state, value): - """Record metadata in elasticsearch. - - If ES configured to use http, then we will time that http request. - Otherwise, it uses UDP, so we will not need to time it. - - @param type_str: sets the _type field in elasticsearch db. - @param state: string representing what state we are recording, - e.g. 'status' - @param value: value of the state, e.g. 'verifying' - """ - metadata = { - 'time_changed': time.time(), - state: value, - 'job_id': self.job_id, - } - if self.host: - metadata['hostname'] = self.host.hostname - autotest_es.post(type_str=type_str, metadata=metadata) - - def set_status(self, status): logging.info("%s -> %s", self, status) @@ -684,7 +662,6 @@ class HostQueueEntry(DBObject): if should_email_status: self._email_on_status(status) logging.debug('HQE Set Status Complete') - self.record_state('hqe_status', 'status', status) def _on_complete(self, status): diff --git a/server/autoserv b/server/autoserv index 90b899cf73..288ef50484 100755 --- a/server/autoserv +++ b/server/autoserv @@ -25,7 +25,6 @@ from autotest_lib.client.common_lib import control_data from autotest_lib.client.common_lib import error from autotest_lib.client.common_lib import global_config from autotest_lib.client.common_lib import utils -from autotest_lib.client.common_lib.cros.graphite import autotest_es try: from chromite.lib import metrics @@ -225,13 +224,6 @@ def _run_with_ssp(job, container_name, job_id, results, parser, ssp_url, metrics.Counter( 'chromeos/autotest/experimental/execute_job_in_ssp').increment( fields={'success': success}) - # metadata is uploaded separately so it can use http to upload. 
- metadata = {'drone': socket.gethostname(), - 'job_id': job_id, - 'success': success} - autotest_es.post(use_http=True, - type_str=lxc.CONTAINER_RUN_TEST_METADB_TYPE, - metadata=metadata) test_container.destroy() @@ -250,16 +242,8 @@ def correct_results_folder_permission(results): if not results: return - try: - utils.run('sudo -n chown -R %s "%s"' % (os.getuid(), results)) - utils.run('sudo -n chgrp -R %s "%s"' % (os.getgid(), results)) - except error.CmdError as e: - metadata = {'error': str(e), - 'result_folder': results, - 'drone': socket.gethostname()} - autotest_es.post(use_http=True, type_str='correct_results_folder_failure', - metadata=metadata) - raise + utils.run('sudo -n chown -R %s "%s"' % (os.getuid(), results)) + utils.run('sudo -n chgrp -R %s "%s"' % (os.getgid(), results)) def _start_servod(machine): @@ -364,29 +348,17 @@ def run_autoserv(pid_file_manager, results, parser, ssp_url, use_ssp): if use_ssp: logging.debug('Destroy container %s before aborting the autoserv ' 'process.', container_name) - metadata = {'drone': socket.gethostname(), - 'job_id': job_or_task_id, - 'container_name': container_name, - 'action': 'abort', - 'success': True} try: bucket = lxc.ContainerBucket() container = bucket.get(container_name) if container: container.destroy() else: - metadata['success'] = False - metadata['error'] = 'container not found' logging.debug('Container %s is not found.', container_name) except: - metadata['success'] = False - metadata['error'] = 'Exception: %s' % str(sys.exc_info()) # Handle any exception so the autoserv process can be aborted. logging.exception('Failed to destroy container %s.', container_name) - autotest_es.post(use_http=True, - type_str=lxc.CONTAINER_RUN_TEST_METADB_TYPE, - metadata=metadata) # Try to correct the result file permission again after the # container is destroyed, as the container might have created some # new files in the result folder. @@ -558,6 +530,10 @@ def run_autoserv(pid_file_manager, results, parser, ssp_url, use_ssp): finally: job.close() + # Special task doesn't run parse, so result summary needs to be + # built here. 
+        if results and (repair or verify or reset or cleanup or provision):
+            site_utils.collect_result_sizes(results)
     except:
         exit_code = 1
         traceback.print_exc()
diff --git a/server/base_utils.py b/server/base_utils.py
index 255d3556a0..8b64300c6b 100644
--- a/server/base_utils.py
+++ b/server/base_utils.py
@@ -188,20 +188,6 @@ def find_pid(command):
     return None
 
 
-def nohup(command, stdout='/dev/null', stderr='/dev/null', background=True,
-          env = {}):
-    cmd = ' '.join(key+'='+val for key, val in env.iteritems())
-    cmd += ' nohup ' + command
-    cmd += ' > %s' % stdout
-    if stdout == stderr:
-        cmd += ' 2>&1'
-    else:
-        cmd += ' 2> %s' % stderr
-    if background:
-        cmd += ' &'
-    utils.system(cmd)
-
-
 def default_mappings(machines):
     """
     Returns a simple mapping in which all machines are assigned to the
diff --git a/server/cros/dynamic_suite/constants.py b/server/cros/dynamic_suite/constants.py
index 8bf35af039..750af5085e 100644
--- a/server/cros/dynamic_suite/constants.py
+++ b/server/cros/dynamic_suite/constants.py
@@ -28,6 +28,7 @@ FWRO_BUILD = 'fwro_build'
 JOB_REPO_URL = 'job_repo_url'
 VERSION_PREFIX = 'cros-version:'
 BOARD_PREFIX = 'board:'
+MODEL_LABEL = 'model'
 OS_PREFIX = 'os'
 
 # Bug filing
diff --git a/server/cros/dynamic_suite/job_status.py b/server/cros/dynamic_suite/job_status.py
index 9dde058af7..e7ab82b1c7 100644
--- a/server/cros/dynamic_suite/job_status.py
+++ b/server/cros/dynamic_suite/job_status.py
@@ -115,6 +115,59 @@ def _status_for_test(status):
             status.test_name.startswith('CLIENT_JOB'))
 
 
+class _JobResultWaiter(object):
+    """Class for waiting on job results."""
+
+    def __init__(self, afe, tko):
+        """Initialize the waiter.
+
+        @param afe: an instance of AFE as defined in server/frontend.py.
+        @param tko: an instance of TKO as defined in server/frontend.py.
+        """
+        self._afe = afe
+        self._tko = tko
+        self._job_ids = set()
+
+    def add_job(self, job):
+        """Add a job to wait on.
+
+        @param job: Job object to get results from, as defined in
+                    server/frontend.py
+        """
+        self.add_jobs((job,))
+
+    def add_jobs(self, jobs):
+        """Add jobs to wait on.
+
+        @param jobs: Iterable of Job objects to get results from, as defined
+                     in server/frontend.py
+        """
+        self._job_ids.update(job.id for job in jobs)
+
+    def wait_for_results(self):
+        """Wait for jobs to finish and yield their results.
+
+        The returned generator blocks until all added jobs have
+        finished.
+
+        @yields an iterator of Statuses, one per test.
+        """
+        while self._job_ids:
+            for job in self._get_finished_jobs():
+                for result in _yield_job_results(self._afe, self._tko, job):
+                    yield result
+                self._job_ids.remove(job.id)
+            self._sleep()
+
+    def _get_finished_jobs(self):
+        # This is an RPC call which serializes to JSON, so we can't pass
+        # in sets.
+        return self._afe.get_jobs(id__in=list(self._job_ids), finished=True)
+
+    def _sleep(self):
+        time.sleep(_DEFAULT_POLL_INTERVAL_SECONDS * (random.random() + 0.5))
+
+
 def _yield_job_results(afe, tko, job):
     """
     Yields the results of an individual job.
@@ -185,31 +238,14 @@ def wait_for_child_results(afe, tko, parent_job_id):
     @param parent_job_id: Parent job id for the jobs to wait on.
 
     @yields an iterator of Statuses, one per test.
     """
-    remaining_child_jobs = set(job.id for job in
-                               afe.get_jobs(parent_job_id=parent_job_id))
-    while remaining_child_jobs:
-        new_finished_jobs = afe.get_jobs(id__in=list(remaining_child_jobs),
-                                         finished=True)
-
-        for job in new_finished_jobs:
-
-            remaining_child_jobs.remove(job.id)
-            for result in _yield_job_results(afe, tko, job):
-                # To figure out what new jobs (like retry jobs) have been
-                # created since last iteration, we could re-poll for
-                # the set of child jobs in each iteration and
-                # calculate the set difference against the set we got in
-                # last iteration. As an alternative, we could just make
-                # the caller 'send' new jobs to this generator. We go
-                # with the latter to avoid unnecessary overhead.
-                new_child_jobs = (yield result)
-                if new_child_jobs:
-                    remaining_child_jobs.update([new_job.id for new_job in
-                                                 new_child_jobs])
-                    # Return nothing if 'send' is called
-                    yield None
-
-        time.sleep(_DEFAULT_POLL_INTERVAL_SECONDS * (random.random() + 0.5))
+    waiter = _JobResultWaiter(afe, tko)
+    waiter.add_jobs(afe.get_jobs(parent_job_id=parent_job_id))
+    for result in waiter.wait_for_results():
+        new_jobs = (yield result)
+        if new_jobs:
+            waiter.add_jobs(new_jobs)
+            # Return nothing if 'send' is called
+            yield None
 
 
 def wait_for_results(afe, tko, jobs):
@@ -225,23 +261,14 @@ def wait_for_results(afe, tko, jobs):
     @param jobs: a list of Job objects, as defined in server/frontend.py.
 
     @yields an iterator of Statuses, one per test.
     """
-    local_jobs = list(jobs)
-    while local_jobs:
-        for job in list(local_jobs):
-            if not afe.get_jobs(id=job.id, finished=True):
-                continue
-
-            local_jobs.remove(job)
-            for result in _yield_job_results(afe, tko, job):
-                # The caller can 'send' new jobs (i.e. retry jobs)
-                # to this generator at any time.
-                new_jobs = (yield result)
-                if new_jobs:
-                    local_jobs.extend(new_jobs)
-                    # Return nothing if 'send' is called
-                    yield None
-
-        time.sleep(_DEFAULT_POLL_INTERVAL_SECONDS * (random.random() + 0.5))
+    waiter = _JobResultWaiter(afe, tko)
+    waiter.add_jobs(jobs)
+    for result in waiter.wait_for_results():
+        new_jobs = (yield result)
+        if new_jobs:
+            waiter.add_jobs(new_jobs)
+            # Return nothing if 'send' is called
+            yield None
 
 
 class Status(object):
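`_JobResultWaiter` consolidates the polling loops that `wait_for_results()` and `wait_for_child_results()` previously duplicated, and both wrappers keep the caller-facing generator contract: a caller may `send()` newly created retry jobs into the running generator, which answers the `send()` with a bare `None` so the reply is never mistaken for a result. A toy model of that contract (the job names here are illustrative, not part of the patch):

    def result_generator(pending):
        """Yield results; accept extra work via send(), replying None."""
        while pending:
            new_jobs = (yield pending.pop(0))
            if new_jobs:
                pending.extend(new_jobs)
                # Mirrors the 'yield None' in the real code: send() must
                # get a reply, and the reply must not look like a result.
                yield None

    gen = result_generator(['good-job', 'flaky-job'])
    for result in gen:
        print('got result: %s' % result)
        if result == 'flaky-job':
            # The caller pushes a retry into the running generator.
            gen.send(['retry-of-flaky-job'])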
""" - remaining_child_jobs = set(job.id for job in - afe.get_jobs(parent_job_id=parent_job_id)) - while remaining_child_jobs: - new_finished_jobs = afe.get_jobs(id__in=list(remaining_child_jobs), - finished=True) - - for job in new_finished_jobs: - - remaining_child_jobs.remove(job.id) - for result in _yield_job_results(afe, tko, job): - # To figure out what new jobs (like retry jobs) have been - # created since last iteration, we could re-poll for - # the set of child jobs in each iteration and - # calculate the set difference against the set we got in - # last iteration. As an alternative, we could just make - # the caller 'send' new jobs to this generator. We go - # with the latter to avoid unnecessary overhead. - new_child_jobs = (yield result) - if new_child_jobs: - remaining_child_jobs.update([new_job.id for new_job in - new_child_jobs]) - # Return nothing if 'send' is called - yield None - - time.sleep(_DEFAULT_POLL_INTERVAL_SECONDS * (random.random() + 0.5)) + waiter = _JobResultWaiter(afe, tko) + waiter.add_jobs(afe.get_jobs(parent_job_id=parent_job_id)) + for result in waiter.wait_for_results(): + new_jobs = (yield result) + if new_jobs: + waiter.add_jobs(new_jobs) + # Return nothing if 'send' is called + yield None def wait_for_results(afe, tko, jobs): @@ -225,23 +261,14 @@ def wait_for_results(afe, tko, jobs): @param jobs: a list of Job objects, as defined in server/frontend.py. @yields an iterator of Statuses, one per test. """ - local_jobs = list(jobs) - while local_jobs: - for job in list(local_jobs): - if not afe.get_jobs(id=job.id, finished=True): - continue - - local_jobs.remove(job) - for result in _yield_job_results(afe, tko, job): - # The caller can 'send' new jobs (i.e. retry jobs) - # to this generator at any time. - new_jobs = (yield result) - if new_jobs: - local_jobs.extend(new_jobs) - # Return nothing if 'send' is called - yield None - - time.sleep(_DEFAULT_POLL_INTERVAL_SECONDS * (random.random() + 0.5)) + waiter = _JobResultWaiter(afe, tko) + waiter.add_jobs(jobs) + for result in waiter.wait_for_results(): + new_jobs = (yield result) + if new_jobs: + waiter.add_jobs(new_jobs) + # Return nothing if 'send' is called + yield None class Status(object): diff --git a/server/cros/dynamic_suite/job_status_unittest.py b/server/cros/dynamic_suite/job_status_unittest.py index c70325bcaf..3dea51d99f 100755 --- a/server/cros/dynamic_suite/job_status_unittest.py +++ b/server/cros/dynamic_suite/job_status_unittest.py @@ -44,11 +44,6 @@ class StatusTest(mox.MoxTestBase): shutil.rmtree(self.tmpdir, ignore_errors=True) - def expect_result_gathering(self, job): - self.afe.get_jobs(id=job.id, finished=True).AndReturn(job) - self.expect_yield_job_entries(job) - - def expect_yield_job_entries(self, job): entries = [s.entry for s in job.statuses] self.afe.run('get_host_queue_entries', @@ -68,7 +63,6 @@ class StatusTest(mox.MoxTestBase): FakeJob(3, [FakeStatus('FAIL', 'T0', 'broken')]), FakeJob(4, [FakeStatus('ERROR', 'SERVER_JOB', 'server error'), FakeStatus('GOOD', 'T0', '')]),] - # TODO: Write a better test for the case where we yield # results for aborts vs cannot yield results because of # a premature abort. Currently almost all client aborts @@ -78,21 +72,23 @@ class StatusTest(mox.MoxTestBase): # FakeJob(5, [FakeStatus('ERROR', 'T0', 'gah', True)]), # The next job shouldn't be recorded in the results. 
# FakeJob(6, [FakeStatus('GOOD', 'SERVER_JOB', '')])] - for status in jobs[4].statuses: status.entry['job'] = {'name': 'broken_infra_job'} - # To simulate a job that isn't ready the first time we check. - self.afe.get_jobs(id=jobs[0].id, finished=True).AndReturn([]) - # Expect all the rest of the jobs to be good to go the first time. - for job in jobs[1:]: - self.expect_result_gathering(job) - # Then, expect job[0] to be ready. - self.expect_result_gathering(jobs[0]) - # Expect us to poll twice. + job_id_set = set([job.id for job in jobs]) + yield_values = [ + [jobs[1]], + [jobs[0], jobs[2]], + jobs[3:6] + ] self.mox.StubOutWithMock(time, 'sleep') - time.sleep(mox.IgnoreArg()) - time.sleep(mox.IgnoreArg()) + for yield_this in yield_values: + self.afe.get_jobs(id__in=list(job_id_set), + finished=True).AndReturn(yield_this) + for job in yield_this: + self.expect_yield_job_entries(job) + job_id_set.remove(job.id) + time.sleep(mox.IgnoreArg()) self.mox.ReplayAll() results = [result for result in job_status.wait_for_results(self.afe, @@ -119,7 +115,6 @@ class StatusTest(mox.MoxTestBase): FakeJob(4, [FakeStatus('ERROR', 'SERVER_JOB', 'server error'), FakeStatus('GOOD', 'T0', '')], parent_job_id=parent_job_id),] - # TODO: Write a better test for the case where we yield # results for aborts vs cannot yield results because of # a premature abort. Currently almost all client aborts diff --git a/server/hosts/adb_host.py b/server/hosts/adb_host.py index f51de5a826..9260e048f9 100644 --- a/server/hosts/adb_host.py +++ b/server/hosts/adb_host.py @@ -21,7 +21,6 @@ from autotest_lib.client.common_lib import error from autotest_lib.client.common_lib import global_config from autotest_lib.client.common_lib.cros import dev_server from autotest_lib.client.common_lib.cros import retry -from autotest_lib.server import afe_utils from autotest_lib.server import autoserv_parser from autotest_lib.server import constants as server_constants from autotest_lib.server import utils @@ -31,6 +30,7 @@ from autotest_lib.server.cros.dynamic_suite import constants from autotest_lib.server.hosts import abstract_ssh from autotest_lib.server.hosts import adb_label from autotest_lib.server.hosts import base_label +from autotest_lib.server.hosts import host_info from autotest_lib.server.hosts import teststation_host @@ -1685,22 +1685,21 @@ class ADBHost(abstract_ssh.AbstractSSHHost): teststation_temp_dir = self.teststation.get_tmp_dir() try: - job_repo_url = afe_utils.get_host_attribute( - self, self.job_repo_url_attribute) - except error.AutoservError: + info = self.host_info_store.get() + except host_info.StoreError: logging.warning( 'Device %s could not get repo url for build info.', self.adb_serial) return + job_repo_url = info.attributes.get(self.job_repo_url_attribute, '') if not job_repo_url: logging.warning( 'Device %s could not get repo url for build info.', self.adb_serial) return - build_info = ADBHost.get_build_info_from_build_url( - job_repo_url) + build_info = ADBHost.get_build_info_from_build_url(job_repo_url) target = build_info['target'] branch = build_info['branch'] diff --git a/server/hosts/cros_host.py b/server/hosts/cros_host.py index edbb5eaf5d..2d9ccd6b83 100644 --- a/server/hosts/cros_host.py +++ b/server/hosts/cros_host.py @@ -17,7 +17,6 @@ from autotest_lib.client.common_lib import hosts from autotest_lib.client.common_lib import lsbrelease_utils from autotest_lib.client.common_lib.cros import autoupdater from autotest_lib.client.common_lib.cros import dev_server -from 
autotest_lib.client.common_lib.cros.graphite import autotest_es
 from autotest_lib.client.cros import constants as client_constants
 from autotest_lib.client.cros import cros_ui
 from autotest_lib.client.cros.audio import cras_utils
@@ -1318,9 +1317,6 @@ class CrosHost(abstract_ssh.AbstractSSHHost):
         except rpm_client.RemotePowerException:
             logging.error('Failed to turn Power On for this host after '
                           'cleanup through the RPM Infrastructure.')
-            autotest_es.post(
-                    type_str='RPM_poweron_failure',
-                    metadata={'hostname': self.hostname})
 
         battery_percentage = self.get_battery_percentage()
         if battery_percentage and battery_percentage < 50:
@@ -1427,9 +1423,6 @@ class CrosHost(abstract_ssh.AbstractSSHHost):
                 label.remove_hosts(hosts=host_list)
                 mismatch_found = True
         if mismatch_found:
-            autotest_es.post(use_http=True,
-                             type_str='cros_version_label_mismatch',
-                             metadata={'hostname': self.hostname})
             raise error.AutoservError('The host has wrong cros-version label.')
 
 
diff --git a/server/hosts/cros_label.py b/server/hosts/cros_label.py
index c7a85df9a9..ef52f39d13 100644
--- a/server/hosts/cros_label.py
+++ b/server/hosts/cros_label.py
@@ -47,6 +47,27 @@ class BoardLabel(base_label.StringPrefixLabel):
         return [release_info['CHROMEOS_RELEASE_BOARD']]
 
 
+class ModelLabel(base_label.StringPrefixLabel):
+    """Determine the correct model label for the device."""
+
+    _NAME = ds_constants.MODEL_LABEL
+
+    def generate_labels(self, host):
+        # Return the existing label if set to defend against any bad image
+        # pushes to the host. See comment in BoardLabel for more details.
+        for label in host._afe_host.labels:
+            if label.startswith(self._NAME + ':'):
+                return [label.split(':')[-1]]
+
+        cmd = "mosys platform model"
+        result = host.run(command=cmd, ignore_status=True)
+        if result.exit_status == 0:
+            return [result.stdout.strip()]
+        else:
+            logging.info("%s exited with status %d", cmd, result.exit_status)
+        return []
+
+
 class LightSensorLabel(base_label.BaseLabel):
     """Label indicating if a light sensor is detected."""
 
diff --git a/server/hosts/servo_host.py b/server/hosts/servo_host.py
index c488551ba8..09e23c4306 100644
--- a/server/hosts/servo_host.py
+++ b/server/hosts/servo_host.py
@@ -12,7 +12,6 @@
 import httplib
 import logging
 import socket
-import traceback
 import xmlrpclib
 
 from autotest_lib.client.bin import utils
@@ -25,7 +24,6 @@ from autotest_lib.client.common_lib import lsbrelease_utils
 from autotest_lib.client.common_lib.cros import autoupdater
 from autotest_lib.client.common_lib.cros import dev_server
 from autotest_lib.client.common_lib.cros import retry
-from autotest_lib.client.common_lib.cros.graphite import autotest_es
 from autotest_lib.client.common_lib.cros.network import ping_runner
 from autotest_lib.client.cros import constants as client_constants
 from autotest_lib.server import afe_utils
@@ -429,15 +427,6 @@ class ServoHost(ssh_host.SSHHost):
             # Sometimes creating the job will raise an exception. We'll log it
             # but we don't want to fail because of it.
             logging.exception('Scheduling reboot job failed due to Exception.')
-            metadata = {'dut': dut,
-                        'servo_host': self.hostname,
-                        'error': str(e),
-                        'details': traceback.format_exc()}
-            # We want to track how often we fail here so we can justify
-            # investing some effort into hardening up afe.create_job().
- autotest_es.post(use_http=True, - type_str='servohost_Reboot_schedule_fail', - metadata=metadata) def reboot(self, *args, **dargs): diff --git a/server/site_tests/android_ACTS/android_ACTS.py b/server/site_tests/android_ACTS/android_ACTS.py index 9f6b7549cb..7beb45ca5b 100644 --- a/server/site_tests/android_ACTS/android_ACTS.py +++ b/server/site_tests/android_ACTS/android_ACTS.py @@ -10,9 +10,9 @@ import common from autotest_lib.client.common_lib import error from autotest_lib.client.common_lib import global_config from autotest_lib.client.common_lib.cros import dev_server -from autotest_lib.server import afe_utils from autotest_lib.server import test from autotest_lib.server.hosts import adb_host +from autotest_lib.server.hosts import host_info from autotest_lib.site_utils import acts_lib from server.cros import dnsname_mangler @@ -140,11 +140,12 @@ class android_ACTS(test.test): if valid_job_urls_only: for v in testbed.get_adb_devices().values(): try: - afe_utils.get_host_attribute(v, v.job_repo_url_attribute) - except error.AutoservError: + info = v.host_info_store.get() + except host_info.StoreError: pass else: - valid_hosts.append(v) + if v.job_repo_url_attribute in info.attributes: + valid_hosts.append(v) else: valid_hosts = list(testbed.get_adb_devices().values()) diff --git a/server/site_tests/android_ACTS/control.BluetoothPowerTest b/server/site_tests/android_ACTS/control.BluetoothPowerTest index e641a01ab7..b63155dc87 100644 --- a/server/site_tests/android_ACTS/control.BluetoothPowerTest +++ b/server/site_tests/android_ACTS/control.BluetoothPowerTest @@ -2,10 +2,12 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. -AUTHOR = 'jimtan' +AUTHOR = 'ajanti' NAME = 'android_ACTS.BluetoothPowerTest' TIME = 'LONG' TEST_TYPE = 'Server' +ATTRIBUTES = 'suite:android_bluetooth' +DEPENDENCIES = 'bluetooth_power' DOC = """ This test runs Bluetooth Power tests diff --git a/server/site_tests/android_EasySetup/android_EasySetup.py b/server/site_tests/android_EasySetup/android_EasySetup.py index dda3b421b2..0ba44bb461 100644 --- a/server/site_tests/android_EasySetup/android_EasySetup.py +++ b/server/site_tests/android_EasySetup/android_EasySetup.py @@ -6,9 +6,8 @@ import logging import common from autotest_lib.server import test from autotest_lib.site_utils import acts_lib -from autotest_lib.client.common_lib import error from autotest_lib.server.cros import dnsname_mangler -from autotest_lib.server import afe_utils +from autotest_lib.server.hosts import host_info class android_EasySetup(test.test): @@ -42,11 +41,12 @@ class android_EasySetup(test.test): valid_hosts = [] for v in testbed.get_adb_devices().values(): try: - afe_utils.get_host_attribute(v, v.job_repo_url_attribute) - except error.AutoservError: + info = v.host_info_store.get() + except host_info.StoreError: pass else: - valid_hosts.append(v) + if v.job_repo_url_attribute in info.attributes: + valid_hosts.append(v) if not valid_hosts: logging.error('No valid devices.') diff --git a/server/site_tests/cheets_CTS_N/control.arm.CtsCameraTestCasesPreconditions b/server/site_tests/cheets_CTS_N/control.arm.CtsCameraTestCasesPreconditions index 126fc2e29d..67d0d28ae9 100644 --- a/server/site_tests/cheets_CTS_N/control.arm.CtsCameraTestCasesPreconditions +++ b/server/site_tests/cheets_CTS_N/control.arm.CtsCameraTestCasesPreconditions @@ -28,6 +28,7 @@ def run_CTS(machine): tag='CtsCameraTestCases', target_module='CtsCameraTestCases', bundle='arm', + warn_on_test_retry=False, timeout=3600) 
parallel_simple(run_CTS, machines) diff --git a/server/site_tests/cheets_CTS_N/control.arm.CtsMediaTestCasesPreconditions b/server/site_tests/cheets_CTS_N/control.arm.CtsMediaTestCasesPreconditions index 9fa9c5665a..c3a77a096d 100644 --- a/server/site_tests/cheets_CTS_N/control.arm.CtsMediaTestCasesPreconditions +++ b/server/site_tests/cheets_CTS_N/control.arm.CtsMediaTestCasesPreconditions @@ -28,6 +28,7 @@ def run_CTS(machine): tag='CtsMediaTestCases', target_module='CtsMediaTestCases', bundle='arm', + warn_on_test_retry=False, timeout=21600) parallel_simple(run_CTS, machines) diff --git a/server/site_tests/cheets_CTS_N/control.arm.CtsNetTestCasesPreconditions b/server/site_tests/cheets_CTS_N/control.arm.CtsNetTestCasesPreconditions index cb61615e7d..b10cd25988 100644 --- a/server/site_tests/cheets_CTS_N/control.arm.CtsNetTestCasesPreconditions +++ b/server/site_tests/cheets_CTS_N/control.arm.CtsNetTestCasesPreconditions @@ -34,6 +34,7 @@ def run_CTS(machine): tag='CtsNetTestCases', target_module='CtsNetTestCases', bundle='arm', + warn_on_test_retry=False, timeout=3600, pre_condition_commands=[ '/usr/local/autotest/cros/scripts/wifi connect %s %s' % (ssid, wifipass), diff --git a/server/site_tests/cheets_CTS_N/control.arm.CtsSecurityHostTestCasesPreconditions b/server/site_tests/cheets_CTS_N/control.arm.CtsSecurityHostTestCasesPreconditions index 4833405a3c..978b0a1708 100644 --- a/server/site_tests/cheets_CTS_N/control.arm.CtsSecurityHostTestCasesPreconditions +++ b/server/site_tests/cheets_CTS_N/control.arm.CtsSecurityHostTestCasesPreconditions @@ -28,6 +28,7 @@ def run_CTS(machine): tag='CtsSecurityHostTestCases', target_module='CtsSecurityHostTestCases', bundle='arm', + warn_on_test_retry=False, timeout=5400, pre_condition_commands=[ 'echo 3 > /proc/sys/kernel/perf_event_paranoid' diff --git a/server/site_tests/cheets_CTS_N/control.arm.CtsUsageStatsTestCasesPreconditions b/server/site_tests/cheets_CTS_N/control.arm.CtsUsageStatsTestCasesPreconditions index ccaa25c36f..b168f96369 100644 --- a/server/site_tests/cheets_CTS_N/control.arm.CtsUsageStatsTestCasesPreconditions +++ b/server/site_tests/cheets_CTS_N/control.arm.CtsUsageStatsTestCasesPreconditions @@ -34,6 +34,7 @@ def run_CTS(machine): tag='CtsUsageStatsTestCases', target_module='CtsUsageStatsTestCases', bundle='arm', + warn_on_test_retry=False, timeout=3600, pre_condition_commands=[ '/usr/local/autotest/cros/scripts/wifi connect %s %s' % (ssid, wifipass), diff --git a/server/site_tests/cheets_CTS_N/control.x86.CtsCameraTestCasesPreconditions b/server/site_tests/cheets_CTS_N/control.x86.CtsCameraTestCasesPreconditions index 4791567cbf..5eba580f42 100644 --- a/server/site_tests/cheets_CTS_N/control.x86.CtsCameraTestCasesPreconditions +++ b/server/site_tests/cheets_CTS_N/control.x86.CtsCameraTestCasesPreconditions @@ -28,6 +28,7 @@ def run_CTS(machine): tag='CtsCameraTestCases', target_module='CtsCameraTestCases', bundle='x86', + warn_on_test_retry=False, timeout=3600) parallel_simple(run_CTS, machines) diff --git a/server/site_tests/cheets_CTS_N/control.x86.CtsMediaTestCasesPreconditions b/server/site_tests/cheets_CTS_N/control.x86.CtsMediaTestCasesPreconditions index 620e90bc10..18c630c4e3 100644 --- a/server/site_tests/cheets_CTS_N/control.x86.CtsMediaTestCasesPreconditions +++ b/server/site_tests/cheets_CTS_N/control.x86.CtsMediaTestCasesPreconditions @@ -28,6 +28,7 @@ def run_CTS(machine): tag='CtsMediaTestCases', target_module='CtsMediaTestCases', bundle='x86', + warn_on_test_retry=False, timeout=21600) 
parallel_simple(run_CTS, machines) diff --git a/server/site_tests/cheets_CTS_N/control.x86.CtsNetTestCasesPreconditions b/server/site_tests/cheets_CTS_N/control.x86.CtsNetTestCasesPreconditions index d6230f7065..b4b528a394 100644 --- a/server/site_tests/cheets_CTS_N/control.x86.CtsNetTestCasesPreconditions +++ b/server/site_tests/cheets_CTS_N/control.x86.CtsNetTestCasesPreconditions @@ -34,6 +34,7 @@ def run_CTS(machine): tag='CtsNetTestCases', target_module='CtsNetTestCases', bundle='x86', + warn_on_test_retry=False, timeout=3600, pre_condition_commands=[ '/usr/local/autotest/cros/scripts/wifi connect %s %s' % (ssid, wifipass), diff --git a/server/site_tests/cheets_CTS_N/control.x86.CtsSecurityHostTestCasesPreconditions b/server/site_tests/cheets_CTS_N/control.x86.CtsSecurityHostTestCasesPreconditions index c7f9853c7e..e2a0d0bd9b 100644 --- a/server/site_tests/cheets_CTS_N/control.x86.CtsSecurityHostTestCasesPreconditions +++ b/server/site_tests/cheets_CTS_N/control.x86.CtsSecurityHostTestCasesPreconditions @@ -29,6 +29,7 @@ def run_CTS(machine): target_module='CtsSecurityHostTestCases', bundle='x86', timeout=5400, + warn_on_test_retry=False, pre_condition_commands=[ 'echo 3 > /proc/sys/kernel/perf_event_paranoid' ]) diff --git a/server/site_tests/cheets_CTS_N/control.x86.CtsUsageStatsTestCasesPreconditions b/server/site_tests/cheets_CTS_N/control.x86.CtsUsageStatsTestCasesPreconditions index 3bc25c380b..67ed56f617 100644 --- a/server/site_tests/cheets_CTS_N/control.x86.CtsUsageStatsTestCasesPreconditions +++ b/server/site_tests/cheets_CTS_N/control.x86.CtsUsageStatsTestCasesPreconditions @@ -35,6 +35,7 @@ def run_CTS(machine): target_module='CtsUsageStatsTestCases', bundle='x86', timeout=3600, + warn_on_test_retry=False, pre_condition_commands=[ '/usr/local/autotest/cros/scripts/wifi connect %s %s' % (ssid, wifipass), '/usr/local/autotest/cros/scripts/reorder-services-moblab.sh wifi', diff --git a/server/site_utils.py b/server/site_utils.py index 4e8626f06c..d57a690762 100644 --- a/server/site_utils.py +++ b/server/site_utils.py @@ -12,11 +12,16 @@ import os import random import re import time +import traceback import urllib2 import common +from autotest_lib.client.bin.result_tools import utils as result_utils +from autotest_lib.client.bin.result_tools import utils_lib as result_utils_lib +from autotest_lib.client.bin.result_tools import view as result_view from autotest_lib.client.common_lib import utils from autotest_lib.client.common_lib import error +from autotest_lib.client.common_lib import file_utils from autotest_lib.client.common_lib import global_config from autotest_lib.client.common_lib import host_queue_entry_states from autotest_lib.client.common_lib import host_states @@ -24,6 +29,11 @@ from autotest_lib.server.cros import provision from autotest_lib.server.cros.dynamic_suite import constants from autotest_lib.server.cros.dynamic_suite import job_status +try: + from chromite.lib import metrics +except ImportError: + metrics = utils.metrics_mock + CONFIG = global_config.global_config @@ -53,6 +63,8 @@ ANDROID_BOARD_TO_TARGET_MAP = { 'gm4g_sprout': 'seed_l8150', 'bat': 'bat_land' } +# Prefix for the metrics name for result size information. +RESULT_METRICS_PREFIX = 'chromeos/autotest/result_collection/' class TestLabException(Exception): """Exception raised when the Test Lab blocks a test or suite.""" @@ -807,7 +819,7 @@ def wait_for_idle_duts(duts, afe, max_wait=IDLE_DUT_WAIT_TIMEOUT): @param duts: List of duts to check for idle state. @param afe: afe instance. 
-    @param max_wait: Max wait time in seconds.
+    @param max_wait: Max time in seconds to wait for the duts to go idle.
 
     @returns Boolean True if all hosts are idle or False if any hosts did
              not go idle within max_wait.
@@ -847,7 +859,7 @@ def lock_duts_and_wait(duts, afe, lock_msg='default lock message',
     @param duts: List of duts to lock.
     @param afe: afe instance.
     @param lock_msg: message for afe on locking this host.
-    @param max_wait: Max wait time in seconds.
+    @param max_wait: Max time in seconds to wait for the duts to go idle.
 
     @returns Boolean lock_success where True if all duts locked successfully or
              False if we timed out waiting too long for hosts to go idle.
@@ -884,3 +896,91 @@ def board_labels_allowed(boards):
         if not re.match('board:[^-]+-\d+', board):
             return False
     return True
+
+
+def _get_default_size_info(path):
+    """Get the default result size information.
+
+    In case the directory summary fails to build, assume the test results are
+    not throttled and all result sizes are the size of existing test results.
+
+    @return: A namedtuple of result size information, including:
+            client_result_collected_KB: The total size (in KB) of test results
+                    collected from the test device. Set to be the total size of
+                    the given path.
+            original_result_total_KB: The original size (in KB) of test results
+                    before being trimmed. Set to be the total size of the given
+                    path.
+            result_uploaded_KB: The total size (in KB) of test results to be
+                    uploaded. Set to be the total size of the given path.
+            result_throttled: True if test results collection is throttled.
+                    It's set to False in this default behavior.
+    """
+    total_size = file_utils.get_directory_size_kibibytes(path)
+    return result_utils_lib.ResultSizeInfo(
+            client_result_collected_KB=total_size,
+            original_result_total_KB=total_size,
+            result_uploaded_KB=total_size,
+            result_throttled=False)
+
+
+def _report_result_size_metrics(result_size_info):
+    """Report result size information to metrics.
+
+    @param result_size_info: A ResultSizeInfo namedtuple containing information
+                             of test result sizes.
+    """
+    fields = {'result_throttled' : result_size_info.result_throttled}
+    metrics.Counter(RESULT_METRICS_PREFIX + 'client_result_collected_KB',
+                    description='The total size (in KB) of test results '
+                    'collected from the test device.'
+                    ).increment_by(result_size_info.client_result_collected_KB,
+                                   fields=fields)
+    metrics.Counter(RESULT_METRICS_PREFIX + 'original_result_total_KB',
+                    description='The original size (in KB) of test results '
+                    'before being trimmed.'
+                    ).increment_by(result_size_info.original_result_total_KB,
+                                   fields=fields)
+    metrics.Counter(RESULT_METRICS_PREFIX + 'result_uploaded_KB',
+                    description='The total size (in KB) of test results to be '
+                    'uploaded.'
+                    ).increment_by(result_size_info.result_uploaded_KB,
+                                   fields=fields)
+
+
+def collect_result_sizes(path, log=logging.debug):
+    """Collect the result size information and build the result summary.
+
+    It first tries to merge directory summaries and calculate the result
+    sizes, including:
+    client_result_collected_KB: The volume in KB that's transferred from the
+            test device.
+    original_result_total_KB: The volume in KB that's the original size of the
+            result files before being trimmed.
+    result_uploaded_KB: The volume in KB that will be uploaded.
+    result_throttled: Whether the result files were throttled.
+
+    If directory summary merging fails for any reason, fall back to using the
+    total size of the given result directory.
+
+    @param path: Path of the result directory to get size information.
+    @param log: The logging method, defaults to logging.debug.
+    @return: A ResultSizeInfo namedtuple containing information of test result
+             sizes.
+    """
+    try:
+        client_collected_bytes, summary = result_utils.merge_summaries(path)
+        result_size_info = result_utils_lib.get_result_size_info(
+                client_collected_bytes, summary)
+        html_file = os.path.join(path, result_view.DEFAULT_RESULT_SUMMARY_NAME)
+        result_view.build(client_collected_bytes, summary, html_file)
+    except Exception:
+        log('Failed to calculate result sizes based on directory summaries '
+            'for directory %s. Falling back to recording the total size.\n'
+            'Exception: %s' % (path, traceback.format_exc()))
+        result_size_info = _get_default_size_info(path)
+
+    _report_result_size_metrics(result_size_info)
+
+    return result_size_info
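A quick usage sketch for the new helper, illustrative only: the results path
and the import below are assumptions about a server-side caller, not part of
this change. collect_result_sizes() returns the ResultSizeInfo namedtuple and,
as side effects, writes the HTML summary into the results directory and bumps
the three counters registered above.

    import logging

    from autotest_lib.server import site_utils

    # '/usr/local/autotest/results/123-debug' is a placeholder path.
    size_info = site_utils.collect_result_sizes(
            '/usr/local/autotest/results/123-debug', log=logging.info)
    if size_info.result_throttled:
        logging.warning('Collected %d KB from the DUT; only %d KB of %d KB '
                        'will be uploaded.',
                        size_info.client_result_collected_KB,
                        size_info.result_uploaded_KB,
                        size_info.original_result_total_KB)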
\ No newline at end of file diff --git a/site_utils/analyze_reboot_time.py b/site_utils/analyze_reboot_time.py deleted file mode 100644 index 1c911961c6..0000000000 --- a/site_utils/analyze_reboot_time.py +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -# This file defines script for getting entries from ES concerning reboot time. - -""" -Example usage: - python analyze_reboot_time.py -l 12 --server cautotest --board daisy_spring - -Usage: analyze_reboot_time.py [-h] [-l LAST] --server AUTOTEST_SERVER - [--board BOARD] [--pool POOL] [--start START] - [--end END] [--gte GTE] [--lte LTE] [-n SIZE] - [--hosts HOSTS [HOSTS ...]] - -optional arguments: - -h, --help show this help message and exit - -l LAST last hours to search results across - --server AUTOTEST_SERVER - Enter Autotest instance name, e.g. "cautotest". - --board BOARD restrict query by board, not implemented yet - --pool POOL restrict query by pool, not implemented yet - --start START Enter start time as: yyyy-mm-dd hh-mm-ss,defualts to - 24h ago. - --end END Enter end time as: yyyy-mm-dd hh-mm-ss,defualts to - current time. - --gte GTE Enter lower bound on reboot time for entries to - return. - --lte LTE Enter upper bound on reboot time for entries to - return. - -n SIZE Maximum number of entries to return. - --hosts HOSTS [HOSTS ...] - Enter space deliminated hostnames -""" - -import argparse -import time - -import common -import host_history -from autotest_lib.client.common_lib import time_utils -from autotest_lib.client.common_lib.cros.graphite import autotest_es - - -def get_entries(time_start, time_end, gte, lte, size, index, hostname): - """Gets all entries from es db with the given constraints. - - @param time_start: Earliest time entry was recorded - @param time_end: Latest time entry was recorded - @param gte: Lowest reboot_time to return - @param lte: Highest reboot_time to return - @param size: Max number of entries to return - @param index: es db index to get entries for, i.e. 'cautotest' - @param hostname: string representing hostname to query for - @returns: Entries from esdb. - """ - time_start_epoch = time_utils.to_epoch_time(time_start) - time_end_epoch = time_utils.to_epoch_time(time_end) - gte_epoch = time_utils.to_epoch_time(gte) - lte_epoch = time_utils.to_epoch_time(lte) - return autotest_es.query( - index=index, - fields_returned=['hostname', 'time_recorded', 'value'], - equality_constraints=[('_type', 'reboot_total'), - ('hostname', hostname)], - range_constraints=[('time_recorded', time_start_epoch, time_end_epoch), - ('value', gte_epoch, lte_epoch)], - size=size, - sort_specs=[{'hostname': 'asc'}, {'value': 'desc'}]) - return results - - -def get_results_string(hostname, time_start, time_end, results): - """Prints entries from esdb in a readable fashion. - - @param hostname: Hostname of DUT we are printing result for. - @param time_start: Earliest time entry was recorded - @param time_end: Latest time entry was recorded - @param gte: Lowest reboot_time to return - @param lte: Highest reboot_time to return - @param size: Max number of entries to return - @returns: String reporting reboot times for this host. 
- """ - return_string = ' Host: %s \n Number of entries: %s \n' % ( - hostname, results.total) - return_string += ' %s - %s \n' % ( - time_utils.epoch_time_to_date_string(time_start), - time_utils.epoch_time_to_date_string(time_end)) - if results.total <= 0: - return return_string - for result in results.hits: - time_recorded = result['time_recorded'][0] - time_string = time_utils.epoch_time_to_date_string( - time_recorded) - reboot_total = result['value'][0] - spaces = (15 - len(str(time_string))) * ' ' - return_string += ' %s Reboot_time: %.3fs\n' % ( - time_string, reboot_total) - return return_string - - -if __name__ == '__main__': - """main script""" - t_now = time.time() - t_now_minus_one_day = t_now - 3600 * 24 - parser = argparse.ArgumentParser() - parser.add_argument('-l', type=float, dest='last', - help='last hours to search results across', - default=24) - parser.add_argument('--server', type=str, dest='autotest_server', - required=True, - help='Enter Autotest instance name, e.g. "cautotest".') - parser.add_argument('--board', type=str, dest='board', - help='restrict query by board, not implemented yet', - default=None) - parser.add_argument('--pool', type=str, dest='pool', - help='restrict query by pool, not implemented yet', - default=None) - parser.add_argument('--start', type=str, dest='start', - help=('Enter start time as: yyyy-mm-dd hh-mm-ss,' - 'defualts to 24h ago.'), - default=t_now_minus_one_day) - parser.add_argument('--end', type=str, dest='end', - help=('Enter end time as: yyyy-mm-dd hh-mm-ss,' - 'defualts to current time.'), - default=t_now) - parser.add_argument('--gte', type=float, dest='gte', - help=('Enter lower bound on reboot time ' - 'for entries to return.'), - default=0) - parser.add_argument('--lte', type=float, dest='lte', - help=('Enter upper bound on reboot time ' - 'for entries to return.'), - default=None) - parser.add_argument('-n', type=int, dest='size', - help='Maximum number of entries to return.', - default=10000) - parser.add_argument('--hosts', nargs='+', dest='hosts', - help='Enter space deliminated hostnames', - default=[]) - options = parser.parse_args() - - if options.last: - t_start = t_now - 3600 * options.last - t_end = t_now - else: - t_start = time_utils.to_epoch_time(options.start) - t_end = time_utils.to_epoch_time(options.end) - if options.hosts: - hosts = options.hosts - else: - hosts = host_history.get_matched_hosts(options.autotest_server, - options.board, options.pool) - - for hostname in hosts: - results = get_entries( - t_start, t_end, options.gte, options.lte, options.size, - options.autotest_server, hostname) - print get_results_string(hostname, t_start, t_end, results) diff --git a/site_utils/collect_host_stats.py b/site_utils/collect_host_stats.py deleted file mode 100755 index 460c7dcecd..0000000000 --- a/site_utils/collect_host_stats.py +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -"""This script is to be run daily to report machine utilization stats across -each board and pool. 
-""" - - -import argparse -from datetime import date -from datetime import datetime -from datetime import timedelta - -import common -from autotest_lib.client.common_lib import time_utils -from autotest_lib.client.common_lib import utils -from autotest_lib.site_utils import gmail_lib -from autotest_lib.site_utils import host_history -from autotest_lib.site_utils import host_history_utils -from autotest_lib.site_utils import host_label_utils - -try: - from chromite.lib import metrics - from chromite.lib import ts_mon_config -except ImportError: - metrics = utils.metrics_mock - ts_mon_config = utils.metrics_mock - - -_MACHINE_UTILIZATION_RATE_HOURLY = metrics.Float( - 'chromeos/autotest/host/machine_utilization_rate/hourly') -_MACHINE_AVAILABILITY_RATE_HOURLY = metrics.Float( - 'chromeos/autotest/host/machine_availability_rate/hourly') -_MACHINE_IDLE_RATE_HOURLY = metrics.Float( - 'chromeos/autotest/host/machine_idle_rate/hourly') -_MACHINE_UTILIZATION_RATE_DAILY = metrics.Float( - 'chromeos/autotest/host/machine_utilization_rate/daily') -_MACHINE_AVAILABILITY_RATE_DAILY = metrics.Float( - 'chromeos/autotest/host/machine_availability_rate/daily') -_MACHINE_IDLE_RATE_DAILY = metrics.Float( - 'chromeos/autotest/host/machine_idle_rate/daily') - -def report_stats(board, pool, start_time, end_time, span): - """Report machine stats for given board, pool and time period. - - @param board: Name of board. - @param pool: Name of pool. - @param start_time: start time to collect stats. - @param end_time: end time to collect stats. - @param span: Number of hours that the stats should be collected for. - @return: Error message collected when calculating the stats. - """ - print '================ %-12s %-12s ================' % (board, pool) - try: - history = host_history.get_history_details(start_time=start_time, - end_time=end_time, - board=board, - pool=pool) - except host_history_utils.NoHostFoundException as e: - print 'No history found. Error:\n%s' % e - history = None - mur = -1 - mar = -1 - mir = -1 - - if history: - status_intervals = host_history_utils.get_status_intervals(history) - stats_all, num_hosts = host_history_utils.aggregate_hosts( - status_intervals) - total = 0 - total_time = span*3600*num_hosts - for status, interval in stats_all.iteritems(): - total += interval - if abs(total - total_time) > 10: - error = ('Status intervals do not add up. 
No stats will be ' - 'collected for board: %s, pool: %s, diff: %s' % - (board, pool, total - total_time)) - hosts = [] - for history_for_host in status_intervals: - total = 0 - for interval in history_for_host.keys(): - total += interval[1] - interval[0] - if total > span*3600: - hosts.append(history_for_host.values()[0]['metadata']['hostname']) - error += ' hosts: %s' % ','.join(hosts) - print error - return error - - mur = host_history_utils.get_machine_utilization_rate(stats_all) - mar = host_history_utils.get_machine_availability_rate(stats_all) - mir = mar - mur - - for status, interval in stats_all.iteritems(): - print '%-18s %-16s %-10.2f%%' % (status, interval, - 100*interval/total_time) - print 'Machine utilization rate = %-4.2f%%' % (100*mur) - print 'Machine availability rate = %-4.2f%%' % (100*mar) - - fields = {'board': board, - 'pool': pool} - if span == 1: - _MACHINE_UTILIZATION_RATE_HOURLY.set(mur, fields=fields) - _MACHINE_AVAILABILITY_RATE_HOURLY.set(mar, fields=fields) - _MACHINE_IDLE_RATE_HOURLY.set(mir, fields=fields) - elif span == 24: - _MACHINE_UTILIZATION_RATE_DAILY.set(mur, fields=fields) - _MACHINE_AVAILABILITY_RATE_DAILY.set(mar, fields=fields) - _MACHINE_IDLE_RATE_DAILY.set(mir, fields=fields) - - -def main(): - """main script. """ - parser = argparse.ArgumentParser() - parser.add_argument('--span', type=int, dest='span', default=1, - help=('Number of hours that stats should be collected. ' - 'If it is set to 24, the end time of stats being ' - 'collected will set to the mid of the night. ' - 'Default is set to 1 hour.')) - parser.add_argument('-e', '--email', dest='email', default=None, - help='Email any errors to the given email address.') - options = parser.parse_args() - - boards = host_label_utils.get_all_boards() - pools = ['bvt', 'suites', 'cq'] - - if options.span == 24: - today = datetime.combine(date.today(), datetime.min.time()) - end_time = time_utils.to_epoch_time(today) - else: - now = datetime.now() - end_time = datetime(year=now.year, month=now.month, day=now.day, - hour=now.hour) - end_time = time_utils.to_epoch_time(end_time) - - start_time = end_time - timedelta(hours=options.span).total_seconds() - print ('Collecting host stats from %s to %s...' % - (time_utils.epoch_time_to_date_string(start_time), - time_utils.epoch_time_to_date_string(end_time))) - - ts_mon_config.SetupTsMonGlobalState('collect_host_stats') - - errors = [] - if not boards: - errors.append('Error! No board found in metadb.') - for board in boards: - for pool in pools: - error = report_stats(board, pool, start_time, end_time, - options.span) - if error: - errors.append(error) - if options.email and errors: - gmail_lib.send_email(options.email, - 'Error occured when collecting host stats.', - '\n'.join(errors)) - - -if __name__ == '__main__': - main() diff --git a/site_utils/collect_suite_time_stats.py b/site_utils/collect_suite_time_stats.py deleted file mode 100755 index cdda7dcd98..0000000000 --- a/site_utils/collect_suite_time_stats.py +++ /dev/null @@ -1,443 +0,0 @@ -#!/usr/bin/python -# -# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - - -"""Script to calculate timing stats for suites. - -This script measures nine stats for a suite run. -1. Net suite runtime. -2. Suite scheduling overhead. -3. Average scheduling overhead. -4. Average Queuing time. -5. Average Resetting time. -6. Average provisioning time. -7. Average Running time. -8. 
Average Parsing time. -9. Average Gathering time. - -When the cron_mode is enabled, this script throws all stats but the first one -(Net suite runtime) to Graphite because the first one is already -being sent to Graphite by Autotest online. - -Net suite runtime is end-to-end time for a suite from the beginning -to the end. -It is stored in a field, "duration", of a type, "suite_runtime" in -elasticsearch (ES). - -Suite scheduling overhead is defined by the average of DUT overheads. -Suite is composed of one or more jobs, and those jobs are run on -one or more DUTs that are available. -A DUT overhead is defined by: - DUT_i overhead = sum(net time for job_k - runtime for job_k - - runtime for special tasks of job_k) - Job_k are the jobs run on DUT_i. - -Net time for a job is the time from job_queued_time to hqe_finished_time. -job_queued_time is stored in the "queued_time" column of "tko_jobs" table. -hqe_finished_time is stored in the "finished_on" of "afe_host_queue_entries" -table. -We do not use "job_finished_time" of "tko_jobs" as job_finished_time is -recorded before gathering/parsing. -We do not use hqe started time ("started_on" of "afe_host_queue_entries"), -as it does not account for the lag from a host is assigned to the job till -the scheduler sees the assignment. - -Runtime for job_k is the sum of durations for the records of -"job_time_breakdown" type in ES that have "Queued" or "Running" status. -It is possible that a job has multiple "Queued" records when the job's test -failed and tried again. -We take into account only the last "Queued" record. - -Runtime for special tasks of job_k is the sum of durations for the records -of "job_time_breakdown" type in ES that have "Resetting", "Provisioning", -"Gathering", or "Parsing" status. -We take into account only the records whose timestamp is larger than -the timestamp of the last "Queued" record. -""" - -import argparse -from datetime import datetime -from datetime import timedelta - -import common -from autotest_lib.client.common_lib import host_queue_entry_states -from autotest_lib.client.common_lib import time_utils -from autotest_lib.client.common_lib.cros.graphite import autotest_es -from autotest_lib.frontend import setup_django_environment -from autotest_lib.frontend.afe import models -from autotest_lib.frontend.tko import models as tko_models -from autotest_lib.server import utils -from autotest_lib.site_utils import job_overhead - - -_options = None - -_hqes = host_queue_entry_states.Status -_states = [ - _hqes.QUEUED, _hqes.RESETTING, _hqes.PROVISIONING, - _hqes.RUNNING, _hqes.GATHERING, _hqes.PARSING] - - -def mean(l): - """ - Calculates an Arithmetic Mean for the numbers in a list. - - @param l: A list of numbers. - @return: Arithmetic mean if the list is not empty. - Otherwise, returns zero. - """ - return float(sum(l)) / len(l) if l else 0 - - -def print_verbose(string, *args): - if _options.verbose: - print(string % args) - - -def get_nontask_runtime(job_id, dut, job_info_dict): - """ - Get sum of durations for "Queued", "Running", "Parsing", and "Gathering" - status records. - job_info_dict will be modified in this function to store the duration - for each status. - - @param job_id: The job id of interest. - @param dut: Hostname of a DUT that the job ran on. - @param job_info_dict: Dictionary that has information for jobs. - @return: Tuple of sum of durations and the timestamp for the last - Queued record. 
- """ - results = autotest_es.query( - fields_returned=['status', 'duration', 'time_recorded'], - equality_constraints=[('_type', 'job_time_breakdown'), - ('job_id', job_id), - ('hostname', dut)], - sort_specs=[{'time_recorded': 'desc'}]) - - sum = 0 - last_queued_timestamp = 0 - # There could be multiple "Queued" records. - # Get sum of durations for the records after the last "Queued" records - # (including the last "Queued" record). - # Exploits the fact that "results" are ordered in the descending order - # of time_recorded. - for hit in results.hits: - job_info_dict[job_id][hit['status']] = float(hit['duration']) - if hit['status'] == 'Queued': - # The first Queued record is the last one because of the descending - # order of "results". - last_queued_timestamp = float(hit['time_recorded']) - sum += float(hit['duration']) - break - else: - sum += float(hit['duration']) - return (sum, last_queued_timestamp) - - -def get_tasks_runtime(task_list, dut, t_start, job_id, job_info_dict): - """ - Get sum of durations for special tasks. - job_info_dict will be modified in this function to store the duration - for each special task. - - @param task_list: List of task id. - @param dut: Hostname of a DUT that the tasks ran on. - @param t_start: Beginning timestamp. - @param job_id: The job id that is related to the tasks. - This is used only for debugging purpose. - @param job_info_dict: Dictionary that has information for jobs. - @return: Sum of durations of the tasks. - """ - t_start_epoch = time_utils.to_epoch_time(t_start) - results = autotest_es.query( - fields_returned=['status', 'task_id', 'duration'], - equality_constraints=[('_type', 'job_time_breakdown'), - ('hostname', dut)], - range_constraints=[('time_recorded', t_start_epoch, None)], - batch_constraints=[('task_id', task_list)]) - sum = 0 - for hit in results.hits: - sum += float(hit['duration']) - job_info_dict[job_id][hit['status']] = float(hit['duration']) - print_verbose('Task %s for Job %s took %s', - hit['task_id'], job_id, hit['duration']) - return sum - - -def get_job_runtime(job_id, dut, job_info_dict): - """ - Get sum of durations for the entries that are related to a job. - job_info_dict will be modified in this function. - - @param job_id: The job id of interest. - @param dut: Hostname of a DUT that the job ran on. - @param job_info_dict: Dictionary that has information for jobs. - @return: Total duration taken by a job. - """ - sum, t_last_queued = get_nontask_runtime(job_id, dut, job_info_dict) - print_verbose('Job %s took %f, last Queued: %s', - job_id, sum, t_last_queued) - sum += get_tasks_runtime( - list(job_info_dict[job_id]['tasks']), dut, t_last_queued, - job_id, job_info_dict) - return sum - - -def get_dut_overhead(dut, jobs, job_info_dict): - """ - Calculates the scheduling overhead of a DUT. - - The scheduling overhead of a DUT is defined by the sum of scheduling - overheads for the jobs that ran on the DUT. - The scheduling overhead for a job is defined by the difference - of net job runtime and real job runtime. - job_info_dict will be modified in this function. - - @param dut: Hostname of a DUT. - @param jobs: The list of jobs that ran on the DUT. - @param job_info_dict: Dictionary that has information for jobs. - @return: Scheduling overhead of a DUT in a floating point value. - The unit is a second. 
- """ - overheads = [] - for job_id in jobs: - (t_start, t_end) = job_info_dict[job_id]['timestamps'] - runtime = get_job_runtime(job_id, dut, job_info_dict) - overheads.append(t_end - t_start - runtime) - print_verbose('Job: %s, Net runtime: %f, Real runtime: %f, ' - 'Overhead: %f', job_id, t_end - t_start, runtime, - t_end - t_start - runtime) - return sum(overheads) - - -def get_child_jobs_info(suite_job_id, num_child_jobs, sanity_check): - """ - Gets information about child jobs of a suite. - - @param suite_job_id: Job id of a suite. - @param num_child_jobs: Number of child jobs of the suite. - @param sanity_check: Do sanity check if True. - @return: A tuple of (dictionary, list). For dictionary, the key is - a DUT's hostname and the value is a list of jobs that ran on - the DUT. List is the list of all jobs of the suite. - """ - results = autotest_es.query( - fields_returned=['job_id', 'hostname'], - equality_constraints=[('_type', 'host_history'), - ('parent_job_id', suite_job_id), - ('status', 'Running'),]) - - dut_jobs_dict = {} - job_filter = set() - for hit in results.hits: - job_id = hit['job_id'] - dut = hit['hostname'] - if job_id in job_filter: - continue - job_list = dut_jobs_dict.setdefault(dut, []) - job_list.append(job_id) - job_filter.add(job_id) - - if sanity_check and len(job_filter) != num_child_jobs: - print('WARNING: Mismatch number of child jobs of a suite (%d): ' - '%d != %d' % (suite_job_id, len(job_filter), num_child_jobs)) - return dut_jobs_dict, list(job_filter) - - -def get_job_timestamps(job_list, job_info_dict): - """ - Get beginning time and ending time for each job. - - The beginning time of a job is "queued_time" of "tko_jobs" table. - The ending time of a job is "finished_on" of "afe_host_queue_entries" table. - job_info_dict will be modified in this function to store the timestamps. - - @param job_list: List of job ids - @param job_info_dict: Dictionary that timestamps for each job will be stored - """ - tko = tko_models.Job.objects.filter(afe_job_id__in=job_list) - hqe = models.HostQueueEntry.objects.filter(job_id__in=job_list) - job_start = {} - for t in tko: - job_start[t.afe_job_id] = time_utils.to_epoch_time(t.queued_time) - job_end = {} - for h in hqe: - job_end[h.job_id] = time_utils.to_epoch_time(h.finished_on) - - for job_id in job_list: - info_dict = job_info_dict.setdefault(job_id, {}) - info_dict.setdefault('timestamps', (job_start[job_id], job_end[job_id])) - - -def get_job_tasks(job_list, job_info_dict): - """ - Get task ids for each job. - job_info_dict will be modified in this function to store the task ids. - - @param job_list: List of job ids - @param job_info_dict: Dictionary that task ids for each job will be stored. - """ - results = autotest_es.query( - fields_returned=['job_id', 'task_id'], - equality_constraints=[('_type', 'host_history')], - batch_constraints=[('job_id', job_list)]) - for hit in results.hits: - if 'task_id' in hit: - info_dict = job_info_dict.setdefault(hit['job_id'], {}) - task_set = info_dict.setdefault('tasks', set()) - task_set.add(hit['task_id']) - - -def get_scheduling_overhead(suite_job_id, num_child_jobs, sanity_check=True): - """ - Calculates a scheduling overhead. - - A scheduling overhead is defined by the average of DUT overheads - for the DUTs that the child jobs of a suite ran on. - - @param suite_job_id: Job id of a suite. - @param num_child_jobs: Number of child jobs of the suite. - @param sanity_check: Do sanity check if True. - @return: Dictionary storing stats. 
- """ - dut_jobs_dict, job_list = get_child_jobs_info( - suite_job_id, num_child_jobs, sanity_check) - job_info_dict = {} - get_job_timestamps(job_list, job_info_dict) - get_job_tasks(job_list, job_info_dict) - - dut_overheads = [] - avg_overhead = 0 - for dut, jobs in dut_jobs_dict.iteritems(): - print_verbose('Dut: %s, Jobs: %s', dut, jobs) - overhead = get_dut_overhead(dut, jobs, job_info_dict) - avg_overhead += overhead - print_verbose('Dut overhead: %f', overhead) - dut_overheads.append(overhead) - - if job_list: - avg_overhead = avg_overhead / len(job_list) - - state_samples_dict = {} - for info in job_info_dict.itervalues(): - for state in _states: - if state in info: - samples = state_samples_dict.setdefault(state, []) - samples.append(info[state]) - - if state_samples_dict: - result = {state: mean(state_samples_dict[state]) - if state in state_samples_dict else 0 - for state in _states} - result['suite_overhead'] = mean(dut_overheads) - result['overhead'] = avg_overhead - result['num_duts'] = len(dut_jobs_dict) - return result - - -def print_suite_stats(suite_stats): - """Prints out statistics for a suite to standard output.""" - print('suite_overhead: %(suite_overhead)f, overhead: %(overhead)f,' % - suite_stats), - for state in _states: - if state in suite_stats: - print('%s: %f,' % (state, suite_stats[state])), - print('num_duts: %(num_duts)d' % suite_stats) - - -def analyze_suites(start_time, end_time): - """ - Calculates timing stats (i.e., suite runtime, scheduling overhead) - for the suites that finished within the timestamps given by parameters. - - @param start_time: Beginning timestamp. - @param end_time: Ending timestamp. - """ - print('Analyzing suites from %s to %s...' % ( - time_utils.epoch_time_to_date_string(start_time), - time_utils.epoch_time_to_date_string(end_time))) - - if _options.bvtonly: - batch_constraints = [ - ('suite_name', ['bvt-inline', 'bvt-cq', 'bvt-perbuild'])] - else: - batch_constraints = [] - - start_time_epoch = time_utils.to_epoch_time(start_time) - end_time_epoch = time_utils.to_epoch_time(end_time) - results = autotest_es.query( - fields_returned=['suite_name', 'suite_job_id', 'board', 'build', - 'num_child_jobs', 'duration'], - equality_constraints=[('_type', job_overhead.SUITE_RUNTIME_KEY),], - range_constraints=[('time_recorded', start_time_epoch, - end_time_epoch)], - sort_specs=[{'time_recorded': 'asc'}], - batch_constraints=batch_constraints) - print('Found %d suites' % (results.total)) - - for hit in results.hits: - suite_job_id = hit['suite_job_id'] - - try: - suite_name = hit['suite_name'] - num_child_jobs = int(hit['num_child_jobs']) - suite_runtime = float(hit['duration']) - - print('Suite: %s (%s), Board: %s, Build: %s, Num child jobs: %d' % ( - suite_name, suite_job_id, hit['board'], hit['build'], - num_child_jobs)) - - suite_stats = get_scheduling_overhead(suite_job_id, num_child_jobs) - print('Suite: %s (%s) runtime: %f,' % ( - suite_name, suite_job_id, suite_runtime)), - print_suite_stats(suite_stats) - - except Exception as e: - print('ERROR: Exception is raised while processing suite %s' % ( - suite_job_id)) - print e - - -def analyze_suite(suite_job_id): - suite_stats = get_scheduling_overhead(suite_job_id, 0, False) - print('Suite (%s)' % suite_job_id), - print_suite_stats(suite_stats) - - -def main(): - """main script.""" - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('-c', dest='cron_mode', action='store_true', - help=('Run in a cron mode. 
Cron mode ' - 'sends calculated stat data to Graphite.'), - default=False) - parser.add_argument('-s', type=int, dest='span', - help=('Number of hours that stats should be ' - 'collected.'), - default=1) - parser.add_argument('--bvtonly', dest='bvtonly', action='store_true', - help=('Gets bvt suites only (i.e., bvt-inline,' - 'bvt-cq, bvt-perbuild).'), - default=False) - parser.add_argument('--suite', type=int, dest='suite_job_id', - help=('Job id of a suite.')) - parser.add_argument('--verbose', dest='verbose', action='store_true', - help=('Prints out more info if True.'), - default=False) - global _options - _options = parser.parse_args() - - if _options.suite_job_id: - analyze_suite(_options.suite_job_id) - else: - end_time = time_utils.to_epoch_time(datetime.now()) - start_time = end_time - timedelta(hours=_options.span).total_seconds() - analyze_suites(start_time, end_time) - - -if __name__ == '__main__': - main() diff --git a/site_utils/devserver_history.py b/site_utils/devserver_history.py deleted file mode 100755 index af91c25c7a..0000000000 --- a/site_utils/devserver_history.py +++ /dev/null @@ -1,257 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -# Script to check the history of stage calls made to devserver. -# Following are some sample use cases: -# -# 1. Find all stage request for autotest and image nyan_big-release/R38-6055.0.0 -# in the last 10 days across all devservers. -# ./devserver_history.py --image_filters nyan_big 38 6055.0.0 -l 240 \ -# --artifact_filters autotest -v -# output: -# ============================================================================== -# 170.21.64.22 -# ============================================================================== -# Number of calls: 1 -# Number of unique images: 1 -# 2014-08-23 12:45:00: nyan_big-release/R38-6055.0.0 autotest -# ============================================================================== -# 170.21.64.23 -# ============================================================================== -# Number of calls: 2 -# Number of unique images: 1 -# 2014-08-23 12:45:00: nyan_big-release/R38-6055.0.0 autotest, test_suites -# 2014-08-23 12:55:00: nyan_big-release/R38-6055.0.0 autotest, test_suites -# -# 2. Find all duplicated stage request for the last 10 days. -# ./devserver_history.py -d -l 240 -# output: -# Detecting artifacts staged in multiple devservers. -# ============================================================================== -# nyan_big-release/R38-6055.0.0 -# ============================================================================== -# 170.21.64.22: 23 requests 2014-09-04 22:44:28 -- 2014-09-05 00:03:23 -# 170.21.64.23: 6 requests 2014-09-04 22:48:58 -- 2014-09-04 22:49:42 -# -# Count of images with duplicated stages on each devserver: -# 170.21.64.22 : 22 -# 170.21.64.23 : 11 - - -import argparse -import datetime -import logging -import operator -import re -import time -from itertools import groupby - -import common -from autotest_lib.client.common_lib import global_config -from autotest_lib.client.common_lib import time_utils -from autotest_lib.client.common_lib.cros.graphite import autotest_es - - -class devserver_call(object): - """A container to store the information of devserver stage call. - """ - - def __init__(self, hit): - """Retrieve information from a ES query hit. 
- """ - self.devserver = hit['devserver'] - self.subname = hit['subname'] - self.artifacts = hit['artifacts'].split(' ') - self.image = hit['image'] - self.value = hit['value'] - self.time_recorded = time_utils.epoch_time_to_date_string( - hit['time_recorded']) - - - def __str__(self): - pairs = ['%-20s: %s' % (attr, getattr(self, attr)) for attr in dir(self) - if not attr.startswith('__') and - not callable(getattr(self, attr))] - return '\n'.join(pairs) - - -def get_calls(time_start, time_end, artifact_filters=None, - regex_constraints=None, devserver=None, size=1e7): - """Gets all devserver calls from es db with the given constraints. - - @param time_start: Earliest time entry was recorded. - @param time_end: Latest time entry was recorded. - @param artifact_filters: A list of names to match artifacts. - @param regex_constraints: A list of regex constraints for ES query. - @param devserver: name of devserver to query for. If it's set to None, - return calls for all devservers. Default is set to None. - @param size: Max number of entries to return, default to 1 million. - - @returns: Entries from esdb. - """ - eqs = [('_type', 'devserver')] - if devserver: - eqs.append(('devserver', devserver)) - if artifact_filters: - for artifact in artifact_filters: - eqs.append(('artifacts', artifact)) - time_start_epoch = time_utils.to_epoch_time(time_start) - time_end_epoch = time_utils.to_epoch_time(time_end) - results = autotest_es.query( - fields_returned=None, - equality_constraints=eqs, - range_constraints=[('time_recorded', time_start_epoch, - time_end_epoch)], - size=size, - sort_specs=[{'time_recorded': 'desc'}], - regex_constraints=regex_constraints) - devserver_calls = [] - for hit in results.hits: - devserver_calls.append(devserver_call(hit)) - logging.info('Found %d calls.', len(devserver_calls)) - return devserver_calls - - -def print_call_details(calls, verbose): - """Print details of each call to devserver to stage artifacts. - - @param calls: A list of devserver stage requests. - @param verbose: Set to True to print out all devserver calls. - """ - calls = sorted(calls, key=lambda c: c.devserver) - for devserver,calls_for_devserver in groupby(calls, lambda c: c.devserver): - calls_for_devserver = list(calls_for_devserver) - print '='*80 - print devserver - print '='*80 - print 'Number of calls: %d' % len(calls_for_devserver) - print ('Number of unique images: %d' % - len(set([call.image for call in calls_for_devserver]))) - if verbose: - for call in sorted(calls_for_devserver, - key=lambda c: c.time_recorded): - print ('%s %s %s' % (call.time_recorded, call.image, - ', '.join(call.artifacts))) - - -def detect_duplicated_stage(calls): - """Detect any artifact for same build was staged in multiple devservers. - - @param calls: A list of devserver stage requests. - """ - print '\nDetecting artifacts staged in multiple devservers.' - calls = sorted(calls, key=lambda c: c.image) - # Count how many times a devserver staged duplicated artifacts. A number - # significantly larger then others can indicate that the devserver failed - # check_health too often and needs to be removed from production. 
- duplicated_stage_count = {} - for image,calls_for_image in groupby(calls, lambda c: c.image): - calls_for_image = list(calls_for_image) - devservers = set([call.devserver for call in calls_for_image]) - if len(devservers) > 1: - print '='*80 - print image - print '='*80 - calls_for_image = sorted(calls_for_image, key=lambda c: c.devserver) - for devserver,calls_for_devserver in groupby(calls_for_image, - lambda c: c.devserver): - timestamps = [c.time_recorded for c in calls_for_devserver] - print ('%s: %-3d requests %s -- %s' % - (devserver, len(timestamps), min(timestamps), - max(timestamps))) - duplicated_stage_count[devserver] = ( - duplicated_stage_count.get(devserver, 0) + 1) - print '\nCount of images with duplicated stages on each devserver:' - counts = sorted(duplicated_stage_count.iteritems(), - key=operator.itemgetter(1), reverse=True) - for k,v in counts: - print '%-15s: %d' % (k, v) - - -def main(): - """main script. """ - t_now = time.time() - t_now_minus_one_day = t_now - 3600 * 24 - parser = argparse.ArgumentParser() - parser.add_argument('-l', type=float, dest='last', - help='last hours to search results across', - default=None) - parser.add_argument('--start', type=str, dest='start', - help=('Enter start time as: yyyy-mm-dd hh-mm-ss,' - 'defualts to 24h ago. This option is ignored when' - ' -l is used.'), - default=time_utils.epoch_time_to_date_string( - t_now_minus_one_day)) - parser.add_argument('--end', type=str, dest='end', - help=('Enter end time in as: yyyy-mm-dd hh-mm-ss,' - 'defualts to current time. This option is ignored' - ' when -l is used.'), - default=time_utils.epoch_time_to_date_string(t_now)) - parser.add_argument('--devservers', nargs='+', dest='devservers', - help=('Enter space deliminated devservers. Default are' - ' all devservers specified in global config.'), - default=[]) - parser.add_argument('--artifact_filters', nargs='+', - dest='artifact_filters', - help=('Enter space deliminated filters on artifact ' - 'name. For example "autotest test_suites". The ' - 'filter does not support regex.'), - default=[]) - parser.add_argument('--image_filters', nargs='+', dest='image_filters', - help=('Enter space deliminated filters on image name. ' - 'For example "nyan 38 6566", search will use ' - 'regex to match each filter. Do not use filters ' - 'with mixed letter and number, e.g., R38.'), - default=[]) - parser.add_argument('-d', '--detect_duplicated_stage', action='store_true', - dest='detect_duplicated_stage', - help=('Set to True to detect if an artifacts for a same' - ' build was staged in multiple devservers. 
' - 'Default is True.'), - default=False) - parser.add_argument('-v', action='store_true', dest='verbose', - default=False, - help='-v to print out ALL entries.') - options = parser.parse_args() - if options.verbose: - logging.getLogger().setLevel(logging.INFO) - - if options.last: - end_time = datetime.datetime.now() - start_time = end_time - datetime.timedelta(seconds=3600 * options.last) - else: - start_time = datetime.datetime.strptime(options.start, - time_utils.TIME_FMT) - end_time = datetime.datetime.strptime(options.end, time_utils.TIME_FMT) - logging.info('Searching devserver calls from %s to %s', start_time, - end_time) - - devservers = options.devservers - if not devservers: - devserver_urls = global_config.global_config.get_config_value( - 'CROS', 'dev_server', type=list, default=[]) - devservers = [] - for url in devserver_urls: - match = re.match('http://([^:]*):*\d*', url) - devservers.append(match.groups(0)[0] if match else url) - logging.info('Found devservers: %s', devservers) - - regex_constraints = [] - for filter in options.image_filters: - regex_constraints.append(('image', '.*%s.*' % filter)) - calls = [] - for devserver in devservers: - calls.extend(get_calls(start_time, end_time, options.artifact_filters, - regex_constraints, devserver=devserver)) - - print_call_details(calls, options.verbose) - - if options.detect_duplicated_stage: - detect_duplicated_stage(calls) - - -if __name__ == '__main__': - main() diff --git a/site_utils/es_reindex.py b/site_utils/es_reindex.py deleted file mode 100755 index 8a21a38ccf..0000000000 --- a/site_utils/es_reindex.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/python - -# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - - -"""This script copies all data from one index into another, and updates the -alias to point to the new index. - -usage: es_reindex.py [-h] [--host HOST] [--port PORT] [--old OLD] - [--new NEW] [--alias ALIAS] - -optional arguments: - -h, --help show this help message and exit - --host HOST name of ES server. - --port PORT - --old OLD Name of the old index. - --new NEW Name of the new index. - --alias ALIAS alias to be pointed to the new index. - -""" - -import argparse - -import common -from elasticsearch import Elasticsearch -from elasticsearch import helpers -from autotest_lib.client.common_lib.cros.graphite import autotest_es - - -def main(): - """main script. """ - - parser = argparse.ArgumentParser() - parser.add_argument('--host', type=str, dest='host', - help='name of ES server.') - parser.add_argument('--port', type=str, dest='port', default=9200) - parser.add_argument('--old', type=str, dest='old', - help='Name of the old index.') - parser.add_argument('--new', type=str, dest='new', - help='Name of the new index.') - parser.add_argument('--alias', type=str, dest='alias', - help='alias to be pointed to the new index.') - - options = parser.parse_args() - - query = {'query' : {'match_all' : {}}, - 'size': 1} - - result = autotest_es.execute_query(index=options.old, host=options.host, - port=options.port, query) - print 'Total number of records in index %s: %d' % (options.old, - result.total) - - print ('Re-index: %s to index: %s for server %s:%s' % - (options.old, options.new, options.host, options.port)) - - client = Elasticsearch(hosts=[{'host': options.host, 'port': options.port}]) - helpers.reindex(client, options.old, options.new) - print 'reindex completed.' 
- - print 'Checking records in the new index...' - result = es.execute_query(index=options.new, host=options.host, - port=options.port, query) - print 'Total number of records in index %s: %d' % (options.new, - result.total) - - # count_new can be larger than count if new records are added during - # reindexing. This check only tries to make sure no record was lost. - if count > count_new: - raise Exception('Error! There are %d records missing after reindexing. ' - 'Alias will not be updated to the new index. You might ' - 'want to try reindex again.' % - (count - count_new)) - - body = {'actions': [{'remove': {'alias': options.alias, - 'index': options.old}}, - {'add': {'alias': options.alias, - 'index': options.new}} - ] - } - client.indices.update_aliases(body=body) - print 'alias is updated.' - print ('Please verify the new index is working before deleting old index ' - 'with command:\n.curl -XDELETE %s:%s/%s' % - (options.host, options.port, options.old)) - - -if __name__ == '__main__': - main() diff --git a/site_utils/gs_offloader.py b/site_utils/gs_offloader.py index 95b31b60ff..177a3a6ed3 100755 --- a/site_utils/gs_offloader.py +++ b/site_utils/gs_offloader.py @@ -34,7 +34,6 @@ import common from autotest_lib.client.common_lib import file_utils from autotest_lib.client.common_lib import global_config from autotest_lib.client.common_lib import utils -from autotest_lib.client.common_lib.cros.graphite import autotest_es from autotest_lib.site_utils import job_directories from autotest_lib.site_utils import cloud_console_client from autotest_lib.tko import models @@ -543,11 +542,6 @@ class GSOffloader(BaseGSOffloader): m_any_error = 'chromeos/autotest/errors/gs_offloader/any_error' metrics.Counter(m_any_error).increment(fields=metrics_fields) - e.es_metadata['time_used_sec'] = time.time() - e.start_time - autotest_es.post(use_http=True, - type_str=GS_OFFLOADER_FAILURE_TYPE, - metadata=e.es_metadata) - # Rewind the log files for stdout and stderr and log # their contents. stdout_file.seek(0) @@ -608,10 +602,6 @@ class GSOffloader(BaseGSOffloader): if process.returncode != 0: raise error_obj _emit_offload_metrics(dir_entry) - es_metadata['time_used_sec'] = time.time() - start_time - autotest_es.post(use_http=True, - type_str=GS_OFFLOADER_SUCCESS_TYPE, - metadata=es_metadata) if self._console_client: gcs_uri = os.path.join(gs_path, diff --git a/site_utils/host_history.py b/site_utils/host_history.py deleted file mode 100755 index 8f4d145a9e..0000000000 --- a/site_utils/host_history.py +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -# This file defines script for getting host_history for DUTs in Autotest. - -"""Script for checking host history for a selected group of hosts. - -Currently only supports aggregating stats for each host. - -Example usage: - python host_history.py -n 10000 -l 24 --board=daisy - -Output: - - trying to get all duts... - making the query... - found all duts. Time to get host_history. 
- usage stats for host: chromeos2-row5-rack1-host6 - 2014-07-24 10:24:07 - 2014-07-25 10:24:07 - Verifying: 0.00 % - Running: 0.00 % - Ready: 100.00 % - Repairing: 0.00 % - Repair Failed: 0.00 % - Cleaning: 0.00 % - Pending: 0.00 % - Resetting: 0.00 % - Provisioning: 0.00 % - Locked: 0.00 % - - -- --- ---- ----- ---- --- -- - - -Example usage2: more than one host: - python host_history.py -n 1000 -l 2 \ - --hosts chromeos2-row5-rack4-host6 chromeos4-row12-rack11-host2 - - ['chromeos2-row5-rack4-host6', 'chromeos4-row12-rack11-host2'] - found all duts. Time to get host_history. - usage stats for host: chromeos2-row5-rack4-host6 - 2014-07-25 13:02:22 - 2014-07-25 15:02:22 - Num entries found in this interval: 0 - Verifying: 0.00 % - Running: 0.00 % - Ready: 100.00 % - Repairing: 0.00 % - Repair Failed: 0.00 % - Cleaning: 0.00 % - Pending: 0.00 % - Resetting: 0.00 % - Provisioning: 0.00 % - Locked: 0.00 % - - -- --- ---- ----- ---- --- -- - - - usage stats for host: chromeos4-row12-rack11-host2 - 2014-07-25 13:02:22 - 2014-07-25 15:02:22 - Num entries found in this interval: 138 - Verifying: 0.00 % - Running: 70.45 % - Ready: 17.79 % - Repairing: 0.00 % - Repair Failed: 0.00 % - Cleaning: 0.00 % - Pending: 1.24 % - Resetting: 10.78 % - Provisioning: 0.00 % - Locked: 0.00 % - - -- --- ---- ----- ---- --- -- - -""" - -import argparse -import time -import traceback - -import common -from autotest_lib.client.common_lib import time_utils -from autotest_lib.site_utils import host_history_utils - - -def print_all_stats(results, labels, t_start, t_end): - """Prints overall stats followed by stats for each host. - - @param results: A list of tuples of three elements. - 1st element: String representing report for individual host. - 2nd element: An ordered dictionary with - key as (t_start, t_end) and value as (status, metadata) - status = status of the host. e.g. 'Repair Failed' - t_start is the beginning of the interval where the DUT's has - that status - t_end is the end of the interval where the DUT has that - status - metadata: A dictionary of other metadata, e.g., - {'task_id':123, 'task_name':'Reset'} - 3rd element: hostname of the dut. - @param labels: A list of labels useful for describing the group - of hosts these overall stats represent. - @param t_start: beginning of time period we are interested in. - @param t_end: end of time period we are interested in. - """ - result_strs, stat_intervals_lst, hostname = zip(*results) - overall_report_str = host_history_utils.get_overall_report( - labels, t_start, t_end, stat_intervals_lst) - # Print the overall stats - print overall_report_str - # Print the stats for each individual host. - for result_str in result_strs: - print result_str - - -def get_host_history(input): - """Gets the host history. - - @param input: A dictionary of input arguments to - host_history_utils.host_history_stats. - Must contain these keys: - 't_start', - 't_end', - 'hostname', - 'size,' - 'print_each_interval' - @returns: - result_str: String reporting history for specific host. - stat_intervals: A ordered dictionary with - key as (t_start, t_end) and value as (status, metadata) - status = status of the host. e.g. 
'Repair Failed' - t_start is the beginning of the interval where the DUT's has - that status - t_end is the end of the interval where the DUT has that - status - metadata: A dictionary of other metadata, e.g., - {'task_id':123, 'task_name':'Reset'} - """ - try: - result_str, stat_intervals = host_history_utils.get_report_for_host( - **input) - return result_str, stat_intervals, input['hostname'] - except Exception as e: - # In case any process throws an Exception, we want to see it. - print traceback.print_exc() - return None, None, None - - -def get_results(start_time, end_time, hosts=None, board=None, pool=None, - verbose=False): - """Get history results of specified hosts or board/pool. - - If hosts is set to None, all hosts are used, filtered by the board and pool - constraints. If board is not provided, all boards are included. If pool is - not provided, all pools are included. - If a list of hosts is provided, the board and pool constraints are ignored. - - @param hosts: A list of hosts to search for history. Default is None. - @param board: board type of hosts. Default is None. - @param pool: pool type of hosts. Default is None. - @param start_time: start time to search for history, can be string value or - epoch time. - @param end_time: end time to search for history, can be string value or - epoch time. - @param verbose: True to print out detail intervals of host history. - - @returns: A dictionary of host history. - """ - assert start_time and end_time - start_time = time_utils.to_epoch_time(start_time) - end_time = time_utils.to_epoch_time(end_time) - assert start_time < end_time - - return host_history_utils.get_report(t_start=start_time, t_end=end_time, - hosts=hosts, board=board, pool=pool, - print_each_interval=verbose) - - -def get_history_details(start_time, end_time, hosts=None, board=None, - pool=None): - """Get the details of host history. - - The return is a dictionary of host history for each host, for example, - {'172.22.33.51': [{'status': 'Resetting' - 'start_time': '2014-08-07 10:02:16', - 'end_time': '2014-08-07 10:03:16', - 'log_url': 'http://autotest/reset-546546/debug', - 'task_id': 546546}, - {'status': 'Running' - 'start_time': '2014-08-07 10:03:18', - 'end_time': '2014-08-07 10:13:00', - 'log_url': ('http://%s/tko/retrieve_logs.cgi?job=/' - 'results/16853-debug/172.22.33.51'), - 'job_id': 16853} - ] - } - @param start_time: start time to search for history, can be string value or - epoch time. - @param end_time: end time to search for history, can be string value or - epoch time. - @param hosts: A list of hosts to search for history. Default is None. - @param board: board type of hosts. Default is None. - @param pool: pool type of hosts. Default is None. - @returns: A dictionary of the host history details. - """ - results = get_results(start_time=start_time, end_time=end_time, hosts=hosts, - board=board, pool=pool) - if not results: - # No host found. - return None - all_history = {} - for result_str, status_intervals, hostname in results: - if hostname: - all_history[hostname] = host_history_utils.build_history( - hostname, status_intervals) - return all_history - - -def main(): - """main script. 
""" - t_now = time.time() - t_now_minus_one_day = t_now - 3600 * 24 - parser = argparse.ArgumentParser() - parser.add_argument('-v', action='store_true', dest='verbose', - default=False, - help='-v to print out ALL entries.') - parser.add_argument('-l', type=float, dest='last', - help='last hours to search results across', - default=None) - parser.add_argument('--board', type=str, dest='board', - help='restrict query by board, not implemented yet', - default=None) - parser.add_argument('--pool', type=str, dest='pool', - help='restrict query by pool, not implemented yet', - default=None) - parser.add_argument('--hosts', nargs='+', dest='hosts', - help='Enter space deliminated hostnames', - default=[]) - parser.add_argument('--start', type=str, dest='start', - help=('Enter start time as: yyyy-mm-dd hh:mm:ss,' - 'defualts to 24h ago.'), - default=time_utils.epoch_time_to_date_string( - t_now_minus_one_day)) - parser.add_argument('--end', type=str, dest='end', - help=('Enter end time in as: yyyy-mm-dd hh:mm:ss,' - 'defualts to current time.'), - default=time_utils.epoch_time_to_date_string(t_now)) - options = parser.parse_args() - - if options.last: - start_time = t_now - 3600 * options.last - end_time = t_now - else: - start_time = time_utils.to_epoch_time(options.start) - end_time = time_utils.to_epoch_time(options.end) - - results = get_results(hosts=options.hosts, - board=options.board, - pool=options.pool, - start_time=start_time, - end_time=end_time, - verbose=options.verbose) - labels = [] - if options.board: - labels.append('board:%s' % (options.board)) - if options.pool: - labels.append('pool:%s' % (options.pool)) - print_all_stats(results, labels, start_time, end_time) - - -if __name__ == '__main__': - main() diff --git a/site_utils/host_history_utils.py b/site_utils/host_history_utils.py deleted file mode 100644 index 0a4e83fdcd..0000000000 --- a/site_utils/host_history_utils.py +++ /dev/null @@ -1,687 +0,0 @@ -# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -# This file contains utility functions for host_history. - -import collections -import copy -import multiprocessing.pool -from itertools import groupby - -import common -from autotest_lib.client.common_lib import time_utils -from autotest_lib.client.common_lib.cros.graphite import autotest_es -from autotest_lib.frontend import setup_django_environment -from autotest_lib.frontend.afe import models -from autotest_lib.site_utils import host_label_utils -from autotest_lib.site_utils import job_history - - -_HOST_HISTORY_TYPE = 'host_history' -_LOCK_HISTORY_TYPE = 'lock_history' - -# The maximum number of days that the script will lookup for history. -_MAX_DAYS_FOR_HISTORY = 90 - -class NoHostFoundException(Exception): - """Exception raised when no host is found to search for history. - """ - - -def get_matched_hosts(board, pool): - """Get duts with matching board and pool labels from metaDB. - - @param board: board of DUT, set to None if board doesn't need to match. - @param pool: pool of DUT, set to None if pool doesn't need to match. - @return: A list of duts that match the specified board and pool. - """ - labels = [] - if pool: - labels.append('pool:%s' % pool) - if board: - labels.append('board:%s' % board) - host_labels = host_label_utils.get_host_labels(labels=labels) - return host_labels.keys() - - -def prepopulate_dict(keys, value, extras=None): - """Creates a dictionary with val=value for each key. 
- - @param keys: list of keys - @param value: the value of each entry in the dict. - @param extras: list of additional keys - @returns: dictionary - """ - result = collections.OrderedDict() - extra_keys = tuple(extras if extras else []) - for key in keys + extra_keys: - result[key] = value - return result - - -def lock_history_to_intervals(initial_lock_val, t_start, t_end, lock_history): - """Converts lock history into a list of intervals of locked times. - - @param initial_lock_val: Initial value of the lock (False or True) - @param t_start: beginning of the time period we are interested in. - @param t_end: end of the time period we are interested in. - @param lock_history: Result of querying es for locks (dict) - This dictionary should contain keys 'locked' and 'time_recorded' - @returns: Returns a list of tuples where the elements of each tuples - represent beginning and end of intervals of locked, respectively. - """ - locked_intervals = [] - t_prev = t_start - state_prev = initial_lock_val - for entry in lock_history.hits: - t_curr = entry['time_recorded'] - - #If it is locked, then we put into locked_intervals - if state_prev: - locked_intervals.append((t_prev, t_curr)) - - # update vars - t_prev = t_curr - state_prev = entry['locked'] - if state_prev: - locked_intervals.append((t_prev, t_end)) - return locked_intervals - - -def find_most_recent_entry_before(t, type_str, hostname, fields): - """Returns the fields of the most recent entry before t. - - @param t: time we are interested in. - @param type_str: _type in esdb, such as 'host_history' (string) - @param hostname: hostname of DUT (string) - @param fields: list of fields we are interested in - @returns: time, field_value of the latest entry. - """ - # History older than 90 days are ignored. This helps the ES query faster. - t_epoch = time_utils.to_epoch_time(t) - result = autotest_es.query( - fields_returned=fields, - equality_constraints=[('_type', type_str), - ('hostname', hostname)], - range_constraints=[('time_recorded', - t_epoch-3600*24*_MAX_DAYS_FOR_HISTORY, t_epoch)], - size=1, - sort_specs=[{'time_recorded': 'desc'}]) - if result.total > 0: - return result.hits[0] - return {} - - -def get_host_history_intervals(input): - """Gets stats for a host. - - This method uses intervals found in metaDB to build a full history of the - host. The intervals argument contains a list of metadata from querying ES - for records between t_start and t_end. To get the status from t_start to - the first record logged in ES, we need to look back to the last record - logged in ES before t_start. - - @param input: A dictionary of input args, which including following args: - t_start: beginning of time period we are interested in. - t_end: end of time period we are interested in. - hostname: hostname for the host we are interested in (string) - intervals: intervals from ES query. - @returns: dictionary, num_entries_found - dictionary of status: time spent in that status - num_entries_found: number of host history entries - found in [t_start, t_end] - - """ - t_start = input['t_start'] - t_end = input['t_end'] - hostname = input['hostname'] - intervals = input['intervals'] - lock_history_recent = find_most_recent_entry_before( - t=t_start, type_str=_LOCK_HISTORY_TYPE, hostname=hostname, - fields=['time_recorded', 'locked']) - # I use [0] and [None] because lock_history_recent's type is list. 
- t_lock = lock_history_recent.get('time_recorded', None) - t_lock_val = lock_history_recent.get('locked', None) - t_metadata = find_most_recent_entry_before( - t=t_start, type_str=_HOST_HISTORY_TYPE, hostname=hostname, - fields=None) - t_host = t_metadata.pop('time_recorded', None) - t_host_stat = t_metadata.pop('status', None) - status_first = t_host_stat if t_host else 'Ready' - t = min([t for t in [t_lock, t_host, t_start] if t]) - - t_epoch = time_utils.to_epoch_time(t) - t_end_epoch = time_utils.to_epoch_time(t_end) - lock_history_entries = autotest_es.query( - fields_returned=['locked', 'time_recorded'], - equality_constraints=[('_type', _LOCK_HISTORY_TYPE), - ('hostname', hostname)], - range_constraints=[('time_recorded', t_epoch, t_end_epoch)], - sort_specs=[{'time_recorded': 'asc'}]) - - # Validate lock history. If an unlock event failed to be recorded in metadb, - # lock history will show the dut being locked while the host's status still - # changes over time. This check tries to remove the lock event in lock - # history if: - # 1. There is only one entry in lock_history_entries (a good enough - # assumption that avoids over-complicating the code). - # 2. The host status has changed after the lock history starts as locked. - if (len(lock_history_entries.hits) == 1 and t_lock_val and - len(intervals) > 1): - locked_intervals = None - print('Lock history of dut %s is ignored; the dut may have missing ' - 'data in lock history in metadb. Locking and unlocking the dut ' - 'in AFE will force the lock history to be updated in metadb.' - % hostname) - else: - locked_intervals = lock_history_to_intervals(t_lock_val, t, t_end, - lock_history_entries) - num_entries_found = len(intervals) - t_prev = t_start - status_prev = status_first - metadata_prev = t_metadata - intervals_of_statuses = collections.OrderedDict() - - for entry in intervals: - metadata = entry.copy() - t_curr = metadata.pop('time_recorded') - status_curr = metadata.pop('status') - intervals_of_statuses.update(calculate_status_times( - t_prev, t_curr, status_prev, metadata_prev, locked_intervals)) - # Update vars - t_prev = t_curr - status_prev = status_curr - metadata_prev = metadata - - # Do the final interval as well. - intervals_of_statuses.update(calculate_status_times( - t_prev, t_end, status_prev, metadata_prev, locked_intervals)) - return hostname, intervals_of_statuses, num_entries_found - - -def calculate_total_times(intervals_of_statuses): - """Calculates total times in each status. - - @param intervals_of_statuses: ordereddict where key=(ti, tf) and the value - is the status info for that interval. - @returns: dictionary where key=status value=time spent in that status - """ - total_times = prepopulate_dict(models.Host.Status.names, 0.0, - extras=['Locked']) - for key, status_info in intervals_of_statuses.iteritems(): - ti, tf = key - total_times[status_info['status']] += tf - ti - return total_times - - -def aggregate_hosts(intervals_of_statuses_list): - """Aggregates history of multiple hosts. - - @param intervals_of_statuses_list: A list of dictionaries where keys - are tuple (ti, tf), and value is the status along with other metadata. - @returns: A dictionary where keys are strings, e.g. 'status' and - value is total time spent in that status among all hosts.
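A minimal sketch of the aggregation (hypothetical numbers, and assuming 'Ready' and 'Running' appear in models.Host.Status.names): two hosts that each spend 60s in Ready and 40s in Running add up as follows.

    iv = {(0, 60): {'status': 'Ready', 'metadata': {}},
          (60, 100): {'status': 'Running', 'metadata': {}}}
    stats_all, num_hosts = aggregate_hosts([iv, iv])
    # stats_all['Ready'] == 120.0, stats_all['Running'] == 80.0, num_hosts == 2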
- """ - stats_all = prepopulate_dict(models.Host.Status.names, 0.0, - extras=['Locked']) - num_hosts = len(intervals_of_statuses_list) - for intervals_of_statuses in intervals_of_statuses_list: - total_times = calculate_total_times(intervals_of_statuses) - for status, delta in total_times.iteritems(): - stats_all[status] += delta - return stats_all, num_hosts - - -def get_stats_string_aggregate(labels, t_start, t_end, aggregated_stats, - num_hosts): - """Returns string reporting overall host history for a group of hosts. - - @param labels: A list of labels useful for describing the group - of hosts these overall stats represent. - @param t_start: beginning of time period we are interested in. - @param t_end: end of time period we are interested in. - @param aggregated_stats: A dictionary where keys are string, e.g. 'status' - value is total time spent in that status among all hosts. - @returns: string representing the aggregate stats report. - """ - result = 'Overall stats for hosts: %s \n' % (', '.join(labels)) - result += ' %s - %s \n' % (time_utils.epoch_time_to_date_string(t_start), - time_utils.epoch_time_to_date_string(t_end)) - result += ' Number of total hosts: %s \n' % (num_hosts) - # This is multiplied by time_spent to get percentage_spent - multiplication_factor = 100.0 / ((t_end - t_start) * num_hosts) - for status, time_spent in aggregated_stats.iteritems(): - # Normalize by the total time we are interested in among ALL hosts. - spaces = ' ' * (15 - len(status)) - percent_spent = multiplication_factor * time_spent - result += ' %s: %s %.2f %%\n' % (status, spaces, percent_spent) - result += '- -- --- ---- ----- ---- --- -- -\n' - return result - - -def get_overall_report(label, t_start, t_end, intervals_of_statuses_list): - """Returns string reporting overall host history for a group of hosts. - - @param label: A string that can be useful for showing what type group - of hosts these overall stats represent. - @param t_start: beginning of time period we are interested in. - @param t_end: end of time period we are interested in. - @param intervals_of_statuses_list: A list of dictionaries where keys - are tuple (ti, tf), and value is the status along with other metadata, - e.g., task_id, task_name, job_id etc. - """ - stats_all, num_hosts = aggregate_hosts( - intervals_of_statuses_list) - return get_stats_string_aggregate( - label, t_start, t_end, stats_all, num_hosts) - - -def get_intervals_for_host(t_start, t_end, hostname): - """Gets intervals for the given. - - Query metaDB to return all intervals between given start and end time. - Note that intervals found in metaDB may miss the history from t_start to - the first interval found. - - @param t_start: beginning of time period we are interested in. - @param t_end: end of time period we are interested in. - @param hosts: A list of hostnames to look for history. - @param board: Name of the board to look for history. Default is None. - @param pool: Name of the pool to look for history. Default is None. - @returns: A dictionary of hostname: intervals. 
- """ - t_start_epoch = time_utils.to_epoch_time(t_start) - t_end_epoch = time_utils.to_epoch_time(t_end) - host_history_entries = autotest_es.query( - fields_returned=None, - equality_constraints=[('_type', _HOST_HISTORY_TYPE), - ('hostname', hostname)], - range_constraints=[('time_recorded', t_start_epoch, - t_end_epoch)], - sort_specs=[{'time_recorded': 'asc'}]) - return host_history_entries.hits - - -def get_intervals_for_hosts(t_start, t_end, hosts=None, board=None, pool=None): - """Gets intervals for given hosts or board/pool. - - Query metaDB to return all intervals between given start and end time. - If a list of hosts is provided, the board and pool constraints are ignored. - If hosts is set to None, and board or pool is set, this method will attempt - to search host history with labels for all hosts, to help the search perform - faster. - If hosts, board and pool are all set to None, return intervals for all - hosts. - Note that intervals found in metaDB may miss the history from t_start to - the first interval found. - - @param t_start: beginning of time period we are interested in. - @param t_end: end of time period we are interested in. - @param hosts: A list of hostnames to look for history. - @param board: Name of the board to look for history. Default is None. - @param pool: Name of the pool to look for history. Default is None. - @returns: A dictionary of hostname: intervals. - """ - hosts_intervals = {} - if hosts: - for host in hosts: - hosts_intervals[host] = get_intervals_for_host(t_start, t_end, host) - else: - hosts = get_matched_hosts(board, pool) - if not hosts: - raise NoHostFoundException('No host is found for board:%s, pool:%s.' - % (board, pool)) - equality_constraints=[('_type', _HOST_HISTORY_TYPE),] - if board: - equality_constraints.append(('labels', 'board:'+board)) - if pool: - equality_constraints.append(('labels', 'pool:'+pool)) - t_start_epoch = time_utils.to_epoch_time(t_start) - t_end_epoch = time_utils.to_epoch_time(t_end) - results = autotest_es.query( - equality_constraints=equality_constraints, - range_constraints=[('time_recorded', t_start_epoch, - t_end_epoch)], - sort_specs=[{'hostname': 'asc'}]) - results_group_by_host = {} - for hostname,intervals_for_host in groupby(results.hits, - lambda h: h['hostname']): - results_group_by_host[hostname] = intervals_for_host - for host in hosts: - intervals = results_group_by_host.get(host, None) - # In case the host's board or pool label was modified after - # the last status change event was reported, we need to run a - # separate query to get its history. That way the host's - # history won't be shown as blank. - if not intervals: - intervals = get_intervals_for_host(t_start, t_end, host) - hosts_intervals[host] = intervals - return hosts_intervals - - -def get_report(t_start, t_end, hosts=None, board=None, pool=None, - print_each_interval=False): - """Gets history for given hosts or board/pool - - If a list of hosts is provided, the board and pool constraints are ignored. - - @param t_start: beginning of time period we are interested in. - @param t_end: end of time period we are interested in. - @param hosts: A list of hostnames to look for history. - @param board: Name of the board to look for history. Default is None. - @param pool: Name of the pool to look for history. Default is None. - @param print_each_interval: True display all intervals, default is False. - @returns: stats report for this particular host. 
The report is a list of - tuples (stat_string, intervals, hostname), where intervals is a sorted - dictionary. - """ - if hosts: - board = None - pool = None - - hosts_intervals = get_intervals_for_hosts(t_start, t_end, hosts, board, - pool) - history = {} - thread_pool = multiprocessing.pool.ThreadPool(processes=16) - args = [] - for hostname, intervals in hosts_intervals.items(): - args.append({'t_start': t_start, - 't_end': t_end, - 'hostname': hostname, - 'intervals': intervals}) - results = thread_pool.imap_unordered(get_host_history_intervals, args) - for hostname, intervals, count in results: - history[hostname] = (intervals, count) - report = [] - for hostname, intervals in history.items(): - total_times = calculate_total_times(intervals[0]) - stats = get_stats_string( - t_start, t_end, total_times, intervals[0], hostname, - intervals[1], print_each_interval) - report.append((stats, intervals[0], hostname)) - return report - - -def get_report_for_host(t_start, t_end, hostname, print_each_interval): - """Gets stats report for a host. - - @param t_start: beginning of time period we are interested in. - @param t_end: end of time period we are interested in. - @param hostname: hostname for the host we are interested in (string) - @param print_each_interval: True or False, whether we want to - display all intervals - @returns: A tuple of (stats report string, intervals of statuses) for - this particular host. - """ - # Search for status change intervals during given time range. - intervals = get_intervals_for_host(t_start, t_end, hostname) - num_entries_found = len(intervals) - # Update the status change intervals with status before the first entry and - # host's lock history. - _, intervals_of_statuses, _ = get_host_history_intervals( - {'t_start': t_start, - 't_end': t_end, - 'hostname': hostname, - 'intervals': intervals}) - total_times = calculate_total_times(intervals_of_statuses) - return (get_stats_string( - t_start, t_end, total_times, intervals_of_statuses, - hostname, num_entries_found, print_each_interval), - intervals_of_statuses) - - -def get_stats_string(t_start, t_end, total_times, intervals_of_statuses, - hostname, num_entries_found, print_each_interval): - """Returns string reporting host_history for this host. - @param t_start: beginning of time period we are interested in. - @param t_end: end of time period we are interested in. - @param total_times: dictionary where key=status, - value=(time spent in that status) - @param intervals_of_statuses: dictionary where each key is a tuple (ti, tf), - and the value is the status along with other metadata.
- @param hostname: hostname for the host we are interested in (string) - @param num_entries_found: Number of entries found for the host in es - @param print_each_interval: boolean, whether to print each interval - """ - delta = t_end - t_start - result = 'usage stats for host: %s \n' % (hostname) - result += ' %s - %s \n' % (time_utils.epoch_time_to_date_string(t_start), - time_utils.epoch_time_to_date_string(t_end)) - result += ' Num entries found in this interval: %s\n' % (num_entries_found) - for status, value in total_times.iteritems(): - spaces = (15 - len(status)) * ' ' - result += ' %s: %s %.2f %%\n' % (status, spaces, 100*value/delta) - result += '- -- --- ---- ----- ---- --- -- -\n' - if print_each_interval: - for interval, status_info in intervals_of_statuses.iteritems(): - t0, t1 = interval - t0_string = time_utils.epoch_time_to_date_string(t0) - t1_string = time_utils.epoch_time_to_date_string(t1) - status = status_info['status'] - delta = int(t1-t0) - id_info = status_info['metadata'].get( - 'task_id', status_info['metadata'].get('job_id', '')) - result += (' %s : %s %-15s %-10s %ss\n' % - (t0_string, t1_string, status, id_info, delta)) - return result - - -def calculate_status_times(t_start, t_end, int_status, metadata, - locked_intervals): - """Returns a dictionary of intervals and the statuses associated with them. - - If the dut is in status Ready, i.e., int_status==Ready, the lock history - should be applied so that the time period when the dut is locked is treated - as not available. Any other status indicates that the dut is doing something - and being used. `Repair Failed` and Repairing are not checked against lock - status, since these two statuses indicate the dut is not available anyway. - - @param t_start: start time - @param t_end: end time - @param int_status: status of [t_start, t_end] if not locked - @param metadata: metadata of the status change, e.g., task_id, task_name. - @param locked_intervals: list of tuples denoting intervals of locked states - @returns: dictionary where key = (t_interval_start, t_interval_end), - val = (status, metadata) - t_interval_start: beginning of interval for that status - t_interval_end: end of the interval for that status - status: string such as 'Repair Failed', 'Locked', etc. - metadata: A dictionary of metadata, e.g., - {'task_id':123, 'task_name':'Reset'} - """ - statuses = collections.OrderedDict() - - prev_interval_end = t_start - - # TODO: Allow more information here in the info/locked status. - status_info = {'status': int_status, - 'metadata': metadata} - locked_info = {'status': 'Locked', - 'metadata': {}} - if not locked_intervals: - statuses[(t_start, t_end)] = status_info - return statuses - for lock_start, lock_end in locked_intervals: - if prev_interval_end >= t_end: - break - if lock_start > t_end: - # optimization to break early - # case 0 - # Timeline of status change: t_start t_end - # Timeline of lock action: lock_start lock_end - break - elif lock_end < prev_interval_end: - # case 1 - # prev_interval_end t_end - # lock_start lock_end - continue - elif lock_end <= t_end and lock_start >= prev_interval_end: - # case 2 - # prev_interval_end t_end - # lock_start lock_end - # The lock happened in the middle, while the host stayed in the same - # status; consider the lock to have no effect on host history.
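Concretely, with the numbers used by the unit test later in this diff (t_start=0, t_end=10, int_status='Ready', locked_intervals=[(2, 4), (4, 8)]), this branch fires for both locks and the catch-all after the loop adds the tail, yielding:

    {(0, 4): {'status': 'Ready', 'metadata': {}},
     (4, 8): {'status': 'Ready', 'metadata': {}},
     (8, 10): {'status': 'Ready', 'metadata': {}}}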
- statuses[(prev_interval_end, lock_end)] = status_info - prev_interval_end = lock_end - elif lock_end > prev_interval_end and lock_start < prev_interval_end: - # case 3 - # prev_interval_end t_end - # lock_start lock_end (or lock_end) - # If the host status changed in the middle of being locked, consider - # the new status change as part of the host history. - statuses[(prev_interval_end, min(lock_end, t_end))] = locked_info - prev_interval_end = lock_end - elif lock_start < t_end and lock_end > t_end: - # case 4 - # prev_interval_end t_end - # lock_start lock_end - # If the lock happens in the middle of a host status change, consider - # the lock to have no effect on the host history for that status. - statuses[(prev_interval_end, t_end)] = status_info - statuses[(lock_start, t_end)] = locked_info - prev_interval_end = t_end - # Otherwise we are in the case where lock_end < t_start OR - # lock_start > t_end, which means the lock doesn't apply. - if t_end > prev_interval_end: - # Avoid logging an empty interval when t_end == prev_interval_end. - statuses[(prev_interval_end, t_end)] = status_info - return statuses - - -def get_log_url(hostname, metadata): - """Compile a url to a job's debug log from the given metadata. - - @param hostname: Hostname of the dut. - @param metadata: A dictionary of other metadata, e.g., - {'task_id':123, 'task_name':'Reset'} - @return: Url of the debug log for special task or job url for test job. - """ - log_url = None - if 'task_id' in metadata and 'task_name' in metadata: - log_url = job_history.TASK_URL % {'hostname': hostname, - 'task_id': metadata['task_id'], - 'task_name': metadata['task_name']} - elif 'job_id' in metadata and 'owner' in metadata: - log_url = job_history.JOB_URL % {'hostname': hostname, - 'job_id': metadata['job_id'], - 'owner': metadata['owner']} - - return log_url - - -def build_history(hostname, status_intervals): - """Get host history information from given state intervals. - - @param hostname: Hostname of the dut. - @param status_intervals: An ordered dictionary with - key as (t_start, t_end) and value as (status, metadata) - status = status of the host, e.g. 'Repair Failed' - t_start is the beginning of the interval where the DUT has - that status - t_end is the end of the interval where the DUT has that - status - metadata: A dictionary of other metadata, e.g., - {'task_id':123, 'task_name':'Reset'} - @return: A list of host history entries, e.g., - [{'status': 'Resetting', - 'start_time': '2014-08-07 10:02:16', - 'end_time': '2014-08-07 10:03:16', - 'log_url': 'http://autotest/reset-546546/debug', - 'task_id': 546546}, - {'status': 'Running', - 'start_time': '2014-08-07 10:03:18', - 'end_time': '2014-08-07 10:13:00', - 'log_url': 'http://autotest/afe/#tab_id=view_job&object_id=1683', - 'job_id': 1683} - ] - """ - history = [] - for time_interval, status_info in status_intervals.items(): - start_time = time_utils.epoch_time_to_date_string(time_interval[0]) - end_time = time_utils.epoch_time_to_date_string(time_interval[1]) - interval = {'status': status_info['status'], - 'start_time': start_time, - 'end_time': end_time} - interval['log_url'] = get_log_url(hostname, status_info['metadata']) - interval.update(status_info['metadata']) - history.append(interval) - return history - - -def get_status_intervals(history_details): - """Get a list of status intervals from history details. - - This is the reverse of build_history above.
Caller gets the history - details from the RPC get_host_history, and uses this method to get the list - of status intervals, which can be used to calculate stats from - host_history_utils.aggregate_hosts. - - @param history_details: A dictionary of host history for each host, e.g., - {'172.22.33.51': [{'status': 'Resetting', - 'start_time': '2014-08-07 10:02:16', - 'end_time': '2014-08-07 10:03:16', - 'log_url': 'http://autotest/reset-546546/debug', - 'task_id': 546546},] - } - @return: A list of dictionaries where keys are tuples (start_time, end_time), - and each value is a dictionary containing at least the key 'status'. - """ - status_intervals = [] - for host, history in history_details.iteritems(): - intervals = collections.OrderedDict() - for interval in history: - start_time = time_utils.to_epoch_time(interval['start_time']) - end_time = time_utils.to_epoch_time(interval['end_time']) - metadata = copy.deepcopy(interval) - metadata['hostname'] = host - intervals[(start_time, end_time)] = {'status': interval['status'], - 'metadata': metadata} - status_intervals.append(intervals) - return status_intervals - - -def get_machine_utilization_rate(stats): - """Get machine utilization rate from given stats. - - @param stats: A dictionary with a status as key and the total number of - seconds spent in that status as value. - @return: The fraction of time (0 to 1) when the dut was running test jobs. - """ - not_utilized_status = ['Repairing', 'Repair Failed', 'Ready', 'Verifying'] - excluded_status = ['Locked'] - total_time = 0 - total_time_not_utilized = 0.0 - for status, interval in stats.iteritems(): - if status in excluded_status: - continue - total_time += interval - if status in not_utilized_status: - total_time_not_utilized += interval - if total_time == 0: - # All duts are locked, assume MUR is 0% - return 0 - else: - return 1 - total_time_not_utilized/total_time - - -def get_machine_availability_rate(stats): - """Get machine availability rate from given stats. - - @param stats: A dictionary with a status as key and the total number of - seconds spent in that status as value. - @return: The fraction of time (0 to 1) when the dut was available to run - jobs. - """ - not_available_status = ['Repairing', 'Repair Failed', 'Verifying'] - excluded_status = ['Locked'] - total_time = 0 - total_time_not_available = 0.0 - for status, interval in stats.iteritems(): - if status in excluded_status: - continue - total_time += interval - if status in not_available_status: - total_time_not_available += interval - if total_time == 0: - # All duts are locked, assume MAR is 0% - return 0 - else: - return 1 - total_time_not_available/total_time diff --git a/site_utils/host_history_utils_unittest.py b/site_utils/host_history_utils_unittest.py deleted file mode 100755 index 5b6067e13b..0000000000 --- a/site_utils/host_history_utils_unittest.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/python -# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -import collections -import unittest - -import common -from autotest_lib.site_utils import host_history_utils - -class HostHistoryUtilsTests(unittest.TestCase): - """Test functions in host_history_utils. - """ - - def testCalculateStatusTimes(self): - """Test function calculate_status_times. - """ - # Locks in the middle do not affect the host history.
- locked_intervals = [(2, 4), (4, 8)] - results = host_history_utils.calculate_status_times( - t_start=0, t_end=10, int_status='Ready', metadata={}, - locked_intervals=locked_intervals) - expected = collections.OrderedDict( - [((0, 4), {'status': 'Ready', 'metadata': {}}), - ((4, 8), {'status': 'Ready', 'metadata': {}}), - ((8, 10), {'status': 'Ready', 'metadata': {}})]) - self.assertEqual(results, expected) - - locked_intervals = [(0, 4), (11, 14), (16, 18)] - results = host_history_utils.calculate_status_times( - t_start=10, t_end=15, int_status='Ready', metadata={}, - locked_intervals=locked_intervals) - expected = collections.OrderedDict( - [((10, 14), {'status': 'Ready', 'metadata': {}}), - ((14, 15), {'status': 'Ready', 'metadata': {}})]) - self.assertEqual(results, expected) - - locked_intervals = [(2, 4), (4, 8)] - results = host_history_utils.calculate_status_times( - t_start=0, t_end=10, int_status='Running', metadata={}, - locked_intervals=locked_intervals) - expected = collections.OrderedDict( - [((0, 4), {'status': 'Running', 'metadata': {}}), - ((4, 8), {'status': 'Running', 'metadata': {}}), - ((8, 10), {'status': 'Running', 'metadata': {}})]) - self.assertEqual(results, expected) - - locked_intervals = [(1, 8)] - results = host_history_utils.calculate_status_times( - t_start=2, t_end=5, int_status='Running', metadata={}, - locked_intervals=locked_intervals) - expected = collections.OrderedDict( - [((2, 5), {'status': 'Locked', 'metadata': {}})]) - self.assertEqual(results, expected) - - -if __name__ == '__main__': - unittest.main() diff --git a/site_utils/host_label_utils.py b/site_utils/host_label_utils.py deleted file mode 100755 index 461c962d2f..0000000000 --- a/site_utils/host_label_utils.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -""" -This script provides functions to: -1. collect: Collect all hosts and their labels to metaDB, can be scheduled - to run daily, e.g., - ./site_utils/host_label_utils.py collect -2. query: Query for hosts and their labels information at a given day, e.g., - ./site_utils/host_label_utils.py query -n 172.27.213.193 -l peppy -""" - -import argparse -import itertools -import logging -import pprint -import time - -import common -from autotest_lib.client.common_lib import time_utils -from autotest_lib.client.common_lib.cros.graphite import autotest_es -from autotest_lib.frontend import setup_django_environment -from autotest_lib.frontend.afe import models - - -# _type used for ES -_HOST_LABEL_TYPE = 'host_labels' -_HOST_LABEL_TIME_INDEX_TYPE = 'host_labels_time_index' - -def get_all_boards(labels=None): - """Get a list of boards from host labels. - - Scan through all labels of all duts and get all possible boards based on - labels of the form board:*. - - @param labels: A list of labels to filter hosts. - @return: A list of board names, e.g., ['peppy', 'daisy'] - """ - host_labels = get_host_labels(labels=labels) - board_labels = [[label[6:] for label in labels - if label.startswith('board:')] - for labels in host_labels.values()] - boards = list(set(itertools.chain.from_iterable(board_labels))) - return boards - - -def get_host_labels(days_back=0, hostname=None, labels=None): - """Get the labels for a given host or all hosts. - - @param days_back: Get the label info around that number of days back. The - default is 0, i.e., the latest label information.
- @param hostname: Name of the host; if set to None, return labels for all - hosts. Default is None. - @param labels: A list of labels to filter hosts. - @return: A dictionary of host labels, key is the hostname, and value is a - list of labels, e.g., - {'host1': ['board:daisy', 'pool:bvt']} - """ - # Search for the latest logged labels before the given days_back. - # Default is 0, which means the last time host labels were logged. - t_end = time.time() - days_back*24*3600 - results = autotest_es.query( - fields_returned=['time_index'], - equality_constraints=[('_type', _HOST_LABEL_TIME_INDEX_TYPE),], - range_constraints=[('time_index', None, t_end)], - size=1, - sort_specs=[{'time_index': 'desc'}]) - t_end_str = time_utils.epoch_time_to_date_string(t_end) - if results.total == 0: - logging.error('No label information was logged before %s.', t_end_str) - return - time_index = results.hits[0]['time_index'] - logging.info('Host labels were recorded at %s', - time_utils.epoch_time_to_date_string(time_index)) - - # Search for labels for a given host or all hosts, at time_index. - equality_constraints=[('_type', _HOST_LABEL_TYPE), - ('time_index', time_index),] - if hostname: - equality_constraints.append(('hostname', hostname)) - if labels: - for label in labels: - equality_constraints.append(('labels', label)) - results = autotest_es.query( - fields_returned=['hostname', 'labels'], - equality_constraints=equality_constraints) - - host_labels = {} - for hit in results.hits: - if 'labels' in hit: - host_labels[hit['hostname']] = hit['labels'] - - return host_labels - - -def collect_info(): - """Collect label info and report to metaDB. - """ - # time_index is to index all host labels collected together. It's - # converted to int to make search faster. - time_index = int(time.time()) - hosts = models.Host.objects.filter(invalid=False) - data_list = [] - for host in hosts: - info = {'_type': _HOST_LABEL_TYPE, - 'hostname': host.hostname, - 'labels': [label.name for label in host.labels.all()], - 'time_index': time_index} - data_list.append(info) - if not autotest_es.bulk_post(data_list, log_time_recorded=False): - raise Exception('Failed to upload host label info.') - - # After all host label information is logged, save the time stamp. - autotest_es.post(use_http=True, type_str=_HOST_LABEL_TIME_INDEX_TYPE, - metadata={'time_index': time_index}, - log_time_recorded=False) - logging.info('Finished collecting host labels for %d hosts.', len(hosts)) - - -def main(): - """Main script. - """ - parser = argparse.ArgumentParser() - parser.add_argument('action', - help=('collect or query. Action collect will collect ' - 'all hosts and their labels to metaDB. Action ' - 'query will query for hosts and their labels ' - 'information on a given day')) - parser.add_argument('-d', '--days_back', type=int, dest='days_back', - help=('Number of days before current time. Query will ' - 'get host label information collected before that' - ' time. The option is applicable to query only. ' - 'Defaults to 0, i.e., get the latest label info.'), - default=0) - parser.add_argument('-n', '--hostname', type=str, dest='hostname', - help=('Name of the host to query label information ' - 'for. The option is applicable to query only. ' - 'Defaults to None, i.e., return label info for all' - ' hosts.'), - default=None) - parser.add_argument('-l', '--labels', nargs='+', dest='labels', - help=('A list of labels to filter hosts. The option is ' - 'applicable to query only. 
Defaults to None.'), - default=None) - parser.add_argument('-v', '--verbose', action="store_true", dest='verbose', - help='Allow more detailed information to be shown.') - options = parser.parse_args() - - logging.getLogger().setLevel(logging.INFO if options.verbose - else logging.WARN) - if options.action == 'collect': - collect_info() - elif options.action == 'query': - host_labels = get_host_labels(options.days_back, options.hostname, - options.labels) - pprint.pprint(host_labels) - else: - logging.error('action %s is not supported, can only be collect or ' - 'query!', options.action) - - -if __name__ == '__main__': - main() diff --git a/site_utils/lxc/cleanup_if_fail.py b/site_utils/lxc/cleanup_if_fail.py index 14ef8746fb..09a3d5c372 100644 --- a/site_utils/lxc/cleanup_if_fail.py +++ b/site_utils/lxc/cleanup_if_fail.py @@ -3,14 +3,11 @@ # found in the LICENSE file. import logging -import socket import sys import common from autotest_lib.client.bin import utils from autotest_lib.client.common_lib import error -from autotest_lib.client.common_lib.cros.graphite import autotest_es -from autotest_lib.site_utils.lxc import constants def cleanup_if_fail(): @@ -49,24 +46,6 @@ def cleanup_if_fail(): except error.CmdError as e: logging.error(e) - try: - job_id = utils.get_function_arg_value( - func, 'job_id', args, kwargs) - except (KeyError, ValueError): - job_id = '' - metadata={'drone': socket.gethostname(), - 'job_id': job_id, - 'success': False} - # Record all args if job_id is not available. - if not job_id: - metadata['args'] = str(args) - if kwargs: - metadata.update(kwargs) - autotest_es.post( - use_http=True, - type_str=constants.CONTAINER_CREATE_METADB_TYPE, - metadata=metadata) - # Raise the cached exception with original backtrace. raise exc_info[0], exc_info[1], exc_info[2] return func_cleanup_if_fail diff --git a/site_utils/lxc/config.py b/site_utils/lxc/config.py index e071273976..dbff4165d8 100644 --- a/site_utils/lxc/config.py +++ b/site_utils/lxc/config.py @@ -203,20 +203,26 @@ class DeployConfigManager(object): return c - def __init__(self, container): + def __init__(self, container, config_file=None): """Initialize the deploy config manager. @param container: The container that needs the config deployed. - + @param config_file: An optional config file. For testing. """ self.container = container # If shadow config is used, the deployment procedure will skip some # special handling of config file, e.g., # 1. Set enable_master_ssh to False in autotest shadow config. # 2. Set ssh loglevel to ERROR for all hosts.
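For reference, a deploy config entry in that file has the shape exercised by the unit tests later in this diff (the paths here are hypothetical):

    [{
        'mount': True,
        'source': '/path/on/host',   # hypothetical mount source
        'target': '/target0',
        'readonly': True,
        'force_create': False
    }]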
- self.is_shadow_config = os.path.exists(SSP_DEPLOY_SHADOW_CONFIG_FILE) - config_file = (SSP_DEPLOY_SHADOW_CONFIG_FILE if self.is_shadow_config - else SSP_DEPLOY_CONFIG_FILE) + if config_file is None: + self.is_shadow_config = os.path.exists( + SSP_DEPLOY_SHADOW_CONFIG_FILE) + config_file = ( + SSP_DEPLOY_SHADOW_CONFIG_FILE if self.is_shadow_config + else SSP_DEPLOY_CONFIG_FILE) + else: + self.is_shadow_config = False + with open(config_file) as f: deploy_configs = json.load(f) self.deploy_configs = [self.validate(c) for c in deploy_configs @@ -391,6 +397,9 @@ class DeployConfigManager(object): if (mount_config.force_create and not os.path.exists(mount_config.source)): utils.run('mkdir -p %s' % mount_config.source) + self.container.mount_dir(mount_config.source, + mount_config.target, + mount_config.readonly) def deploy_post_start(self): diff --git a/site_utils/lxc/container_bucket.py b/site_utils/lxc/container_bucket.py index 1ea7e8606b..441ca34fcf 100644 --- a/site_utils/lxc/container_bucket.py +++ b/site_utils/lxc/container_bucket.py @@ -4,13 +4,11 @@ import logging import os -import socket import time import common from autotest_lib.client.bin import utils from autotest_lib.client.common_lib import error -from autotest_lib.client.common_lib.cros.graphite import autotest_es from autotest_lib.site_utils.lxc import Container from autotest_lib.site_utils.lxc import config as lxc_config from autotest_lib.site_utils.lxc import constants @@ -137,13 +135,6 @@ class ContainerBucket(object): new_path=self.container_path, snapshot=False, cleanup=force_cleanup) - # Report metadata about retry success. - autotest_es.post( - use_http=True, - type_str=constants.CONTAINER_CREATE_RETRY_METADB_TYPE, - metadata={'drone': socket.gethostname(), - 'name': name, - 'success': True}) return container @@ -330,9 +321,6 @@ class ContainerBucket(object): False), ] - for mount_config in deploy_config_manager.mount_configs: - mount_entries.append((mount_config.source, mount_config.target, - mount_config.readonly)) # Update container config to mount directories. 
for source, destination, readonly in mount_entries: container.mount_dir(source, destination, readonly) @@ -353,12 +341,5 @@ class ContainerBucket(object): container.verify_autotest_setup(job_folder) - autotest_es.post(use_http=True, - type_str=constants.CONTAINER_CREATE_METADB_TYPE, - metadata={'drone': socket.gethostname(), - 'job_id': job_id, - 'time_used': time.time() - start_time, - 'success': True}) - logging.debug('Test container %s is set up.', name) return container diff --git a/site_utils/lxc/container_unittest.py b/site_utils/lxc/container_unittest.py index 7aebfd8204..fe0b5a5086 100644 --- a/site_utils/lxc/container_unittest.py +++ b/site_utils/lxc/container_unittest.py @@ -15,9 +15,12 @@ from contextlib import contextmanager import common from autotest_lib.client.common_lib import error from autotest_lib.site_utils import lxc +from autotest_lib.site_utils.lxc import constants +from autotest_lib.site_utils.lxc import unittest_http from autotest_lib.site_utils.lxc import unittest_logging from autotest_lib.site_utils.lxc import utils as lxc_utils - +from autotest_lib.site_utils.lxc.unittest_container_bucket \ + import FastContainerBucket options = None @@ -26,13 +29,12 @@ class ContainerTests(unittest.TestCase): @classmethod def setUpClass(cls): - logging.debug('setupclass') cls.test_dir = tempfile.mkdtemp(dir=lxc.DEFAULT_CONTAINER_PATH, prefix='container_unittest_') cls.shared_host_path = os.path.join(cls.test_dir, 'host') # Use a container bucket just to download and set up the base image. - cls.bucket = lxc.ContainerBucket(cls.test_dir, cls.shared_host_path) + cls.bucket = FastContainerBucket(cls.test_dir, cls.shared_host_path) if cls.bucket.base_container is None: logging.debug('Base container not found - reinitializing') @@ -144,6 +146,33 @@ class ContainerTests(unittest.TestCase): clone1.attach_run('test -f %s' % tmpfile) + def testInstallSsp(self): + """Verifies that installing the ssp in the container works.""" + # Hard-coded path to some golden data for this test. + test_ssp = os.path.join( + common.autotest_dir, + 'site_utils', 'lxc', 'test', 'test_ssp.tar.bz2') + # Create a container, install the self-served ssp, then check that it is + # installed into the container correctly. + with self.createContainer() as container: + with unittest_http.serve_locally(test_ssp) as url: + container.install_ssp(url) + container.start(wait_for_network=False) + + # The test ssp just contains a couple of text files, in known + # locations. Verify the location and content of those files in the + # container. 
+ cat = lambda path: container.attach_run('cat %s' % path).stdout + test0 = cat(os.path.join(constants.CONTAINER_AUTOTEST_DIR, + 'test.0')) + test1 = cat(os.path.join(constants.CONTAINER_AUTOTEST_DIR, + 'dir0', 'test.1')) + self.assertEquals('the five boxing wizards jumped quickly', + test0) + self.assertEquals('the quick brown fox jumps over the lazy dog', + test1) + + def testInstallControlFile(self): """Verifies that installing a control file in the container works.""" _unused, tmpfile = tempfile.mkstemp() @@ -167,8 +196,10 @@ class ContainerTests(unittest.TestCase): if name is None: name = self.id().split('.')[-1] container = self.bucket.create_from_base(name) - yield container - container.destroy() + try: + yield container + finally: + container.destroy() def parse_options(): diff --git a/site_utils/lxc/lxc_config_unittest.py b/site_utils/lxc/lxc_config_unittest.py index 52184cb98d..02f11efc14 100644 --- a/site_utils/lxc/lxc_config_unittest.py +++ b/site_utils/lxc/lxc_config_unittest.py @@ -3,13 +3,18 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +import collections import json import os +import shutil +import tempfile import unittest +from contextlib import contextmanager import common +from autotest_lib.client.bin import utils from autotest_lib.site_utils.lxc import config as lxc_config - +from autotest_lib.site_utils.lxc import utils as lxc_utils class DeployConfigTest(unittest.TestCase): """Test DeployConfigManager. @@ -26,5 +31,149 @@ class DeployConfigTest(unittest.TestCase): lxc_config.DeployConfigManager.validate(config) -if '__main__': + def testPreStart(self): + """Verifies that pre-start works correctly. + Checks that mounts are correctly created in the container. + """ + with TempDir() as tmpdir: + config = [ + { + 'mount': True, + 'source': tempfile.mkdtemp(dir=tmpdir), + 'target': '/target0', + 'readonly': True, + 'force_create': False + }, + { + 'mount': True, + 'source': tempfile.mkdtemp(dir=tmpdir), + 'target': '/target1', + 'readonly': False, + 'force_create': False + }, + ] + with ConfigFile(config) as test_cfg, MockContainer() as container: + manager = lxc_config.DeployConfigManager(container, test_cfg) + manager.deploy_pre_start() + self.assertEqual(len(config), len(container.mounts)) + for c in config: + self.assertTrue(container.has_mount(c)) + + + def testPreStartWithCreate(self): + """Verifies that pre-start creates mounted dirs. + + Checks that missing mount points are created when force_create is + enabled. + """ + with TempDir() as tmpdir: + src_dir = os.path.join(tmpdir, 'foobar') + config = [{ + 'mount': True, + 'source': src_dir, + 'target': '/target0', + 'readonly': True, + 'force_create': True + }] + with ConfigFile(config) as test_cfg, MockContainer() as container: + manager = lxc_config.DeployConfigManager(container, test_cfg) + # Pre-condition: the path doesn't exist. + self.assertFalse(lxc_utils.path_exists(src_dir)) + + # After calling deploy_pre_start, the path should exist and the + # mount should be created in the container. + manager.deploy_pre_start() + self.assertTrue(lxc_utils.path_exists(src_dir)) + self.assertEqual(len(config), len(container.mounts)) + for c in config: + self.assertTrue(container.has_mount(c)) + + +class _MockContainer(object): + """A test mock for the container class. + + Don't instantiate this directly, use the MockContainer context manager + defined below. 
+ """ + + def __init__(self): + self.rootfs = tempfile.mkdtemp() + self.mounts = [] + self.MountConfig = collections.namedtuple( + 'MountConfig', ['source', 'destination', 'readonly']) + + + def cleanup(self): + """Clean up tmp dirs created by the container.""" + # DeployConfigManager uses sudo to create some directories in the + # container, so it's necessary to use sudo to clean up. + utils.run('sudo rm -rf %s' % self.rootfs) + + + def mount_dir(self, src, dst, ro): + """Stub implementation of mount_dir. + + Records calls for later verification. + + @param src: Mount source dir. + @param dst: Mount destination dir. + @param ro: Read-only flag. + """ + self.mounts.append(self.MountConfig(src, dst, ro)) + + + def has_mount(self, config): + """Verifies whether an earlier call was made to mount_dir. + + @param config: The config object to verify. + + @return True if an earlier call was made to mount_dir that matches the + given mount configuration; False otherwise. + """ + mount = self.MountConfig(config['source'], + config['target'], + config['readonly']) + return mount in self.mounts + + +@contextmanager +def MockContainer(): + """Context manager for creating a _MockContainer for testing.""" + container = _MockContainer() + try: + yield container + finally: + container.cleanup() + + +@contextmanager +def ConfigFile(config): + """Context manager for creating a config file. + + The given configs are translated into json and pushed into a temporary file + that the DeployConfigManager can read. + + @param config: A list of config objects. Each config object is a dictionary + which conforms to the format described in config.py. + """ + with tempfile.NamedTemporaryFile() as tmp: + json.dump(config, tmp) + tmp.flush() + yield tmp.name + + +@contextmanager +def TempDir(): + """Context manager for creating a temporary directory. + + We have to mount something. Make temporary directories to mount. + """ + tmpdir = tempfile.mkdtemp() + try: + yield tmpdir + finally: + shutil.rmtree(tmpdir) + + +if __name__ == '__main__': unittest.main() diff --git a/site_utils/lxc/test/test_ssp.tar.bz2 b/site_utils/lxc/test/test_ssp.tar.bz2 Binary files differnew file mode 100644 index 0000000000..7627b3a7a2 --- /dev/null +++ b/site_utils/lxc/test/test_ssp.tar.bz2 diff --git a/site_utils/lxc/unittest_container_bucket.py b/site_utils/lxc/unittest_container_bucket.py new file mode 100644 index 0000000000..12008762f1 --- /dev/null +++ b/site_utils/lxc/unittest_container_bucket.py @@ -0,0 +1,49 @@ +# Copyright 2017 The Chromium OS Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +import os + +import common +from autotest_lib.site_utils import lxc +from autotest_lib.site_utils.lxc import constants +from autotest_lib.site_utils.lxc import utils as lxc_utils + + +class FastContainerBucket(lxc.ContainerBucket): + """A fast container bucket for testing. + + If a base container image already exists in the default location on the + local machine, this container just makes a snapshot of it for testing, + rather than re-downloading and installing a fresh base continer. 
+ """ + def __init__(self, lxc_path, host_path): + self.fast_setup = False + try: + if lxc_utils.path_exists( + os.path.join(constants.DEFAULT_CONTAINER_PATH, + constants.BASE)): + lxc_path = os.path.realpath(lxc_path) + if not lxc_utils.path_exists(lxc_path): + os.makedirs(lxc_path) + + # Clone the base container (snapshot for speed) to make a base + # container for the unit test. + base = lxc.Container.createFromExistingDir( + constants.DEFAULT_CONTAINER_PATH, constants.BASE) + lxc.Container.clone(src=base, + new_name=constants.BASE, + new_path=lxc_path, + snapshot=True, + cleanup=False) + self.fast_setup = True + finally: + super(FastContainerBucket, self).__init__(lxc_path, host_path) + if self.base_container is not None: + self._setup_shared_host_path() + + + def setup_base(self, *args, **kwargs): + """Runs setup_base if fast setup did not work.""" + if not self.fast_setup: + super(FastContainerBucket, self).setup_base(*args, **kwargs) diff --git a/site_utils/lxc/unittest_http.py b/site_utils/lxc/unittest_http.py new file mode 100644 index 0000000000..22a970e3e4 --- /dev/null +++ b/site_utils/lxc/unittest_http.py @@ -0,0 +1,74 @@ +# Copyright 2017 The Chromium OS Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. +from contextlib import contextmanager + +import logging +import multiprocessing +import os +import shutil +import tempfile +import SocketServer +import SimpleHTTPServer + +import common +from autotest_lib.client.bin import utils +from autotest_lib.client.common_lib import error +from autotest_lib.site_utils.lxc import constants + + +@contextmanager +def serve_locally(file_path): + """Starts an http server on the localhost, to serve the given file. + + Copies the given file to a temporary location, then starts a local http + server to serve that file. Intended for use with unit tests that require + some URL to download a file (see testInstallSsp as an example). + + @param file_path: The path of the file to serve. + + @return The URL at which the file will be served. + """ + p = None + try: + # Copy the target file into a tmpdir for serving. + tmpdir = tempfile.mkdtemp() + shutil.copy(file_path, tmpdir) + + httpd = SocketServer.TCPServer( + ('', 0), SimpleHTTPServer.SimpleHTTPRequestHandler) + port = httpd.socket.getsockname()[1] + + # Start the http daemon in the tmpdir to serve the file. + def serve(): + """Serves the tmpdir.""" + os.chdir(tmpdir) + httpd.serve_forever() + p = multiprocessing.Process(target=serve) + p.daemon = True + p.start() + + utils.poll_for_condition( + condition=lambda: http_up(port), + timeout=constants.NETWORK_INIT_TIMEOUT, + sleep_interval=constants.NETWORK_INIT_CHECK_INTERVAL) + url = 'http://127.0.0.1:{port}/{filename}'.format( + port=port, filename=os.path.basename(file_path)) + logging.debug('Serving %s as %s', file_path, url) + yield url + finally: + if p is not None: + p.terminate() + shutil.rmtree(tmpdir) + + +def http_up(port): + """Checks for an http server on localhost:port. + + @param port: The port to check. + """ + try: + utils.run('curl --head http://127.0.0.1:%d' % port) + return True + except error.CmdError: + return False diff --git a/site_utils/lxc/zygote.py b/site_utils/lxc/zygote.py index b939e543ae..21b5f2b8dc 100644 --- a/site_utils/lxc/zygote.py +++ b/site_utils/lxc/zygote.py @@ -2,12 +2,16 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. 
+import logging import os +import tempfile import common from autotest_lib.client.bin import utils +from autotest_lib.client.common_lib import error from autotest_lib.site_utils.lxc import Container from autotest_lib.site_utils.lxc import constants +from autotest_lib.site_utils.lxc import lxc from autotest_lib.site_utils.lxc import utils as lxc_utils @@ -72,6 +76,55 @@ class Zygote(Container): super(Zygote, self).set_hostname(hostname) + def install_ssp(self, ssp_url): + """Downloads and installs the given server package. + + @param ssp_url: The URL of the ssp to download and install. + """ + # The host dir is mounted directly on /usr/local/autotest within the + # container. The SSP structure assumes it gets untarred into the + # /usr/local directory of the container's rootfs. In order to unpack + # with the correct directory structure, create a tmpdir, mount the + # container's host dir as ./autotest, and unpack the SSP. + tmpdir = None + autotest_tmp = None + try: + tmpdir = tempfile.mkdtemp(dir=self.container_path, + prefix='%s.' % self.name, + suffix='.tmp') + autotest_tmp = os.path.join(tmpdir, 'autotest') + os.mkdir(autotest_tmp) + utils.run( + 'sudo mount --bind %s %s' % (self.host_path, autotest_tmp)) + download_tmp = os.path.join(tmpdir, + 'autotest_server_package.tar.bz2') + lxc.download_extract(ssp_url, download_tmp, tmpdir) + finally: + if autotest_tmp is not None: + try: + utils.run('sudo umount %s' % autotest_tmp) + except error.CmdError: + logging.exception('Failure while cleaning up SSP tmpdir.') + if tmpdir is not None: + utils.run('sudo rm -rf %s' % tmpdir) + + + def install_control_file(self, control_file): + """Installs the given control file. + + The given file will be moved into the container. + + @param control_file: Path to the control file to install. + """ + # Compute the control temp path relative to the host mount. + dst_path = os.path.join( + self.host_path, + os.path.relpath(constants.CONTROL_TEMP_PATH, + constants.CONTAINER_AUTOTEST_DIR)) + utils.run('sudo mkdir -p %s' % dst_path) + utils.run('sudo mv %s %s' % (control_file, dst_path)) + + def _cleanup_host_mount(self): """Unmount and remove the host dir for this container.""" lxc_utils.cleanup_host_mount(self.host_path); diff --git a/site_utils/lxc/zygote_unittest.py b/site_utils/lxc/zygote_unittest.py index 8a102b321f..3da7815113 100644 --- a/site_utils/lxc/zygote_unittest.py +++ b/site_utils/lxc/zygote_unittest.py @@ -15,8 +15,12 @@ from contextlib import contextmanager import common from autotest_lib.client.bin import utils from autotest_lib.site_utils import lxc +from autotest_lib.site_utils.lxc import constants +from autotest_lib.site_utils.lxc import unittest_http from autotest_lib.site_utils.lxc import unittest_logging from autotest_lib.site_utils.lxc import utils as lxc_utils +from autotest_lib.site_utils.lxc.unittest_container_bucket \ + import FastContainerBucket options = None @@ -32,7 +36,7 @@ class ZygoteTests(unittest.TestCase): cls.shared_host_path = os.path.join(cls.test_dir, 'host') # Use a container bucket just to download and set up the base image. 
- cls.bucket = lxc.ContainerBucket(cls.test_dir, cls.shared_host_path) + cls.bucket = FastContainerBucket(cls.test_dir, cls.shared_host_path) if cls.bucket.base_container is None: logging.debug('Base container not found - reinitializing') @@ -152,6 +156,49 @@ class ZygoteTests(unittest.TestCase): self.assertEqual(test_string, test_output) + def testInstallSsp(self): + """Verifies that installing the ssp in the container works.""" + # Hard-coded path to some golden data for this test. + test_ssp = os.path.join( + common.autotest_dir, + 'site_utils', 'lxc', 'test', 'test_ssp.tar.bz2') + # Create a container, install the self-served ssp, then check that it is + # installed into the container correctly. + with self.createZygote() as zygote: + # Note: start the zygote first, then install the SSP. This mimics + # the way things would work in the production environment. + zygote.start(wait_for_network=False) + with unittest_http.serve_locally(test_ssp) as url: + zygote.install_ssp(url) + + # The test ssp just contains a couple of text files, in known + # locations. Verify the location and content of those files in the + # container. + cat = lambda path: zygote.attach_run('cat %s' % path).stdout + test0 = cat(os.path.join(constants.CONTAINER_AUTOTEST_DIR, + 'test.0')) + test1 = cat(os.path.join(constants.CONTAINER_AUTOTEST_DIR, + 'dir0', 'test.1')) + self.assertEquals('the five boxing wizards jumped quickly', + test0) + self.assertEquals('the quick brown fox jumps over the lazy dog', + test1) + + + def testInstallControlFile(self): + """Verifies that installing a control file in the container works.""" + _unused, tmpfile = tempfile.mkstemp() + with self.createZygote() as zygote: + # Note: start the zygote first. This mimics the way things would + # work in the production environment. + zygote.start(wait_for_network=False) + zygote.install_control_file(tmpfile) + # Verify that the file is found in the zygote. + zygote.attach_run( + 'test -f %s' % os.path.join(lxc.CONTROL_TEMP_PATH, + os.path.basename(tmpfile))) + + @contextmanager def createZygote(self, name = None, @@ -179,9 +226,11 @@ class ZygoteTests(unittest.TestCase): self.base_container, snapshot, host_path) - yield zygote - if not options.skip_cleanup: - zygote.destroy() + try: + yield zygote + finally: + if not options.skip_cleanup: + zygote.destroy() def verifyBindMount(self, container, container_path, host_path): @@ -212,7 +261,7 @@ def parse_options(): # Hack: python unittest also processes args. Construct an argv to pass to # it, that filters out the options it won't recognize. if args.verbose: - argv.append('-v') + argv.insert(0, '-v') argv.insert(0, sys.argv[0]) return args, argv diff --git a/site_utils/metadata_reporter.py b/site_utils/metadata_reporter.py index 196007be26..91b07d90e8 100644 --- a/site_utils/metadata_reporter.py +++ b/site_utils/metadata_reporter.py @@ -16,7 +16,6 @@ import threading import common from autotest_lib.client.common_lib import utils -from autotest_lib.client.common_lib.cros.graphite import autotest_es try: from chromite.lib import metrics @@ -63,7 +62,6 @@ def queue(data): @param data: A metadata entry, which should be a dictionary. 
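For illustration, a caller might queue an entry like this (hypothetical fields, echoing the host_history records removed elsewhere in this diff):

    queue({'_type': 'host_history',
           'hostname': '172.22.33.51',
           'status': 'Running',
           'time_recorded': time.time()})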
""" if not is_running(): - autotest_es.post(type_str=data['_type'], metadata=data) return try: @@ -101,22 +99,6 @@ def _run(): data_list.append(metadata_queue.get_nowait()) if data_list: success = False - if autotest_es.bulk_post(data_list=data_list): - time_used = time.time() - start_time - logging.info('%d entries of metadata uploaded in %s ' - 'seconds.', len(data_list), time_used) - first_failed_upload = None - success = True - metrics.SecondsDistribution( - _METADATA_METRICS_PREFIX + 'upload/seconds').add( - time_used, fields=_get_metrics_fields()) - else: - logging.warn('Failed to upload %d entries of metadata, ' - 'they will be retried later.', len(data_list)) - for data in data_list: - queue(data) - if not first_failed_upload: - first_failed_upload = time.time() fields = _get_metrics_fields().copy() fields['success'] = success metrics.Gauge( @@ -157,10 +139,8 @@ def start(): logging.error('There is already a metadata reporter thread.') return - if not autotest_es.METADATA_ES_SERVER: - logging.warn('ES_HOST is not set in global config, no metadata will be ' - 'reported.') - return + logging.warn('Elasticsearch db deprecated, no metadata will be ' + 'reported.') _report_lock.acquire() reporting_thread = threading.Thread(target=_run) diff --git a/site_utils/stable_version_utils.py b/site_utils/stable_version_utils.py index 0477c34097..d887fa3544 100644 --- a/site_utils/stable_version_utils.py +++ b/site_utils/stable_version_utils.py @@ -8,7 +8,6 @@ import common import django.core.exceptions from autotest_lib.client.common_lib import global_config -from autotest_lib.client.common_lib.cros.graphite import autotest_es from autotest_lib.frontend import setup_django_environment from autotest_lib.frontend.afe import models @@ -76,8 +75,6 @@ def set(version, board=DEFAULT): stable_version.save() except django.core.exceptions.ObjectDoesNotExist: models.StableVersion.objects.create(board=board, version=version) - autotest_es.post(type_str=_STABLE_VERSION_TYPE, - metadata={'board': board, 'version': version}) def delete(board): @@ -87,5 +84,3 @@ def delete(board): """ stable_version = models.StableVersion.objects.get(board=board) stable_version.delete() - autotest_es.post(type_str=_STABLE_VERSION_TYPE, - metadata={'board': board, 'version': get()}) @@ -481,6 +481,7 @@ class db_sql(object): for test in job.tests: self.insert_test(job, test, commit=commit) + data['job_idx'] = job.index return data diff --git a/tko/parse.py b/tko/parse.py index 4a80aa38f8..ecb691bbe3 100755 --- a/tko/parse.py +++ b/tko/parse.py @@ -13,14 +13,9 @@ import sys import traceback import common -from autotest_lib.client.bin.result_tools import utils as result_utils -from autotest_lib.client.bin.result_tools import utils_lib as result_utils_lib -from autotest_lib.client.bin.result_tools import view as result_view -from autotest_lib.client.common_lib import file_utils from autotest_lib.client.common_lib import global_config from autotest_lib.client.common_lib import mail, pidfile from autotest_lib.client.common_lib import utils -from autotest_lib.client.common_lib.cros.graphite import autotest_es from autotest_lib.frontend import setup_django_environment from autotest_lib.frontend.tko import models as tko_models from autotest_lib.server import site_utils @@ -31,29 +26,16 @@ from autotest_lib.tko import db as tko_db, utils as tko_utils from autotest_lib.tko import models, parser_lib from autotest_lib.tko.perf_upload import perf_uploader +try: + from chromite.lib import metrics +except ImportError: + metrics = 
utils.metrics_mock + + _ParseOptions = collections.namedtuple( 'ParseOptions', ['reparse', 'mail_on_failure', 'dry_run', 'suite_report', 'datastore_creds', 'export_to_gcloud_path']) -# Key names related to test result sizes to be stored in tko_job_keyvals. -# The total size (in kB) of test results generated during the test, -# including: -# * server side test logs and result files. -# * client side test logs, sysinfo, system logs and crash dumps. -# Note that a test can collect the same test result files from the DUT multiple -# times during the test, before and after each iteration/test. So the value of -# CLIENT_RESULT_COLLECTED_KB could be larger than the value of -# RESULT_UPLOADED_KB, which is the size of the result directory on the server -# side, even if test result throttling is not applied. -# The total size (in KB) of test results collected from the test device. -CLIENT_RESULT_COLLECTED_KB = 'client_result_collected_KB' -# The original size (in KB) of test results before being trimmed. -ORIGINAL_RESULT_TOTAL_KB = 'original_result_total_KB' -# The total size (in KB) of test results to be uploaded by gs_offloader. -RESULT_UPLOADED_KB = 'result_uploaded_KB' -# Flag to indicate if test results collection is throttled. -RESULT_THROTTLED = 'result_throttled' - def parse_args(): """Parse args.""" # build up our options parser and parse sys.argv @@ -255,53 +237,6 @@ def _invalidate_original_tests(orig_job_idx, retry_job_idx): tko_utils.dprint('DEBUG: Invalidated tests associated to job: ' + msg) -def _get_result_sizes(path): - """Get the result sizes information. - - It first tries to merge directory summaries and calculate the result sizes - including: - CLIENT_RESULT_COLLECTED_KB: The volume in KB that's transferred from the - test device. - ORIGINAL_RESULT_TOTAL_KB: The volume in KB that's the original size of the - result files before being trimmed. - RESULT_UPLOADED_KB: The volume in KB that will be uploaded. - RESULT_THROTTLED: Indicating if the result files were throttled. - - If directory summary merging failed for any reason, fall back to using the - total size of the given result directory. - - @param path: Path of the result directory to get size information. - @return: A dictionary of result sizes information. - """ - sizes = {} - try: - client_collected_bytes, summary = result_utils.merge_summaries(path) - root_entry = summary[result_utils_lib.ROOT_DIR] - sizes[CLIENT_RESULT_COLLECTED_KB] = client_collected_bytes / 1024 - sizes[ORIGINAL_RESULT_TOTAL_KB] = ( - root_entry[result_utils_lib.ORIGINAL_SIZE_BYTES]) / 1024 - sizes[RESULT_UPLOADED_KB] = ( - root_entry[result_utils_lib.TRIMMED_SIZE_BYTES]) / 1024 - # Test results are considered to be throttled if the total size of - # results collected is different from the total size of trimmed results - # from the client side. - sizes[RESULT_THROTTLED] = ( - root_entry[result_utils_lib.ORIGINAL_SIZE_BYTES] != - root_entry[result_utils_lib.TRIMMED_SIZE_BYTES]) - html_file = os.path.join(path, result_view.DEFAULT_RESULT_SUMMARY_NAME) - result_view.build(client_collected_bytes, summary, html_file) - except: - tko_utils.dprint('Failed to calculate result sizes based on directory ' - 'summaries. 
Fall back to record the total size.\n' - 'Exception: %s' % traceback.format_exc()) - total_size = file_utils.get_directory_size_kibibytes(path) - sizes[CLIENT_RESULT_COLLECTED_KB] = total_size - sizes[ORIGINAL_RESULT_TOTAL_KB] = total_size - sizes[RESULT_UPLOADED_KB] = total_size - sizes[RESULT_THROTTLED] = 0 - return sizes - - def parse_one(db, jobname, path, parse_options): """Parse a single job. Optionally send email on failure. @@ -395,15 +330,18 @@ def parse_one(db, jobname, path, parse_options): job.board = label_info.get('board', None) job.suite = label_info.get('suite', None) + # Record test result size to job_keyvals + result_size_info = site_utils.collect_result_sizes( + path, log=tko_utils.dprint) + job.keyval_dict.update(result_size_info.__dict__) + # Upload job details to Sponge. if not dry_run: sponge_url = sponge_utils.upload_results(job, log=tko_utils.dprint) if sponge_url: job.keyval_dict['sponge_url'] = sponge_url - # Record test result size to job_keyvals - sizes = _get_result_sizes(path) - job.keyval_dict.update(sizes) + # TODO(dshi): Update sizes with sponge_invocation.xml and throttle it. # check for failures message_lines = [""] @@ -433,6 +371,21 @@ jobname, job, parent_job_id=job_keyval.get(constants.PARENT_JOB_ID, None)) + # Verify the job data is written to the database. + if job.tests: + tests_in_db = db.find_tests(job_data['job_idx']) + tests_in_db_count = len(tests_in_db) if tests_in_db else 0 + if tests_in_db_count != len(job.tests): + tko_utils.dprint( + 'Failed to find enough tests for job_idx: %d. The ' + 'job should have %d tests, only found %d tests.' % + (job_data['job_idx'], len(job.tests), + tests_in_db_count)) + metrics.Counter( + 'chromeos/autotest/result/db_save_failure', + description='The number of times parse failed to ' + 'save job to TKO database.').increment() + # Upload perf values to the perf dashboard, if applicable. for test in job.tests: perf_uploader.upload_test(job, test, jobname) @@ -451,12 +404,8 @@ afe_job_id=orig_afe_job_id).job_idx _invalidate_original_tests(orig_job_idx, job.index) except Exception as e: - metadata = {'path': path, 'error': str(e), - 'details': traceback.format_exc()} tko_utils.dprint("Hit exception while uploading to tko db:\n%s" % traceback.format_exc()) - autotest_es.post(use_http=True, type_str='parse_failure', - metadata=metadata) raise e # Serializing job into a binary file @@ -698,13 +647,6 @@ def main(): except Exception as e: pid_file_manager.close_file(1) - - metadata = {'results_dir': results_dir, - 'error': str(e), - 'details': traceback.format_exc()} - autotest_es.post(use_http=True, type_str='parse_failure_final', - metadata=metadata) - raise else: pid_file_manager.close_file(0)
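As a closing reference, a minimal standalone sketch of the metrics fallback pattern used above, exercising the counter added in parse_one (names are taken from the hunks above; running it outside parse.py is an assumption):

    try:
        from chromite.lib import metrics
    except ImportError:
        from autotest_lib.client.common_lib import utils
        metrics = utils.metrics_mock

    metrics.Counter(
        'chromeos/autotest/result/db_save_failure',
        description='The number of times parse failed to '
                    'save job to TKO database.').increment()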