blob: 742ebaea084e06c1c7e42d86d99cf32c4e47e635 [file] [log] [blame]
#!/usr/bin/env python
# Copyright 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import argparse
import sys
import time
import common
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import time_utils
from autotest_lib.client.common_lib.cros.graphite import es_utils
# Values used to describe the diagnosis of a DUT. These values
# are used to indicate both DUT status after a single state
# transition, and also diagnosis of whether the DUT was working
# at the end of a given time interval.
#
# _NO_STATUS: Used when there are no state transitions recorded in
# a given time interval.
# _UNKNOWN: For an individual transition, indicates that the DUT
# status is unchanged from the previous transition. For a time
# interval, indicates that the DUT's status can't be determined
# from the transition history.
# _WORKING: Indicates that the DUT was working normally after the
# transition, or at the end of the time interval.
# _BROKEN: Indicates that the DUT needed manual repair after the
# transition, or at the end of the time interval.
#
# The four diagnosis values are consecutive small integers so that
# they can be used directly as indexes into `_DIAGNOSIS_IDS`.
_NO_STATUS, _UNKNOWN, _WORKING, _BROKEN = range(4)

# Display strings for the diagnosis values; the position of each
# string in the list is the diagnosis value that it describes.
_DIAGNOSIS_IDS = ['??', '--', 'OK', 'NO']

# Number of hours of history to report when --duration is not given
# on the command line.
_DEFAULT_DURATION = 12
def _parse_time(time_string):
    """Convert a date/time string to an integer epoch time.

    This is the argparse `type` conversion for the --since and
    --until options.

    @param time_string  Date/time string as given on the command
                        line.
    @return The value from `time_utils.date_string_to_epoch_time()`,
            truncated to an `int`.
    """
    epoch_time = time_utils.date_string_to_epoch_time(time_string)
    return int(epoch_time)
class StateTransition(object):
    """Information about a state transition in host history.

    An object of this class remembers the relevant data from one
    host history record in the elastic search (ES) database.  That
    includes primary data taken from the record (the hostname, the
    AFE host state after the transition, and the time of the
    transition), plus secondary data calculated from the primary
    data.

    A transition is caused by either a regular test job or a special
    task.  The `job_id` and `task_id` fields tell which:
      * `self.job_id is not None and self.task_id is None` -
        A regular job with id `self.job_id`.
      * `self.job_id is None and self.task_id is not None` -
        A 'Repair' special task with id `self.task_id`, triggered
        on a failed DUT.
      * `self.job_id is not None and self.task_id is not None` -
        A special task with id `self.task_id` associated with an
        HQE (such as a 'Provision' job).  The HQE job has the id
        `self.job_id`.
    The `job_id` and `task_id` fields cannot both be `None`.

    The `status` value implies a diagnosis of whether the DUT was
    working at the time of the transition:
      * 'Ready' - The device was working normally.
      * 'Repair Failed' - The device probably needed manual
        intervention.
      * Any other status - The device's state is unchanged from the
        previous transition.

    @property hostname    Host name of the DUT.
    @property status      DUT state after the transition; valid
                          values are the same as for
                          `afe_hosts.status`.
    @property timestamp   Time when the scheduler recorded the
                          transition.
    @property job_id      ID in the AFE database for the job that
                          triggered the transition.  `None` if the
                          transition was for a special task without
                          an associated HQE.
    @property task_id     ID in the AFE database for the special
                          task that triggered the transition.
                          `None` if the transition was for a regular
                          job.
    @property job_url     URL to the logs for the job that triggered
                          the transition.
    @property diagnosis   Working status of the DUT, derived from
                          `status`.
    """

    # Both settings are read from the global configuration once,
    # when the class is defined.
    get_config_value = global_config.global_config.get_config_value
    _AFE_HOSTNAME = get_config_value('SERVER', 'hostname')
    _LOG_URL_PATTERN = get_config_value('CROS', 'log_url_pattern')

    @classmethod
    def get_transitions(cls, hostname, start_time, end_time):
        """Get a list of StateTransition objects from ES host history.

        The query selects 'host_history' records for the given host
        whose 'time_recorded' falls in the given interval, sorted on
        'time_recorded' in descending order.

        @param hostname    Host for the transitions in the host
                           history.
        @param start_time  Start of the time interval to search.
        @param end_time    End of the time interval to search.
        @return A list with one StateTransition for each hit returned
                by the query.
        """
        # The requested result size is the length of the interval.
        # NOTE(review): presumably this is meant as an upper bound on
        # the number of records (one per unit of time); confirm
        # against es_utils.
        query = es_utils.create_range_eq_query_multiple(
                fields_returned=None,
                equality_constraints=[('_type', 'host_history'),
                                      ('hostname', hostname)],
                range_constraints=[('time_recorded',
                                    start_time, end_time)],
                size=end_time - start_time,
                sort_specs=[{'time_recorded': 'desc'}])
        hits = es_utils.execute_query(query)['hits']['hits']
        transitions = []
        for hit in hits:
            transitions.append(cls(hit['_source']))
        return transitions

    def __init__(self, transition_data):
        """Initialize from the source data of one host history record.

        @param transition_data  Dictionary with the record's fields.
                                'hostname', 'status', and
                                'time_recorded' are required;
                                'job_id' and 'task_id' are optional.
                                'task_name' is required when there
                                is a 'task_id', and 'owner' is
                                required otherwise.
        """
        self.hostname = transition_data['hostname']
        self.status = transition_data['status']
        self.timestamp = transition_data['time_recorded']
        self.job_id = transition_data.get('job_id')
        self.task_id = transition_data.get('task_id')

        # The logs for a special task are stored under the host; the
        # logs for a regular job are stored under the job and its
        # owner.
        if self.task_id is None:
            logdir = '%s-%s' % (self.job_id, transition_data['owner'])
        else:
            task_name = transition_data['task_name'].lower()
            logdir = 'hosts/%s/%s-%s' % (
                    self.hostname, self.task_id, task_name)
        url_fields = (StateTransition._AFE_HOSTNAME, logdir)
        self.job_url = StateTransition._LOG_URL_PATTERN % url_fields

        # Only two status values say anything definite about whether
        # the DUT was working.
        if self.status == 'Ready':
            self.diagnosis = _WORKING
        elif self.status == 'Repair Failed':
            self.diagnosis = _BROKEN
        else:
            self.diagnosis = _UNKNOWN
class HostStateHistory(object):
    """Class to query and remember DUT state transition history.

    An object of this class queries the elastic search database for
    the history of a single DUT in a time interval of interest, and
    remembers the query results for reporting.  Iterating over the
    object iterates over the remembered transitions.

    @property hostname    Host name of the DUT.
    @property start_time  Start of the requested time interval.
    @property end_time    End of the requested time interval.
    @property history     A list of state transitions on the DUT
                          during the given time interval, ordered
                          from most to least recent.
    """

    def __init__(self, hostname, start_time, end_time):
        """Record the interval and fetch the DUT's history for it.

        @param hostname    Host name of the DUT.
        @param start_time  Start of the time interval to search.
        @param end_time    End of the time interval to search.
        """
        self.hostname = hostname
        self.start_time = start_time
        self.end_time = end_time
        # The query happens here, at construction time.
        self.history = self._get_history(start_time, end_time)

    def __iter__(self):
        """Iterate over the transitions in `self.history`."""
        return iter(self.history)

    def _get_history(self, start_time, end_time):
        """Return this host's transitions in the given interval.

        @param start_time  Start of the time interval to search.
        @param end_time    End of the time interval to search.
        @return The list from `StateTransition.get_transitions()`.
        """
        return StateTransition.get_transitions(
                self.hostname, start_time, end_time)

    def last_diagnosis(self):
        """Return the most recent diagnosis for the DUT.

        The history is scanned in order (most to least recent) for
        the first transition that says definitely whether the DUT
        was working.  The result is a tuple of
        `(diagnosis, transition)`.

        The `diagnosis` entry in the tuple is one of these values:
          * _NO_STATUS - The state transition history is empty.
          * _UNKNOWN - No state in the history indicated a
            positive diagnosis.
          * _WORKING - At last check, the DUT was working.
          * _BROKEN - At last check, the DUT likely required manual
            intervention.

        The `transition` entry in the tuple is the entry that led to
        the diagnosis.  The transition will be `None` if the value
        is `_NO_STATUS` or `_UNKNOWN`.

        @return A tuple with the DUT's status and the transition that
                determined the diagnosis.
        """
        if not self.history:
            return _NO_STATUS, None
        definite = (_BROKEN, _WORKING)
        for transition in self:
            diagnosis = transition.diagnosis
            if diagnosis in definite:
                return diagnosis, transition
        # There is history, but none of it is conclusive.
        return _UNKNOWN, None
def _print_simple_status(arguments):
    """Print a one-line status summary for each requested host.

    After a header line, each host gets one line with the host name,
    the display string for its last diagnosis, and the date and log
    URL of the transition that determined the diagnosis.  The date
    and URL are shown as '---' when no transition determined it.

    @param arguments  Parsed command line arguments; this uses
                      `hostnames`, `since`, and `until`.
    """
    row_format = '%-28s %-2s %-19s %s'
    # Parenthesizing the single argument prints the same text
    # whether `print` is a statement or a function.
    print(row_format % ('hostname', 'S', 'last checked', 'URL'))
    for hostname in arguments.hostnames:
        history = HostStateHistory(hostname,
                                   arguments.since, arguments.until)
        status, transition = history.last_diagnosis()
        if transition is None:
            url = '---'
            last_checked = '---'
        else:
            url = transition.job_url
            last_checked = time_utils.epoch_time_to_date_string(
                    transition.timestamp)
        row = (history.hostname, _DIAGNOSIS_IDS[status],
               last_checked, url)
        print(row_format % row)
def _print_host_transitions(arguments):
    """Print every recorded state transition for each requested host.

    Each host name is printed on a line by itself, followed by one
    line per transition in the host's history with the transition's
    date, the display string for its diagnosis, and its log URL.

    @param arguments  Parsed command line arguments; this uses
                      `hostnames`, `since`, and `until`.
    """
    for hostname in arguments.hostnames:
        # Parenthesizing the single argument prints the same text
        # whether `print` is a statement or a function.
        print(hostname)
        history = HostStateHistory(hostname,
                                   arguments.since, arguments.until)
        for transition in history:
            when = time_utils.epoch_time_to_date_string(
                    transition.timestamp)
            fields = (when,
                      _DIAGNOSIS_IDS[transition.diagnosis],
                      transition.job_url)
            print(' %s %s %s' % fields)
def _validate_command(arguments):
    """Check the time interval options and fill in the missing ones.

    At most two of --since, --until, and --duration may be given.
    On return, `arguments.since` and `arguments.until` are both set,
    calculated as follows:
      * If `until` is missing and can't be calculated from `since`
        and `duration`, it's set to the current time.
      * If `since` is missing, it's set to `duration` hours before
        `until`, using `_DEFAULT_DURATION` if `duration` is missing
        as well.
      * Otherwise, if `until` is missing, it's set to `duration`
        hours after `since`.

    The program exits with status 1 and a message on stderr if all
    three options were given, or if the resulting interval starts
    after it ends (e.g. because of a negative --duration or a
    --since time in the future).

    @param arguments  Parsed command line arguments; `since` and
                      `until` are epoch times or `None`, `duration`
                      is a number of hours or `None`.  The object is
                      updated in place.
    """
    if (arguments.duration is not None and
            arguments.since is not None and arguments.until is not None):
        # Use write() rather than `print >>`, so that the message is
        # produced the same way under Python 2 and Python 3.
        sys.stderr.write('Can specify at most two of '
                         '--since, --until, and --duration\n')
        sys.exit(1)
    if (arguments.until is None and (arguments.since is None or
                                     arguments.duration is None)):
        arguments.until = int(time.time())
    if arguments.since is None:
        if arguments.duration is None:
            arguments.duration = _DEFAULT_DURATION
        arguments.since = (arguments.until -
                           arguments.duration * 60 * 60)
    elif arguments.until is None:
        # Reaching here means both `since` and `duration` were given.
        arguments.until = (arguments.since +
                           arguments.duration * 60 * 60)
    # Both ends of the interval are known now.  An inverted interval
    # can't match any history, and would turn into a negative result
    # size for the history query, so reject it here.
    if arguments.since > arguments.until:
        sys.stderr.write('The start of the time interval must not '
                         'be later than its end\n')
        sys.exit(1)
def _parse_command(argv):
    """Parse and validate the command line.

    @param argv  Command line arguments, including the program name
                 in `argv[0]`.
    @return The argparse namespace for the command line, after
            `_validate_command()` has checked it and filled in the
            time interval.
    """
    # The epilog has an explicit line break; the default formatter
    # would re-wrap the text and drop it, so ask for the raw text.
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Display DUT status and execution history',
            epilog='You can specify one or two of --since, --until, '
                   'and --duration, but not all three.\n'
                   'The date/time format is "YYYY-MM-DD HH:MM:SS".',
            formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-s', '--since', type=_parse_time,
                        metavar='DATE/TIME',
                        help='starting time for history display')
    parser.add_argument('-u', '--until', type=_parse_time,
                        metavar='DATE/TIME',
                        help='ending time for history display'
                             ' (default: now)')
    parser.add_argument('-d', '--duration', type=int,
                        metavar='HOURS',
                        help='number of hours of history to display'
                             ' (default: %d)' % _DEFAULT_DURATION)
    parser.add_argument('-f', '--full_history', action='store_true',
                        help='display every state transition for each'
                             ' host, instead of only its most recent'
                             ' diagnosis')
    parser.add_argument('hostnames',
                        nargs='*',
                        help='host names of DUTs to display')
    arguments = parser.parse_args(argv[1:])
    _validate_command(arguments)
    return arguments
def main(argv):
    """Standard main() for command line processing.

    Parses the command line, then prints either the full transition
    history or the one-line status summary for the requested hosts,
    depending on the --full_history option.

    @param argv Command line arguments (normally sys.argv).
    """
    arguments = _parse_command(argv)
    report = (_print_host_transitions if arguments.full_history
              else _print_simple_status)
    report(arguments)
# When run as a script (rather than imported), process the command
# line that the script was invoked with.
if __name__ == '__main__':
    main(sys.argv)