blob: 3cde9cea360a6d810258e534792569d802a5a014 [file] [log] [blame]
#!/usr/bin/env python
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Create e-mail reports of the Lab's DUT inventory.
Gathers a list of all DUTs of interest in the Lab, segregated by
board and pool, and determines whether each DUT is working or
broken. Then, send one or more e-mail reports summarizing the
status to e-mail addresses provided on the command line.
usage: lab_inventory.py [ options ] [ board ... ]
Options:
--duration / -d <hours>
How far back in time to search job history to determine DUT
status.
--board-notify <address>[,<address>]
Send the "board status" e-mail to all the specified e-mail
addresses.
--pool-notify <address>[,<address>]
Send the "pool status" e-mail to all the specified e-mail
addresses.
--recommend <number>
When generating the "board status" e-mail, include a list of
<number> specific DUTs to be recommended for repair.
--repair-loops
Scan the inventory for DUTs stuck in repair loops, and report them
via a Monarch presence metric.
--logdir <directory>
Log progress and actions in a file under this directory. Text
of any e-mail sent will also be logged in a timestamped file in
this directory.
--debug
Suppress all logging, metrics reporting, and sending e-mail.
Instead, write the output that would be generated onto stdout.
<board> arguments:
With no arguments, gathers the status for all boards in the lab.
With one or more named boards on the command line, restricts
reporting to just those boards.
"""
import argparse
import collections
import logging
import logging.handlers
import os
import re
import sys
import time
import common
from autotest_lib.client.bin import utils
from autotest_lib.client.common_lib import time_utils
from autotest_lib.server import constants
from autotest_lib.server import site_utils
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.hosts import servo_host
from autotest_lib.server.lib import status_history
from autotest_lib.site_utils import gmail_lib
from autotest_lib.utils import labellib
from chromite.lib import metrics
# Short aliases for the pool-name constants defined in
# `autotest_lib.server.constants`:
#   CRITICAL_POOLS - iterated to build the per-pool status report.
#   SPARE_POOL - the pool whose DUT count feeds the spares buffer.
#   MANAGED_POOLS - the pools whose DUTs are queried from the AFE and
#       tracked by this script.
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
SPARE_POOL = constants.Pools.SPARE_POOL
MANAGED_POOLS = constants.Pools.MANAGED_POOLS
# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
# monitoring by this script. Currently, we're excluding these:
# + 'adb' - We're not ready to monitor Android or Brillo hosts.
# + 'board:guado_moblab' - These are maintained by a separate
# process that doesn't use this script.
_EXCLUDED_LABELS = {'adb', 'board:guado_moblab'}
# _DEFAULT_DURATION:
# Default value used for the --duration command line option.
# Specifies how far back in time to search in order to determine
# DUT status.  Per the usage text, --duration is given in hours.
_DEFAULT_DURATION = 24
# _LOGDIR:
# Relative path used in the calculation of the default setting for
# the --logdir option. The full path is relative to the root of the
# autotest directory, as determined from sys.argv[0].
# _LOGFILE:
# Basename of a file to which general log information will be
# written.
# _LOG_FORMAT:
# Format string for log messages.
_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'
# Pattern describing location-based host names in the Chrome OS test
# labs. Each DUT hostname designates the DUT's location:
# * A lab (room) that's physically separated from other labs
# (i.e. there's a door).
# * A row (or aisle) of DUTs within the lab.
# * A vertical rack of shelves on the row.
# * A specific host on one shelf of the rack.
# The four capture groups are, in order: lab name, row number, rack
# number and host number.
_HOSTNAME_PATTERN = re.compile(
r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')
# Default entry for managed pools.  When a caller passes this value as
# `pool`, "managed" is decided from the spare pool versus all pools
# rather than from one specific pool.
_MANAGED_POOL_DEFAULT = 'all_pools'
# _REPAIR_LOOP_THRESHOLD:
# The number of repeated Repair tasks that must be seen to declare
# that a DUT is stuck in a repair loop.
_REPAIR_LOOP_THRESHOLD = 4
class _CachedHostJobHistories(object):
"""Maintains a set of `HostJobHistory` objects for a pool.
The collected history objects are nominally all part of a single
scheduling pool of DUTs. The collection maintains a list of
working DUTs, a list of broken DUTs, and a list of all DUTs.
Performance note: Certain methods in this class are potentially
expensive:
* `get_working()`
* `get_working_list()`
* `get_broken()`
* `get_broken_list()`
* `get_idle()`
* `get_idle_list()`
The first time any one of these methods is called, it causes
multiple RPC calls with a relatively expensive set of database
queries. However, the results of the queries are cached in the
individual `HostJobHistory` objects, so only the first call
actually pays the full cost.
Additionally, `get_working_list()`, `get_broken_list()` and
`get_idle_list()` cache their return values to avoid recalculating
lists at every call; this caching is separate from the caching of RPC
results described above.
This class is deliberately constructed to delay the RPC cost
until the accessor methods are called (rather than to query in
`record_host()`) so that it's possible to construct a complete
`_LabInventory` without making the expensive queries at creation
time. `_populate_board_counts()`, below, assumes this behavior.
"""
def __init__(self):
self._histories = []
self._working_list = None
self._broken_list = None
self._idle_list = None
def record_host(self, host_history):
"""Add one `HostJobHistory` object to the collection.
@param host_history The `HostJobHistory` object to be
remembered.
"""
self._working_list = None
self._broken_list = None
self._idle_list = None
self._histories.append(host_history)
def get_working_list(self):
"""Return a list of all working DUTs in the pool.
Filter `self._histories` for histories where the last
diagnosis is `WORKING`.
Cache the result so that we only cacluate it once.
@return A list of HostJobHistory objects.
"""
if self._working_list is None:
self._working_list = [h for h in self._histories
if h.last_diagnosis()[0] == status_history.WORKING]
return self._working_list
def get_working(self):
"""Return the number of working DUTs in the pool."""
return len(self.get_working_list())
def get_broken_list(self):
"""Return a list of all broken DUTs in the pool.
Filter `self._histories` for histories where the last
diagnosis is `BROKEN`.
Cache the result so that we only cacluate it once.
@return A list of HostJobHistory objects.
"""
if self._broken_list is None:
self._broken_list = [h for h in self._histories
if h.last_diagnosis()[0] == status_history.BROKEN]
return self._broken_list
def get_broken(self):
"""Return the number of broken DUTs in the pool."""
return len(self.get_broken_list())
def get_idle_list(self):
"""Return a list of all idle DUTs in the pool.
Filter `self._histories` for histories where the last
diagnosis is `UNUSED` or `UNKNOWN`.
Cache the result so that we only cacluate it once.
@return A list of HostJobHistory objects.
"""
idle_list = [status_history.UNUSED, status_history.UNKNOWN]
if self._idle_list is None:
self._idle_list = [h for h in self._histories
if h.last_diagnosis()[0] in idle_list]
return self._idle_list
def get_idle(self):
"""Return the number of idle DUTs in the pool."""
return len(self.get_idle_list())
def get_total(self):
"""Return the total number of DUTs in the pool."""
return len(self._histories)
class _ManagedPoolsHostJobHistories(object):
"""Maintains a set of `HostJobHistory`s per managed pool.
The collection maintains a count of working DUTs, a count of broken DUTs,
and a total count. The counts can be obtained either for a single pool, or
as a total across all pools.
DUTs in the collection must be assigned to one of the pools in
`_MANAGED_POOLS`.
The `get_working()` and `get_broken()` methods rely on the
methods of the same name in _CachedHostJobHistories, so the performance
note in _CachedHostJobHistories applies here as well.
"""
def __init__(self):
self._histories_by_pool = {
pool: _CachedHostJobHistories() for pool in MANAGED_POOLS
}
def record_host(self, host_history):
"""Add one `HostJobHistory` object to the collection.
@param host_history The `HostJobHistory` object to be
remembered.
"""
pool = host_history.host_pool
self._histories_by_pool[pool].record_host(host_history)
def _count_pool(self, get_pool_count, pool=None):
"""Internal helper to count hosts in a given pool.
The `get_pool_count` parameter is a function to calculate
the exact count of interest for the pool.
@param get_pool_count Function to return a count from a
_PoolCount object.
@param pool The pool to be counted. If `None`,
return the total across all pools.
"""
if pool is None:
return sum([get_pool_count(cached_history) for cached_history in
self._histories_by_pool.values()])
else:
return get_pool_count(self._histories_by_pool[pool])
def get_working_list(self):
"""Return a list of all working DUTs (across all pools).
Go through all HostJobHistory objects across all pools, selecting the
ones where the last diagnosis is `WORKING`.
@return A list of HostJobHistory objects.
"""
l = []
for p in self._histories_by_pool.values():
l.extend(p.get_working_list())
return l
def get_working(self, pool=None):
"""Return the number of working DUTs in a pool.
@param pool The pool to be counted. If `None`, return the
total across all pools.
@return The total number of working DUTs in the selected
pool(s).
"""
return self._count_pool(_CachedHostJobHistories.get_working, pool)
def get_broken_list(self):
"""Return a list of all broken DUTs (across all pools).
Go through all HostJobHistory objects in the across all pools,
selecting the ones where the last diagnosis is `BROKEN`.
@return A list of HostJobHistory objects.
"""
l = []
for p in self._histories_by_pool.values():
l.extend(p.get_broken_list())
return l
def get_broken(self, pool=None):
"""Return the number of broken DUTs in a pool.
@param pool The pool to be counted. If `None`, return the
total across all pools.
@return The total number of broken DUTs in the selected pool(s).
"""
return self._count_pool(_CachedHostJobHistories.get_broken, pool)
def get_idle_list(self, pool=None):
"""Return a list of all idle DUTs in the given pool.
Go through all HostJobHistory objects in the given pool, selecting the
ones where the last diagnosis is `UNUSED` or `UNKNOWN`.
@param pool: The pool to be counted. If `None`, return the total list
across all pools.
@return A list of HostJobHistory objects.
"""
if pool is None:
l = []
for p in self._histories_by_pool.values():
l.extend(p.get_idle_list())
return l
else:
return _CachedHostJobHistories.get_idle_list(
self._histories_by_pool[pool])
def get_idle(self, pool=None):
"""Return the number of idle DUTs in a pool.
@param pool: The pool to be counted. If `None`, return the total
across all pools.
@return The total number of idle DUTs in the selected pool(s).
"""
return self._count_pool(_CachedHostJobHistories.get_idle, pool)
def get_spares_buffer(self):
"""Return the the nominal number of working spares.
Calculates and returns how many working spares there would
be in the spares pool if all broken DUTs were in the spares
pool. This number may be negative, indicating a shortfall
in the critical pools.
@return The total number DUTs in the spares pool, less the total
number of broken DUTs in all pools.
"""
return self.get_total(SPARE_POOL) - self.get_broken()
def get_total(self, pool=None):
"""Return the total number of DUTs in a pool.
@param pool The pool to be counted. If `None`, return the
total across all pools.
@return The total number of DUTs in the selected pool(s).
"""
return self._count_pool(_CachedHostJobHistories.get_total, pool)
class _LabInventory(object):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    Important attributes:
      histories:  The list of histories in the inventory, restricted
                  to those whose `host_board` is not `None`.
      by_board:   A dict mapping board to
                  `_ManagedPoolsHostJobHistories`.
      by_model:   A dict mapping model to
                  `_ManagedPoolsHostJobHistories`.
    """

    @staticmethod
    def _eligible_host(afehost):
        """Return whether this host is eligible for monitoring.

        A host is eligible if it's in exactly one pool and it has no
        labels from the `_EXCLUDED_LABELS` set.

        @param afehost  The host to be tested for eligibility.
        @return A true value if the host should be monitored.
        """
        pools = [l for l in afehost.labels
                 if l.startswith(constants.Labels.POOL_PREFIX)]
        excluded = _EXCLUDED_LABELS.intersection(afehost.labels)
        return len(pools) == 1 and not excluded

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, boardlist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects
        for all DUTs in the `MANAGED_POOLS` list.  If `boardlist`
        is supplied, the inventory will be restricted to only the
        given boards.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory`
                           objects.
        @param end_time    End time for the `HostJobHistory`
                           objects.
        @param boardlist   List of boards to include.  If empty or
                           `None`, include all available boards.

        @return A `_LabInventory` object for the specified boards.
        """
        label_list = [constants.Labels.POOL_PREFIX + l
                      for l in MANAGED_POOLS]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if boardlist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            boardhosts = []
            for board in boardlist:
                board_label = constants.Labels.BOARD_PREFIX + board
                boardhosts.extend(h for h in afehosts
                                  if board_label in h.labels)
            afehosts = boardhosts
        else:
            afehosts = [h for h in afehosts if cls._eligible_host(h)]
        return cls([status_history.HostJobHistory(afe, host,
                                                  start_time, end_time)
                    for host in afehosts])

    def __init__(self, histories):
        """Build the inventory from a list of histories.

        @param histories  Iterable of `HostJobHistory` objects.
        """
        # N.B. The query that finds our hosts is restricted to those
        # with a valid pool: label, but doesn't check for a valid
        # board: label.  In some (insufficiently) rare cases, the
        # AFE hosts table has been known to (incorrectly) have DUTs
        # with a pool: but no board: label.  We explicitly exclude
        # those here.
        histories = [h for h in histories
                     if h.host_board is not None]
        self.histories = histories
        self._dut_count = len(histories)
        # Per-pool caches for get_managed_boards()/get_managed_models().
        self._managed_boards = {}
        self._managed_models = {}
        self.by_board = self._classify_by_label_type('board')
        self.by_model = self._classify_by_label_type('model')

    def _classify_by_label_type(self, label_key):
        """Classify histories by labels with the given key.

        Histories whose host has no label with the given key are left
        out of the result.

        @param label_key  Label key to classify by, e.g. 'board'.
        @returns a dict mapping labels with the given key to
                 _ManagedPoolsHostJobHistories for DUTs with that label.
        """
        classified = collections.defaultdict(_ManagedPoolsHostJobHistories)
        for h in self.histories:
            labels = labellib.LabelsMapping(h.host.labels)
            if label_key in labels:
                classified[labels[label_key]].record_host(h)
        return dict(classified)

    def _get_managed(self, cache, classified, pool):
        """Return the cached set of managed labels for `pool`.

        Shared implementation of `get_managed_boards()` and
        `get_managed_models()`.

        @param cache       Dict mapping pool to the set already
                           computed for it; updated in place.
        @param classified  Dict mapping a label (board or model) to
                           its `_ManagedPoolsHostJobHistories`.
        @param pool        The pool to decide "managed" for.
        @return The set of managed labels for `pool`.
        """
        managed = cache.get(pool)
        if managed is None:
            managed = set(label
                          for label, histories in classified.items()
                          if self._is_managed(pool, histories))
            cache[pool] = managed
        return managed

    def get_managed_boards(self, pool=_MANAGED_POOL_DEFAULT):
        """Return the set of "managed" boards.

        @param pool: The specified pool for managed boards.
        @return A set of all the boards that have both spare and
                non-spare pools, unless the pool is specified,
                then the set of boards in that pool.
        """
        return self._get_managed(self._managed_boards, self.by_board, pool)

    def get_managed_models(self, pool=_MANAGED_POOL_DEFAULT):
        """Return the set of "managed" models.

        @param pool: The specified pool for managed models.
        @return A set of all the models that have both spare and
                non-spare pools, unless the pool is specified,
                then the set of models in that pool.
        """
        return self._get_managed(self._managed_models, self.by_model, pool)

    def _is_managed(self, pool, histories):
        """Determine if the given histories contain DUTs to be managed.

        Operationally, saying a board is "managed" means that the
        board will be included in the "board" and "repair
        recommendations" reports.  That is, if there are failures in
        the board's inventory then lab techs will be asked to fix
        them without a separate ticket.

        For purposes of implementation, a board is "managed" if it
        has DUTs in both the spare and a non-spare (i.e. critical)
        pool.  When a specific pool is requested, it is enough for the
        board to have any DUT in that pool.

        @param pool       Pool to test, or `_MANAGED_POOL_DEFAULT`.
        @param histories  `_ManagedPoolsHostJobHistories` to test.
        @return Whether the histories count as managed for `pool`.
        """
        if pool == _MANAGED_POOL_DEFAULT:
            spares = histories.get_total(SPARE_POOL)
            total = histories.get_total()
            return spares != 0 and spares != total
        return histories.get_total(pool) != 0

    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count

    def get_num_boards(self):
        """Return the total number of boards in the inventory."""
        return len(self.by_board)

    def get_num_models(self):
        """Return the total number of models in the inventory."""
        return len(self.by_model)
def _sort_by_location(inventory_list):
"""Return a list of DUTs, organized by location.
Take the given list of `HostJobHistory` objects, separate it
into a list per lab, and sort each lab's list by location. The
order of sorting within a lab is
* By row number within the lab,
* then by rack number within the row,
* then by host shelf number within the rack.
Return a list of the sorted lists.
Implementation note: host locations are sorted by converting
each location into a base 100 number. If row, rack or
host numbers exceed the range [0..99], then sorting will
break down.
@return A list of sorted lists of DUTs.
"""
BASE = 100
lab_lists = {}
for history in inventory_list:
location = _HOSTNAME_PATTERN.match(history.host.hostname)
if location:
lab = location.group(1)
key = 0
for idx in location.group(2, 3, 4):
key = BASE * key + int(idx)
lab_lists.setdefault(lab, []).append((key, history))
return_list = []
for dut_list in lab_lists.values():
dut_list.sort(key=lambda t: t[0])
return_list.append([t[1] for t in dut_list])
return return_list
def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    Calculates the buffer counts that would result if every DUT in
    `repair_list` were repaired, then scores that outcome with two
    numbers:
      * Worst case buffer count for any board (higher is better).
        This is the more significant number for comparison.
      * Number of boards at the worst case (lower is better).  This
        is the less significant number.

    Implementation note:  The score could fail to reflect the
    intended criteria if there are more than 1000 boards in the
    inventory.

    @param buffer_counts  A dictionary mapping board names to the
                          size of the board's spares buffer.
    @param repair_list    A list of DUTs to be repaired.
    @return A numeric score.
    """
    _NBOARDS = 1000
    repaired_by_board = _LabInventory(repair_list).by_board
    # Post-repair buffer count for each board.  Board names are
    # dropped, as they don't contribute to the final score.
    new_counts = []
    for board, spares in buffer_counts.items():
        if board in repaired_by_board:
            gained = repaired_by_board[board].get_total()
        else:
            gained = 0
        new_counts.append(spares + gained)
    # The worst available spares count, and how often it occurs.
    ranked = sorted(new_counts)
    worst_count = ranked[0]
    num_worst = ranked.count(worst_count)
    return _NBOARDS * worst_count - num_worst
def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired.  The list of DUTs is selected based on these
    criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical
        proximity.
      * DUTs for boards with a low spares buffer are more important
        than DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a
    board with the smallest spares buffer will be recommended.  If
    the worst spares buffer number is shared by more than one board,
    the algorithm will tend to prefer repair sets that include more
    of those boards over sets that cover fewer boards.

    If no broken DUT qualifies, the message holds only the header
    and a note saying so.

    @param inventory      Inventory for generating recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.
    """
    logging.debug('Creating DUT repair recommendations')
    board_buffer_counts = {}
    broken_list = []
    for board in inventory.get_managed_boards():
        logging.debug('Listing failed DUTs for %s', board)
        counts = inventory.by_board[board]
        if counts.get_broken() != 0:
            board_buffer_counts[board] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # For each lab, slide a window of `num_recommend` DUTs over the
    # location-sorted list and keep the best-scoring window; then
    # keep the best window over all labs.  The first window is
    # scored outside the inner loop so that a lab with fewer than
    # `num_recommend` broken DUTs still gets a candidate.
    recommendation = []
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        lab_slice = lab_duts[:num_recommend]
        lab_score = _score_repair_set(board_buffer_counts, lab_slice)
        for start in range(1, len(lab_duts) - num_recommend + 1):
            new_slice = lab_duts[start : start + num_recommend]
            new_score = _score_repair_set(board_buffer_counts,
                                          new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if best_score is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    # N.B. The trailing space here is mandatory:  Without it, Gmail
    # will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ('Hostname', 'Board', 'Servo?', 'Logs URL')]
    if not recommendation:
        # Nothing broken in any managed board (or no broken host has
        # a location-style hostname).
        message.append('(No DUTs recommended for repair)')
    for h in recommendation:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        _, event = h.last_diagnosis()
        line = line_fmt % (
                h.host.hostname, h.host_board,
                'Yes' if servo_present else 'No', event.job_url)
        message.append(line)
    return '\n'.join(message)
def _generate_board_inventory_message(inventory):
"""Generate the "board inventory" e-mail message.
The board inventory is a list by board summarizing the number
of working and broken DUTs, and the total shortfall or surplus
of working devices relative to the minimum critical pool
requirement.
The report omits boards with no DUTs in the spare pool or with
no DUTs in a critical pool.
N.B. For sample output text formattted as users can expect to
see it in e-mail and log files, refer to the unit tests.
@param inventory _LabInventory object with the inventory to
be reported on.
@return String with the inventory message to be sent.
"""
logging.debug('Creating board inventory')
nworking = 0
nbroken = 0
nidle = 0
nbroken_boards = 0
ntotal_boards = 0
summaries = []
for board in inventory.get_managed_boards():
counts = inventory.by_board[board]
logging.debug('Counting %2d DUTS for board %s',
counts.get_total(), board)
# Summary elements laid out in the same order as the text
# headers:
# Board Avail Bad Idle Good Spare Total
# e[0] e[1] e[2] e[3] e[4] e[5] e[6]
element = (board,
counts.get_spares_buffer(),
counts.get_broken(),
counts.get_idle(),
counts.get_working(),
counts.get_total(SPARE_POOL),
counts.get_total())
if element[2]:
summaries.append(element)
nbroken_boards += 1
ntotal_boards += 1
nbroken += element[2]
nidle += element[3]
nworking += element[4]
ntotal = nworking + nbroken + nidle
summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
broken_percent = int(round(100.0 * nbroken / ntotal))
idle_percent = int(round(100.0 * nidle / ntotal))
working_percent = 100 - broken_percent - idle_percent
message = ['Summary of DUTs in inventory:',
'%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
'%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
nbroken, broken_percent,
nidle, idle_percent,
nworking, working_percent,
ntotal),
'',
'Boards with failures: %d' % nbroken_boards,
'Boards in inventory: %d' % ntotal_boards,
'', '',
'Full board inventory:\n',
'%-22s %5s %5s %5s %5s %5s %5s' % (
'Board', 'Avail', 'Bad', 'Idle', 'Good',
'Spare', 'Total')]
message.extend(
['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
return '\n'.join(message)
# Preamble of the "pool inventory" e-mail.  It is emitted verbatim as
# the first element of the message assembled by
# _generate_pool_inventory_message().
_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All boards shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
http://go/cros-manage-duts
'''
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The message has one section per pool in `CRITICAL_POOLS`.  Each
    section lists, by board, the number of broken, idle and working
    DUTs in that pool along with the pool total.  Boards with neither
    broken nor idle DUTs in the pool are left out; rows are ordered
    by descending broken count.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating pool inventory')
    lines = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every section after the first is set off by a blank line.
        separator = '\n' if index else ''
        lines.append(
                '%sStatus for pool:%s, by board:' % (separator, pool))
        lines.append(
                '%-20s %5s %5s %5s %5s' % (
                    'Board', 'Bad', 'Idle', 'Good', 'Total'))
        rows = []
        for board, counts in inventory.by_board.items():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # Boards at full strength are not reported.
            if broken or idle:
                rows.append((board, broken, idle,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            lines.append('(All boards at full strength)')
            continue
        rows.sort(key=lambda row: -row[1])
        lines.extend('%-20s %5d %5d %5d %5d' % row for row in rows)
    return '\n'.join(lines)
# Preamble of the "idle inventory" section of the pool e-mail.  It is
# emitted verbatim as the first element of the message assembled by
# _generate_idle_inventory_message().
# NOTE(review): the text hard-codes "24 hours", which matches
# _DEFAULT_DURATION but not necessarily a user-supplied --duration.
_IDLE_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: The hosts shown below haven't
run any jobs for at least 24 hours. Please check each host; locked
hosts should normally be unlocked; stuck jobs should normally be
aborted.
'''
def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory lists every idle host (last diagnosis
    `UNKNOWN` or `UNUSED`) together with its board and pool, covering
    each pool in `MANAGED_POOLS`.

    N.B. For sample output text format as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.
    """
    logging.debug('Creating idle inventory')
    lines = [_IDLE_INVENTORY_HEADER,
             'Idle Host List:',
             '%-30s %-20s %s' % ('Hostname', 'Board', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for board, counts in inventory.by_board.items():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append((dut.host.hostname, board, pool))
    if idle_rows:
        lines.extend('%-30s %-20s %s' % row for row in idle_rows)
    else:
        lines.append('(No idle DUTs)')
    return '\n'.join(lines)
def _send_email(arguments, tag, subject, recipients, body):
"""Send an inventory e-mail message.
The message is logged in the selected log directory using `tag`
for the file name.
If the --debug option was requested, the message is neither
logged nor sent, but merely printed on stdout.
@param arguments Parsed command-line options.
@param tag Tag identifying the inventory for logging
purposes.
@param subject E-mail Subject: header line.
@param recipients E-mail addresses for the To: header line.
@param body E-mail message body.
"""
logging.debug('Generating email: "%s"', subject)
all_recipients = ', '.join(recipients)
report_body = '\n'.join([
'To: %s' % all_recipients,
'Subject: %s' % subject,
'', body, ''])
if arguments.debug:
print report_body
else:
filename = os.path.join(arguments.logdir, tag)
try:
report_file = open(filename, 'w')
report_file.write(report_body)
report_file.close()
except EnvironmentError as e:
logging.error('Failed to write %s: %s', filename, e)
try:
gmail_lib.send_email(all_recipients, subject, body)
except Exception as e:
logging.error('Failed to send e-mail to %s: %s',
all_recipients, e)
def _populate_board_counts(inventory):
"""Gather board counts while providing interactive feedback.
Gathering the status of all individual DUTs in the lab can take
considerable time (~30 minutes at the time of this writing).
Normally, we pay that cost by querying as we go. However, with
the `--debug` option, we expect a human being to be watching the
progress in real time. So, we force the first (expensive)
queries to happen up front, and provide simple ASCII output
(without using logging) to show a progress bar and results.
@param inventory _LabInventory object with the inventory to
be gathered.
"""
n = 0
total_broken = 0
for counts in inventory.by_board.itervalues():
n += 1
if n % 10 == 5:
c = '+'
elif n % 10 == 0:
c = '%d' % ((n / 10) % 10)
else:
c = '.'
sys.stdout.write(c)
sys.stdout.flush()
# This next call is where all the time goes - it forces all
# of a board's HostJobHistory objects to query the database
# and cache their results.
total_broken += counts.get_broken()
sys.stdout.write('\n')
sys.stdout.write('Found %d broken DUTs\n' % total_broken)
def _perform_board_inventory(arguments, inventory, timestamp):
    """Perform the board inventory report.

    The board inventory report consists of the following:
      * A list of DUTs that are recommended to be repaired.
        This list is optional, and only appears if the `--recommend`
        option is present.
      * A list of all boards that have failed DUTs, with counts
        of working, broken, and spare DUTs, among others.

    The combined text is handed to `_send_email()` addressed to
    `arguments.board_notify`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  _LabInventory object with the inventory to
                      be reported.
    @param timestamp  A string used to identify this run's timestamp
                      in logs and email output.
    """
    sections = []
    if arguments.recommend:
        sections.append(_generate_repair_recommendation(
                inventory, arguments.recommend))
        # Three newlines separate the recommendation from the rest.
        sections.append('\n\n\n')
    sections.append(_generate_board_inventory_message(inventory))
    tag = 'boards-%s.txt' % timestamp
    subject = 'DUT board inventory %s' % timestamp
    _send_email(arguments, tag, subject,
                arguments.board_notify, ''.join(sections))
def _perform_pool_inventory(arguments, inventory, timestamp):
    """Perform the pool inventory report.

    The pool inventory report consists of the following:
      * A list of all critical pools that have failed DUTs, with counts
        of working, broken, and idle DUTs.
      * A list of all idle DUTs by hostname including the board and
        pool.

    The combined text is handed to `_send_email()` addressed to
    `arguments.pool_notify`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @param inventory  _LabInventory object with the inventory to be
                      reported.
    @param timestamp  A string used to identify this run's timestamp in
                      logs and email output.
    """
    sections = [_generate_pool_inventory_message(inventory),
                _generate_idle_inventory_message(inventory)]
    tag = 'pools-%s.txt' % timestamp
    subject = 'DUT pool inventory %s' % timestamp
    _send_email(arguments, tag, subject,
                arguments.pool_notify, '\n\n\n'.join(sections))
def _dut_in_repair_loop(history):
"""Return whether a DUT's history indicates a repair loop.
A DUT is considered looping if it runs no tests, and no tasks pass
other than repair tasks.
@param history An instance of `status_history.HostJobHistory` to be
scanned for a repair loop. The caller guarantees
that this history corresponds to a working DUT.
@returns Return a true value if the DUT's most recent history
indicates a repair loop.
"""
# Our caller passes only histories for working DUTs; that means
# we've already paid the cost of fetching the diagnosis task, and
# we know that the task was successful. The diagnosis task will be
# one of the tasks we must scan to find a loop, so if the task isn't
# a repair task, then our history includes a successful non-repair
# task, and we're not looping.
#
# The for loop below is very expensive, because it must fetch the
# full history, regardless of how many tasks we examine. At the
# time of this writing, this check against the diagnosis task
# reduces the cost of finding loops in the full inventory from hours
# to minutes.
if history.last_diagnosis()[1].name != 'Repair':
return False
repair_ok_count = 0
for task in history:
if not task.is_special:
# This is a test, so we're not looping.
return False
if task.diagnosis == status_history.BROKEN:
# Failed a repair, so we're not looping.
return False
if (task.diagnosis == status_history.WORKING
and task.name != 'Repair'):
# Non-repair task succeeded, so we're not looping.
return False
# At this point, we have either a failed non-repair task, or
# a successful repair.
if task.name == 'Repair':
repair_ok_count += 1
if repair_ok_count >= _REPAIR_LOOP_THRESHOLD:
return True
def _perform_repair_loop_report(arguments, inventory):
    """Report DUTs that appear to be stuck in a repair loop.

    Walks the working DUTs of every board in the inventory.  Each DUT
    that `_dut_in_repair_loop()` flags is logged, and reported by
    setting the repair-loop presence metric with the DUT's hostname,
    board, and pool as fields.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.  Not consulted by this
                      function.
    @param inventory  _LabInventory object with the inventory to be
                      reported.
    """
    loop_presence = metrics.BooleanMetric(
            'chromeos/autotest/inventory/repair_loops',
            'DUTs stuck in repair loops')
    logging.info('Scanning for DUTs in repair loops.')
    for board_counts in inventory.by_board.itervalues():
        for dut_history in board_counts.get_working_list():
            hostname = dut_history.hostname
            # Managed DUTs with names that don't match
            # _HOSTNAME_PATTERN shouldn't be possible.  However, we
            # don't want arbitrary strings being attached to the
            # 'dut_hostname' field, so for safety, we exclude all
            # anomalies.
            if not _HOSTNAME_PATTERN.match(hostname):
                continue
            if not _dut_in_repair_loop(dut_history):
                continue
            fields = {
                'dut_hostname': dut_history.hostname,
                'board': dut_history.host_board,
                'pool': dut_history.host_pool,
            }
            logging.info('Looping DUT: %(dut_hostname)s, '
                         'board: %(board)s, pool: %(pool)s',
                         fields)
            loop_presence.set(True, fields=fields)
def _log_startup(arguments, startup_time):
"""Log the start of this inventory run.
Print various log messages indicating the start of the run. Return
a string based on `startup_time` that will be used to identify this
run in log files and e-mail messages.
@param startup_time A UNIX timestamp marking the moment when
this inventory run began.
@returns A timestamp string that will be used to identify this run
in logs and email output.
"""
timestamp = time.strftime('%Y-%m-%d.%H',
time.localtime(startup_time))
logging.debug('Starting lab inventory for %s', timestamp)
if arguments.board_notify:
if arguments.recommend:
logging.debug('Will include repair recommendations')
logging.debug('Will include board inventory')
if arguments.pool_notify:
logging.debug('Will include pool inventory')
return timestamp
def _create_inventory(arguments, end_time):
    """Create the `_LabInventory` instance to use for reporting.

    The inventory covers the `arguments.duration` hours that end at
    `end_time`, restricted to `arguments.boardnames`.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    @param end_time   A UNIX timestamp for the end of the time range
                      to be searched in this inventory run.

    @returns The newly created `_LabInventory` object.
    """
    # `--duration` is given in hours; the search range is in seconds.
    window_seconds = arguments.duration * 60 * 60
    range_start = end_time - window_seconds
    afe = frontend_wrappers.RetryingAFE(server=None)
    lab = _LabInventory.create_inventory(
            afe, range_start, end_time, arguments.boardnames)
    logging.info('Found %d hosts across %d boards',
                 lab.get_num_duts(),
                 lab.get_num_boards())
    return lab
def _perform_inventory_reports(arguments):
    """Run every inventory report requested on the command line.

    Builds the inventory once, then runs the board report, the pool
    report, and the repair loop report, in that order, each only if
    the corresponding command-line option asks for it.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.
    """
    # A single start time both labels the run and bounds the history
    # search, so the two always agree.
    started_at = time.time()
    run_label = _log_startup(arguments, started_at)
    lab = _create_inventory(arguments, started_at)
    if arguments.debug:
        _populate_board_counts(lab)
    if arguments.board_notify:
        _perform_board_inventory(arguments, lab, run_label)
    if arguments.pool_notify:
        _perform_pool_inventory(arguments, lab, run_label)
    if arguments.repair_loops:
        _perform_repair_loop_report(arguments, lab)
def _separate_email_addresses(address_list):
"""Parse a list of comma-separated lists of e-mail addresses.
@param address_list A list of strings containing comma
separate e-mail addresses.
@return A list of the individual e-mail addresses.
"""
newlist = []
for arg in address_list:
newlist.extend([email.strip() for email in arg.split(',')])
return newlist
def _verify_arguments(arguments):
    """Validate command-line arguments.

    The `--board-notify` and `--pool-notify` values, which may each be
    given several times and may each hold comma separated addresses,
    are flattened in place into plain lists of addresses.

    Outside of debug mode, at least one of the two notification lists
    must be non-empty.  In debug mode, if neither is given, both are
    filled with a placeholder so that every report runs.

    On failure an error message is written to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`.

    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.
    """
    arguments.board_notify = _separate_email_addresses(
            arguments.board_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    if arguments.board_notify or arguments.pool_notify:
        return True
    if arguments.debug:
        # We want to run all the reports.  An empty notify list
        # will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.board_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must specify at least one of '
                     '--board-notify or --pool-notify\n')
    return False
def _get_default_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default is the `_LOGDIR` entry under the parent of the
    directory that contains this script.

    @param script  Path to this script file.

    @return A path to a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)
def _parse_command(argv):
    """Parse and validate the command line arguments.

    Builds the argument parser for this command, parses `argv`, and
    then checks the result with `_verify_arguments()`.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.

    @return The result of ArgumentParser.parse_args(), or `None` if
            the parsed arguments fail validation.
    """
    cli = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    cli.add_argument(
            '-d', '--duration',
            type=int,
            default=_DEFAULT_DURATION,
            metavar='HOURS',
            help='number of hours back to search for status'
                 ' (default: %d)' % _DEFAULT_DURATION)
    # The notify options may be repeated; each occurrence is appended,
    # and comma separated values are split later during validation.
    cli.add_argument(
            '--board-notify',
            action='append',
            default=[],
            metavar='ADDRESS',
            help='Generate board inventory message, '
                 'and send it to the given e-mail address(es)')
    cli.add_argument(
            '--pool-notify',
            action='append',
            default=[],
            metavar='ADDRESS',
            help='Generate pool inventory message, '
                 'and send it to the given address(es)')
    cli.add_argument(
            '-r', '--recommend',
            type=int,
            default=None,
            help=('Specify how many DUTs should be '
                  'recommended for repair (default: no '
                  'recommendation)'))
    cli.add_argument(
            '--repair-loops',
            action='store_true',
            help='Check for devices stuck in repair loops.')
    cli.add_argument(
            '--debug',
            action='store_true',
            help='Print e-mail messages on stdout '
                 'without sending them.')
    cli.add_argument(
            '--logdir',
            default=_get_default_logdir(argv[0]),
            help='Directory where logs will be written.')
    cli.add_argument(
            'boardnames',
            nargs='*',
            metavar='BOARD',
            help='names of boards to report on '
                 '(default: all boards)')
    parsed = cli.parse_args(argv[1:])
    return parsed if _verify_arguments(parsed) else None
def _configure_logging(arguments):
"""Configure the `logging` module for our needs.
How we log depends on whether the `--debug` option was provided on
the command line.
* Without the option, we configure the logging to capture all
potentially relevant events in a log file. The log file is
configured to rotate once a week on Friday evening, preserving
~3 months worth of history.
* With the option, we expect stdout to contain other
human-readable output (including the contents of the e-mail
messages), so we restrict the output to INFO level.
For convenience, when `--debug` is on, the logging format has
no adornments, so that a call like `logging.info(msg)` simply writes
`msg` to stdout, plus a trailing newline.
@param arguments Command-line arguments as returned by
`ArgumentParser`
"""
root_logger = logging.getLogger()
if arguments.debug:
root_logger.setLevel(logging.INFO)
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter())
else:
if not os.path.exists(arguments.logdir):
os.mkdir(arguments.logdir)
root_logger.setLevel(logging.DEBUG)
logfile = os.path.join(arguments.logdir, _LOGFILE)
handler = logging.handlers.TimedRotatingFileHandler(
logfile, when='W4', backupCount=13)
formatter = logging.Formatter(_LOG_FORMAT,
time_utils.TIME_FMT)
handler.setFormatter(formatter)
# TODO(jrbarnette) This is gross. Importing client.bin.utils
# implicitly imported logging_config, which calls
# logging.basicConfig() *at module level*. That gives us an
# extra logging handler that we don't want. So, clear out all
# the handlers here.
for h in root_logger.handlers:
root_logger.removeHandler(h)
root_logger.addHandler(handler)
def main(argv):
    """Standard main routine.

    Parses the command line, configures logging, and runs the
    requested inventory reports.  Exits with status 1 if the command
    line is rejected.  A keyboard interrupt ends the run quietly; any
    other exception raised by the reports is logged rather than
    propagated.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        if arguments.debug:
            # Debug runs suppress metrics reporting, so they skip the
            # ts_mon setup altogether.
            _perform_inventory_reports(arguments)
        else:
            ts_mon_state = site_utils.SetupTsMonGlobalState(
                    'repair_loops', short_lived=True, auto_flush=False)
            with ts_mon_state:
                _perform_inventory_reports(arguments)
    except KeyboardInterrupt:
        pass
    except EnvironmentError as err:
        logging.exception('Unexpected OS error: %s', err)
    except Exception as err:
        logging.exception('Unexpected exception: %s', err)
def get_inventory(afe):
    """Create a lab inventory covering the most recent 24 hours.

    @param afe  AFE object passed through to
                `_LabInventory.create_inventory()`.

    @returns The object returned by
             `_LabInventory.create_inventory()`.
    """
    now = int(time.time())
    one_day = 24 * 60 * 60
    return _LabInventory.create_inventory(afe, now - one_day, now)
def get_managed_boards(afe):
    """Return the managed boards from a fresh 24-hour inventory.

    @param afe  AFE object used to create the inventory.

    @returns The result of calling `get_managed_boards()` on the
             inventory returned by `get_inventory()`.
    """
    inventory = get_inventory(afe)
    return inventory.get_managed_boards()
# Run the inventory only when this file is executed as a script.
# main() expects the full argument vector, including the program name.
if __name__ == '__main__':
    main(sys.argv)