# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import compiler, datetime, hashlib, logging, os
import common
from autotest_lib.client.common_lib import control_data, global_config
from autotest_lib.client.common_lib import error, utils
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.server.cros.dynamic_suite import constants
from autotest_lib.server.cros.dynamic_suite import control_file_getter
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.cros.dynamic_suite import host_lock_manager, job_status
from autotest_lib.server.cros.dynamic_suite import tools
from autotest_lib.server.cros.dynamic_suite.job_status import Status
from autotest_lib.server import frontend
from autotest_lib.frontend.afe.json_rpc import proxy
class Reimager(object):
"""
A class that can run jobs to reimage devices.
@var _afe: a frontend.AFE instance used to talk to autotest.
@var _tko: a frontend.TKO instance used to query the autotest results db.
@var _cf_getter: a ControlFileGetter used to get the AU control file.
"""
JOB_NAME = 'try_new_image'
def __init__(self, autotest_dir, afe=None, tko=None, results_dir=None):
"""
Constructor
@param autotest_dir: the place to find autotests.
@param afe: an instance of AFE as defined in server/frontend.py.
@param tko: an instance of TKO as defined in server/frontend.py.
@param results_dir: The directory where the job can write results to.
                            This must be set if you want the job_ids of
                            sub-jobs listed in the job keyvals.
"""
self._afe = afe or frontend_wrappers.RetryingAFE(timeout_min=30,
delay_sec=10,
debug=False)
self._tko = tko or frontend_wrappers.RetryingTKO(timeout_min=30,
delay_sec=10,
debug=False)
self._results_dir = results_dir
self._reimaged_hosts = {}
self._cf_getter = control_file_getter.FileSystemGetter(
[os.path.join(autotest_dir, 'server/site_tests')])
def attempt(self, build, board, pool, devserver, record, check_hosts,
manager, num=None):
"""
Synchronously attempt to reimage some machines.
        Fire off attempts to reimage |num| machines of type |board|, using
        the image for |build| served by |devserver|. Wait for completion,
        polling every 10s, and log results with |record| upon completion.
        @param build: the build to install, e.g.
                      x86-alex-release/R18-1655.0.0-a1-b1584.
@param board: which kind of devices to reimage.
        @param pool: the pool of machines to use for scheduling purposes.
@param devserver: an instance of a devserver to use to complete this
call.
@param record: callable that records job status.
prototype:
record(base_job.status_log_entry)
@param check_hosts: require appropriate hosts to be available now.
        @param manager: a HostLockManager instance used to lock the DUTs
                        that we decide to reimage.
@param num: the maximum number of devices to reimage.
        @return True if all reimaging jobs succeed, False otherwise.
"""
if not num:
num = tools.sharding_factor()
logging.debug("scheduling reimaging across %d machines", num)
begin_time_str = datetime.datetime.now().strftime(job_status.TIME_FMT)
try:
self._ensure_version_label(constants.VERSION_PREFIX + build)
if check_hosts:
# TODO make DEPENDENCIES-aware
self._ensure_enough_hosts(board, pool, num)
# Schedule job and record job metadata.
# TODO make DEPENDENCIES-aware
canary_job = self._schedule_reimage_job(build, board, pool,
num, devserver)
self._record_job_if_possible(Reimager.JOB_NAME, canary_job)
logging.info('Created re-imaging job: %d', canary_job.id)
job_status.wait_for_jobs_to_start(self._afe, [canary_job])
logging.debug('Re-imaging job running.')
hosts = job_status.wait_for_and_lock_job_hosts(self._afe,
[canary_job],
manager)
logging.info('%r locked for reimaging.', hosts)
job_status.wait_for_jobs_to_finish(self._afe, [canary_job])
logging.debug('Re-imaging job finished.')
# Gather job results.
results = self.get_results(canary_job)
self._reimaged_hosts[build] = results.keys()
except error.InadequateHostsException as e:
logging.warning(e)
Status('WARN', Reimager.JOB_NAME, str(e),
begin_time_str=begin_time_str).record_all(record)
return False
except Exception as e:
# catch Exception so we record the job as terminated no matter what.
logging.error(e)
Status('ERROR', Reimager.JOB_NAME, str(e),
begin_time_str=begin_time_str).record_all(record)
return False
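        # No exception was raised: report the per-host Statuses through
        # |record| and return overall success/failure.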
return job_status.record_and_report_results(results.values(), record)
def get_results(self, canary_job):
"""
        Gather results for |canary_job|, in a map of Statuses indexed by host.
        Each host's Status is named Reimager.JOB_NAME-<host>, e.g.
        {'chromeos2-rack1': Status('GOOD', 'try_new_image-chromeos2-rack1')}
@param canary_job: a completed frontend.Job
@return a map of hostname: job_status.Status objects.
"""
return job_status.gather_per_host_results(self._afe,
self._tko,
[canary_job],
Reimager.JOB_NAME + '-')
def _ensure_enough_hosts(self, board, pool, num):
"""
Determine if there are enough working hosts to run on.
Raises exception if there are not enough hosts.
@param board: which kind of devices to reimage.
@param pool: the pool of machines to use for scheduling purposes.
@param num: the maximum number of devices to reimage.
@raises NoHostsException: if no working hosts.
@raises InadequateHostsException: if too few working hosts.
"""
labels = [l for l in [board, pool] if l is not None]
available = self._count_usable_hosts(labels)
if available == 0:
raise error.NoHostsException('All hosts with %r are dead!' % labels)
elif num > available:
raise error.InadequateHostsException(
'Too few hosts with %r' % labels)
def _count_usable_hosts(self, host_spec):
"""
Given a set of host labels, count the live hosts that have them all.
@param host_spec: list of labels specifying a set of hosts.
@return the number of live hosts that satisfy |host_spec|.
"""
count = 0
for host in self._afe.get_hosts(multiple_labels=host_spec):
if self._alive(host) and not self._incorrectly_locked(host):
count += 1
return count
def _alive(self, host):
"""
Given a host, determine if the host is alive.
@param host: Host instance (as in server/frontend.py)
        @return True if the host is not under, or in need of, repair;
                False otherwise.
"""
return host.status not in ['Repair Failed', 'Repairing']
def _incorrectly_locked(self, host):
"""
Given a host, determine if the host is locked by some user.
If the host is unlocked, or locked by the test infrastructure,
this will return False. Usernames defined as 'part of the test
infrastructure' are listed in global_config.ini under the [CROS]
section in the 'infrastructure_users' field.
@param host: Host instance (as in server/frontend.py)
@return False if the host is not locked, or locked by the infra.
True if the host is locked by someone we haven't blessed.
"""
return (host.locked and
host.locked_by not in tools.infrastructure_user_list())
def clear_reimaged_host_state(self, build):
"""
Clear per-host state created in the autotest DB for this job.
After reimaging a host, we label it and set some host attributes on it
that are then used by the suite scheduling code. This call cleans
that up.
@param build: the build whose hosts we want to clean up e.g.
x86-alex-release/R18-1655.0.0-a1-b1584.
"""
        for host in self._reimaged_hosts.get(build, []):
if not host.startswith('hostless'):
self._clear_build_state(host)
def _clear_build_state(self, machine):
"""
        Clear all build-specific labels and attributes from the target.
@param machine: the host to clear labels, attributes from.
"""
self._afe.set_host_attribute(constants.JOB_REPO_URL, None,
hostname=machine)
def _record_job_if_possible(self, test_name, job):
"""
Record job id as keyval, if possible, so it can be referenced later.
If |self._results_dir| is None, then this is a NOOP.
@param test_name: the test to record id/owner for.
@param job: the job object to pull info from.
"""
if self._results_dir:
job_id_owner = '%s-%s' % (job.id, job.owner)
utils.write_keyval(
self._results_dir,
{hashlib.md5(test_name).hexdigest(): job_id_owner})
def _ensure_version_label(self, name):
"""
Ensure that a label called |name| exists in the autotest DB.
@param name: the label to check for/create.
"""
try:
self._afe.create_label(name=name)
except proxy.ValidationError as ve:
if ('name' in ve.problem_keys and
'This value must be unique' in ve.problem_keys['name']):
logging.debug('Version label %s already exists', name)
else:
raise ve
def _schedule_reimage_job(self, build, board, pool, num_machines,
devserver):
"""
        Schedules the reimaging of |num_machines| |board| devices with |build|.
        Sends an RPC to the autotest frontend to enqueue reimaging jobs on
        |num_machines| devices of type |board|.
@param build: the build to install (must be unique).
@param board: which kind of devices to reimage.
@param pool: the pool of machines to use for scheduling purposes.
@param num_machines: the maximum number of devices to reimage.
@param devserver: an instance of devserver that DUTs should use to get
build artifacts from.
@return a frontend.Job object for the reimaging job we scheduled.
"""
image_url = tools.image_url_pattern() % (devserver.url(), build)
control_file = tools.inject_vars(
dict(image_url=image_url, image_name=build,
devserver_url=devserver.url()),
self._cf_getter.get_control_file_contents_by_name('autoupdate'))
job_deps = []
        if pool:
            meta_host = pool
            job_deps.append(board)
        else:
            # No pool specified; use the board label as the meta_host.
            meta_host = board
return self._afe.create_job(control_file=control_file,
name=build + '-try',
control_type='Server',
priority='Low',
meta_hosts=[meta_host] * num_machines,
dependencies=job_deps)