# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import ast, datetime, logging
import common
from autotest_lib.client.common_lib import base_job
from autotest_lib.client.common_lib import error, utils
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.server.cros.dynamic_suite import constants
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.cros.dynamic_suite import job_status
from autotest_lib.server.cros.dynamic_suite import reimager
from autotest_lib.server.cros.dynamic_suite import tools
from autotest_lib.server.cros.dynamic_suite.suite import Suite
from autotest_lib.tko import utils as tko_utils
"""CrOS dynamic test suite generation and execution module.
This module implements runtime-generated test suites for CrOS.
Design doc: http://goto.google.com/suitesv2
Individual tests can declare themselves as a part of one or more
suites, and the code here enables control files to be written
that can refer to these "dynamic suites" by name. We also provide
support for reimaging devices with a given build and running a
dynamic suite across all reimaged devices.
The public API for defining a suite includes one method: reimage_and_run().
A suite control file can be written by importing this module and making
an appropriate call to this single method. In normal usage, this control
file will be run in a 'hostless' server-side autotest job, scheduling
sub-jobs to do the needed reimaging and test running.
Example control file:
import common
from autotest_lib.server.cros.dynamic_suite import dynamic_suite
dynamic_suite.reimage_and_run(
build=build, board=board, name='bvt', job=job, pool=pool,
check_hosts=check_hosts, add_experimental=True, num=num,
skip_reimage=dynamic_suite.skip_reimage(globals()),
devserver_url=devserver_url)
This will -- at runtime -- find all control files that contain "bvt" in their
"SUITE=" clause, schedule jobs to reimage |num| or fewer devices in the
specified pool of the specified board with the specified build and, upon
completion of those jobs, schedule and wait for jobs that run all the tests it
discovered across those |num| or fewer machines.
Suites can be run by using the atest command-line tool:
atest suite create -b <board> -i <build/name> <suite>
e.g.
atest suite create -b x86-mario -i x86-mario/R20-2203.0.0 bvt
-------------------------------------------------------------------------
Implementation details
In addition to the create_suite_job() RPC defined in the autotest frontend,
there are two main classes defined here: Suite and {Os,Fw}Reimager.
A Suite instance represents a single test suite, defined by some predicate
run over all known control files. The simplest example is creating a Suite
by 'name'.
The Reimager classes provide support for reimaging a heterogeneous set
of devices with an appropriate build, in preparation for a test run.
One could use a single Reimager, followed by the instantiation and use
of multiple Suite objects.
create_suite_job() takes the parameters needed to define a suite run (board,
build to test, machine pool, and which suite to run), ensures important
preconditions are met, finds the appropriate suite control file, and then
schedules the hostless job that will do the rest of the work.
reimage_and_run() works by creating a Reimager, using it to perform the
requested installs, and then instantiating a Suite and running it on the
machines that were just reimaged. We'll go through this process in stages.
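At a high level, the flow (implemented in _perform_reimage_and_run() below)
looks roughly like the following sketch, which omits error handling, keyval
bookkeeping, the skip_reimage path, and several optional arguments; 'record'
stands for the suite job's record_entry callback:

    reimager.schedule(build, pool, devserver, record, check_hosts,
                      tests_to_skip, dependencies, num=num)
    devserver.finish_download(build)        # block until artifacts are staged
    suite = Suite.create_from_name_and_blacklist(
        name, tests_to_skip, build, devserver, afe=afe, tko=tko)
    suite.schedule(record, add_experimental)
    if reimager.wait(build, pool, record, check_hosts, tests_to_skip,
                     dependencies):
        suite.wait(record, bug_template)
    else:
        suite.abort()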
Note that we have more than one Dev server in our test lab architecture.
We currently load-balance per build being tested, so one and only one dev
server is used by any given run through the reimaging/testing flow.
- create_suite_job()
The primary role of create_suite_job() is to ensure that the required
artifacts for the build to be tested are staged on the dev server. This
includes payloads required to autoupdate machines to the desired build, as
well as the autotest control files appropriate for that build. Then, the
RPC pulls the control file for the suite to be run from the dev server and
uses it to create the suite job with the autotest frontend.
+----------------+
| Google Storage | Client
+----------------+ |
| ^ | create_suite_job()
payloads/ | | |
control files | | request |
V | V
+-------------+ download request +--------------------------+
| |<----------------------| |
| Dev Server | | Autotest Frontend (AFE) |
| |---------------------->| |
+-------------+ suite control file +--------------------------+
|
V
Suite Job (hostless)
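In pseudocode, the RPC does roughly the following. This is an illustrative
sketch only: the devserver staging and control-file calls, and the control
file path, are assumptions rather than a copy of the real RPC implementation.

    devserver = dev_server.ImageServer.resolve(build)
    devserver.trigger_download(build)   # assumed staging call: payloads + control files
    suite_control = devserver.get_control_file(
        build, 'test_suites/control.%s' % suite_name)   # path is illustrative
    # Finally, create a hostless server job on the AFE whose control file is
    # |suite_control|; that job calls reimage_and_run() as shown earlier.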
- Reimage and Run
The overall process followed by the suite is to schedule the reimage job,
schedule all the tests, wait for the reimage job, and then wait for the tests
to complete. This is advantageous because it allows the tests to start
running as soon as one of the reimage jobs completes. We still need to
synchronously wait for the reimaging to complete so that we can
a) enforce a reimaging timeout
b) fail tests that, because of DEPENDENCIES, require a specific machine
that failed to reimage properly.
After this, we can sit around and wait for the suite run to finish.
- The Reimaging Process
In short, the Reimager can schedule and wait for a number of update 'test' jobs
that perform image or firmware installation and make sure the device comes back
up. It labels the machines that it reimages with the newly-installed version,
so that later steps of running tests can refer to the machines by version and
board, instead of having to keep track of hostnames or some such. As soon as
the machines finish reimaging, they will be available for tests to be run on
them.
The number of machines to use is called the 'sharding_factor', and the default
is defined in the [CROS] section of global_config.ini. This can be overridden
by passing a 'num=N' parameter to create_suite_job(), which is piped through
to reimage_and_run() just like the 'build' and 'board' parameters are.
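For reference, the default can be read with the standard global_config helper;
the section and key names shown here follow the description above and are
illustrative:

    from autotest_lib.client.common_lib import global_config
    sharding_factor = global_config.global_config.get_config_value(
        'CROS', 'sharding_factor', type=int)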
A test control file can specify a list of DEPENDENCIES, which are really just
the set of labels a host needs to have in order for that test to be scheduled
on it. In the case of a dynamic_suite, many tests in the suite may have
DEPENDENCIES specified. They may overlap, they may not. This complicates
reimaging, as we need to try to cover the DEPENDENCIES of all tests in the
suite while still staying within the sharding_factor.
A lengthier discussion is available at:
http://goto.google.com/dynamic-suites-deps
For now, we use a naive algorithm to find a set of machines that can
satisfy all the test DEPENDENCIES in a given suite.
Per-test-per-suite DEPENDENCIES information is generated at build
time, and then fetched and consumed during suite preparation. If info
exists for the suite being run, the infrastructure will look for a
host that matches each unique DEPENDENCY list. If it can find such a
set that is <= the suite sharding factor, the infrastructure will
schedule those machines for reimaging. If not, the suite will fail.
If a suite has no DEPENDENCIES info, we just do the standard meta_host-based
scheduling.
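The staged DEPENDENCIES info is a Python-literal dictionary mapping suite name
to a per-test dependency map (this is what _gatherAndParseDependencies() below
parses). An illustrative example with made-up test and label names:

    {
        'bvt': {
            'login_LoginSuccess': [],
            'network_WiFiSecMat': ['wificell'],
        },
        'power': {
            'power_Resume': ['rps'],
        },
    }

A suite_dependencies string such as 'carrier:tmobile, rps' is split on commas
and appended to every per-test dependency list for the suite being run.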
- Scheduling Reimaging
Step by step:
0) Fetch DEPENDENCIES info for the suite to be run.
1) Process the DEPENDENCIES with whatever board and device 'pool' are
specified to create a list of unique dependency lists.
2) Determine what hosts we need to reimage.
2a) If we have no DEPENDENCIES, set up a meta_host that specifies N hosts of
the right board in the right pool and move on.
2b) If we do, query the AFE to find at least one live host that matches
each dependency list. If some list has no matching hosts, bail.
If we have more lists than our sharding_factor, bail.
3) Schedule autoupdate 'tests' across chosen devices.
- Technically, one job that has N tests across N hosts.
- This 'test' is in server/site_tests/autoupdate/control
- The control file is modified at runtime to inject the name of the build
to install, and the URL to get said build from.
- This is the _TOT_ version of the autoupdate test; it must be able to run
successfully on all currently supported branches at all times.
4) Label successfully reimaged devices with a 'cros-version' label
- This is actually done by the autoupdate 'test' control file.
5) Add a host attribute ('job_repo_url') to each reimaged host indicating
the URL where packages should be downloaded for subsequent tests
- This is actually done by the autoupdate 'test' control file
- This information is consumed in server/site_autotest.py
- job_repo_url points to some location on the dev server, where build
artifacts are staged -- including autotest packages.
6) Return success if the reimage job has been kicked off successfully.
+------------+ +--------------------------+
| | | |
| Dev Server | | Autotest Frontend (AFE) |
| | | [Suite Job] |
+------------+ +--------------------------+
| payloads | | | |
V V autoupdate test | | |
+--------+ +--------+ <-----+----------------+ | |
| Host 1 |<------| Host 2 |-------+ | |
+--------+ +--------+ label | |
VersLabel VersLabel <-----------------------+ |
job_repo_url job_repo_url <-----------------------------+
host-attribute
To sum up, after re-imaging, we have the following assumptions:
- At most |num| devices of type |board| have |build| installed.
- These devices are labeled appropriately
- They have a host attribute called 'job_repo_url' dictating where autotest
packages can be downloaded for test runs.
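A very rough sketch of the host selection described in steps 0) through 2b)
above; the AFE query shape and the bail-out handling are simplified
illustrations (the real code lives in the reimager/host-spec modules and
raises dedicated dynamic-suite exceptions):

    unique_deps = set(tuple(sorted(deps)) for deps in dependencies.values())
    if unique_deps == set([()]):
        hosts = None     # no DEPENDENCIES anywhere: plain meta_host scheduling
    elif len(unique_deps) > sharding_factor:
        raise Exception('more unique dependency lists than shards')  # suite fails
    else:
        hosts = []
        for deps in unique_deps:
            # board/pool are label names, e.g. 'board:x86-mario', 'pool:bvt';
            # the real code also screens out dead or repair-failed hosts.
            live = afe.get_hosts(multiple_labels=[board, pool] + list(deps))
            if not live:
                raise Exception('no live host satisfies %r' % (deps,))
            hosts.append(live[0])    # reimage one matching host per dep list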
- Scheduling Suites
A Suite instance uses the labels created by the Reimager to schedule test jobs
across all the hosts that were just reimaged. It then waits for all these jobs.
As an optimization, the Dev server stages the payloads necessary to run a suite
in the background _after_ it has completed all the things necessary for
reimaging. Before running a suite, reimage_and_run() calls out to the Dev
server and blocks until it's completed staging all build artifacts needed to
run test suites.
Step by step:
0) At instantiation time, find all appropriate control files for this suite
that were included in the build to be tested. To do this, we consult the
Dev Server, where all these control files are staged.
+------------+ control files? +--------------------------+
| |<----------------------| |
| Dev Server | | Autotest Frontend (AFE) |
| |---------------------->| [Suite Job] |
+------------+ control files! +--------------------------+
1) Now that the Suite instance exists, it schedules jobs for every control
file it deemed appropriate, to be run on the hosts that were labeled
by the Reimager. We stuff keyvals into these jobs, indicating what
build they were testing and which suite they were for.
+--------------------------+ Job for VersLabel +--------+
| |------------------------>| Host 1 | VersLabel
| Autotest Frontend (AFE) | +--------+ +--------+
| [Suite Job] |----------->| Host 2 |
+--------------------------+ Job for +--------+
| ^ VersLabel VersLabel
| |
+----------------+
One job per test
{'build': build/name,
'suite': suite_name}
2) Now that all jobs are scheduled, they'll be doled out as labeled hosts
finish their assigned work and become available again.
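Conceptually, step 1) amounts to one AFE job per test, scheduled against the
version label rather than any particular hostname. A simplified sketch; the
per-test attributes and the exact create_job arguments are assumptions about
the AFE client, and the real logic lives in Suite.schedule():

    for test in suite_control_files:    # the control files found in step 0)
        afe.create_job(
            control_file=test.text,
            name='%s/%s/%s' % (build, suite_name, test.name),
            meta_hosts=[version_label],   # e.g. 'cros-version:<build>'
            dependencies=test.dependencies,
            keyvals={'build': build, 'suite': suite_name})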
- Waiting on Reimaging
0) Wait for the reimage job to get assigned hosts and kicked off.
1) If we don't get enough hosts by a certain time, fail the rest of the hosts.
2) Wait for all reimaging to run to completion.
3) If we can no longer fulfill the DEPENDENCIES requirements of some tests
because reimaging hosts fulfilling those HostSpecs failed, then fail the
tests.
- Waiting on Suites
0) As we clean up each test job, we check to see if any crashes occurred. If
they did, we look at the 'build' keyval in the job to see which build's debug
symbols we'll need to symbolicate the crash dump we just found.
1) Using this info, we tell a special Crash Server to stage the required debug
symbols. Once that's done, we ask the Crash Server to use those symbols to
symbolicate the crash dump in question.
+----------------+
| Google Storage |
+----------------+
| ^
symbols! | | symbols?
V |
+------------+ stage symbols for build +--------------------------+
| |<--------------------------| |
| Crash | | |
| Server | dump to symbolicate | Autotest Frontend (AFE) |
| |<--------------------------| [Suite Job] |
| |-------------------------->| |
+------------+ symbolicated dump +--------------------------+
2) As jobs finish, we record their success or failure in the status of the suite
job. We also record a 'job keyval' in the suite job for each test, noting
the job ID and job owner. This can be used to refer to test logs later.
3) Once all jobs are complete, status is recorded for the suite job, and the
job_repo_url host attribute is removed from all hosts used by the suite.
"""
DEFAULT_TRY_JOB_TIMEOUT_MINS = tools.try_job_timeout_mins()
# Relevant CrosDynamicSuiteExceptions are defined in client/common_lib/error.py.
class SuiteSpec(object):
"""
This class contains the info that defines a suite run.
Currently required:
@var build: the build to install e.g.
x86-alex-release/R18-1655.0.0-a1-b1584.
@var board: which kind of devices to reimage.
@var devserver: An instance of the devserver to use with this suite.
@var name: a value of the SUITE control file variable to search for.
@var job: an instance of client.common_lib.base_job representing the
currently running suite job.
Currently supported optional fields:
@var pool: specify the pool of machines to use for scheduling purposes.
Default: None
@var num: the maximum number of devices to reimage.
Default in global_config
@var check_hosts: require appropriate hosts to be available now.
@var skip_reimage: skip reimaging, used for testing purposes.
Default: False
@var add_experimental: schedule experimental tests as well, or not.
Default: True
@var dependencies: map of test names to dependency lists.
Initially {'': []}.
    @var suite_dependencies: A string with a comma separated list of suite
                             level dependencies, which act just like test
                             dependencies and are appended to each test's
                             set of dependencies at job creation time.
"""
def __init__(self, build=None, board=None, name=None, job=None,
pool=None, num=None, check_hosts=True,
skip_reimage=False, add_experimental=True, file_bugs=False,
max_runtime_mins=24*60, firmware_reimage=False,
try_job_timeout_mins=DEFAULT_TRY_JOB_TIMEOUT_MINS,
suite_dependencies=None,
reimage_type=constants.REIMAGE_TYPE_OS,
bug_template={}, devserver_url=None, **dargs):
"""
Vets arguments for reimage_and_run() and populates self with supplied
values.
Currently required args:
@param build: the build to install e.g.
x86-alex-release/R18-1655.0.0-a1-b1584.
@param board: which kind of devices to reimage.
@param name: a value of the SUITE control file variable to search for.
@param job: an instance of client.common_lib.base_job representing the
currently running suite job.
Currently supported optional args:
@param pool: specify the pool of machines to use for scheduling purposes
Default: None
@param num: the maximum number of devices to reimage.
Default in global_config
@param check_hosts: require appropriate hosts to be available now.
@param skip_reimage: skip reimaging, used for testing purposes.
Default: False
@param add_experimental: schedule experimental tests as well, or not.
Default: True
@param max_runtime_mins: Max runtime in mins for each of the sub-jobs
this suite will run.
@param firmware_reimage: True if we should use the FwReimager,
False if we should use OsReimager.
(This flag has now been deprecated in favor of
reimage_type.)
@param try_job_timeout_mins: Max time in mins we allow a try job to run
before timing out.
@param suite_dependencies: A string with a comma separated list of suite
level dependencies, which act just like test
dependencies and are appended to each test's
set of dependencies at job creation time.
@param reimage_type: A string identifying the type of reimaging that
should be done before running tests.
@param bug_template: A template dictionary specifying the default bug
filing options for failures in this suite.
@param devserver_url: url to the selected devserver.
@param **dargs: these arguments will be ignored. This allows us to
deprecate and remove arguments in ToT while not
breaking branch builds.
"""
required_keywords = {'build': str,
'board': str,
'name': str,
'job': base_job.base_job}
for key, expected in required_keywords.iteritems():
value = locals().get(key)
if not value or not isinstance(value, expected):
raise error.SuiteArgumentException(
"reimage_and_run() needs %s=<%r>" % (key, expected))
self.build = build
self.board = 'board:%s' % board
if devserver_url:
self.devserver = dev_server.ImageServer(devserver_url)
else:
self.devserver = dev_server.ImageServer.resolve(self.build)
self.name = name
self.job = job
if pool:
self.pool = 'pool:%s' % pool
else:
self.pool = pool
self.num = num
self.check_hosts = check_hosts
self.skip_reimage = skip_reimage
self.add_experimental = add_experimental
self.file_bugs = file_bugs
self.dependencies = {'': []}
self.max_runtime_mins = max_runtime_mins
self.firmware_reimage = firmware_reimage
self.try_job_timeout_mins = try_job_timeout_mins
self.suite_dependencies = suite_dependencies
self.reimage_type = reimage_type
self.bug_template = bug_template
def skip_reimage(g):
"""
Pulls the SKIP_IMAGE value out of a global variables dictionary.
@param g: The global variables dictionary.
    @return: Value associated with SKIP_IMAGE.
"""
return g.get('SKIP_IMAGE')
def reimage_and_run(**dargs):
"""
Backward-compatible API for dynamic_suite.
Will re-image a number of devices (of the specified board) with the
provided build, and then run the indicated test suite on them.
Guaranteed to be compatible with any build from stable to dev.
@param dargs: Dictionary containing the arguments listed below.
Currently required args:
@param build: the build to install e.g.
x86-alex-release/R18-1655.0.0-a1-b1584.
@param board: which kind of devices to reimage.
@param name: a value of the SUITE control file variable to search for.
@param job: an instance of client.common_lib.base_job representing the
currently running suite job.
Currently supported optional args:
@param pool: specify the pool of machines to use for scheduling purposes.
Default: None
@param num: the maximum number of devices to reimage.
Default in global_config
@param check_hosts: require appropriate hosts to be available now.
@param skip_reimage: skip reimaging, used for testing purposes.
Default: False
@param add_experimental: schedule experimental tests as well, or not.
Default: True
@param file_bugs: automatically file bugs on test failures.
Default: False
@param suite_dependencies: A string with a comma separated list of suite
level dependencies, which act just like test
dependencies and are appended to each test's
set of dependencies at job creation time.
    @param reimage_type: A string indicating what type of reimaging the suite
                         wishes to have done to the machines it will run on.
                         Valid arguments are given as constants in this file.
@param devserver_url: url to the selected devserver.
@raises AsynchronousBuildFailure: if there was an issue finishing staging
from the devserver.
@raises MalformedDependenciesException: if the dependency_info file for
the required build fails to parse.
"""
suite_spec = SuiteSpec(**dargs)
    # Horrible hacks to handle backwards compatibility; the overall goal here is
    #   firmware_reimage == True -> Firmware
    #   firmware_reimage == False AND reimage_type == None -> OS
    #   firmware_reimage == False AND reimage_type != None -> reimage_type
    # and once we've set reimage_type right, ignore that firmware_reimage
    # has ever existed...
    # Remove all this code and firmware_reimage once R26 falls off stable.
if suite_spec.firmware_reimage:
suite_spec.reimage_type = constants.REIMAGE_TYPE_FIRMWARE
logging.warning("reimage_and_run |firmware_reimage=True| argument "
"has been deprecated. Please use |reimage_type='firmware'| "
"instead.")
elif suite_spec.reimage_type is None:
suite_spec.reimage_type = constants.REIMAGE_TYPE_OS
suite_spec.firmware_reimage = False
# </backwards_compatibility_hacks>
suite_spec.dependencies = _gatherAndParseDependencies(suite_spec)
logging.debug('Full dependency dictionary: %s', suite_spec.dependencies)
afe = frontend_wrappers.RetryingAFE(timeout_min=30, delay_sec=10,
user=suite_spec.job.user, debug=False)
tko = frontend_wrappers.RetryingTKO(timeout_min=30, delay_sec=10,
user=suite_spec.job.user, debug=False)
try:
reimager_class = reimager.reimager_for(suite_spec.reimage_type)
except KeyError:
raise error.UnknownReimageType("%s not recognized reimage_type" %
suite_spec.reimage_type)
imager = reimager_class(suite_spec.job.autodir, suite_spec.board, afe,
tko, results_dir=suite_spec.job.resultdir)
try:
my_job_id = int(tko_utils.get_afe_job_id(dargs['job'].tag))
logging.debug('Determined own job id: %d', my_job_id)
except ValueError:
my_job_id = None
logging.warning('Could not determine own job id.')
_perform_reimage_and_run(suite_spec, afe, tko, imager,
suite_job_id=my_job_id)
imager.clear_reimaged_host_state(suite_spec.build)
def _gatherAndParseDependencies(suite_spec):
"""Gather dependecy info for all suite jobs from image server, and
combine this with suite dependencies to return a dictionary of per-test
dependencies.
@param suite_spec: A SuiteSpec object with (at least) populated
name, build, and devserver fields. Used to retrieve
dependency info for the suite from devserver.
@return A dictionary mapping test name (string) to per-test dependencies
(list of strings)
"""
# Gather per-suite:per-test DEPENDENCIES info, if this build has it.
all_dependencies = {}
try:
all_dependencies = ast.literal_eval(
suite_spec.devserver.get_dependencies_file(suite_spec.build))
except SyntaxError as e:
if e.text:
message = '%s has bad syntax in DEPENDENCIES info: %r' % (
suite_spec.build, e)
else:
message = ('%s has a 0 byte DEPENDENCIES info file. Check the '
'build artifacts.') % suite_spec.build
logging.error(message)
raise error.MalformedDependenciesException(message)
except ValueError as e:
message = '%s has malformed DEPENDENCIES info: %r' % (suite_spec.build,
e)
logging.error(message)
raise error.MalformedDependenciesException(message)
except dev_server.DevServerException:
# Not all builds have dependency info at this time, which is OK.
logging.info('Proceeding without DEPENDENCIES information.')
dep_dict = all_dependencies.get(suite_spec.name, {'': []})
# Parse the suite_dependency string into a list of dependency labels,
# then append this list of suite dependencies to all individual job
# dependency lists.
if suite_spec.suite_dependencies:
suite_deplist = [deplabel.strip(' ') for deplabel in
suite_spec.suite_dependencies.split(',')]
for deplist in dep_dict.values():
deplist.extend(suite_deplist)
return dep_dict
def _perform_reimage_and_run(spec, afe, tko, reimager, suite_job_id=None):
"""
Do the work of reimaging hosts and running tests.
@param spec: a populated SuiteSpec object.
@param afe: an instance of AFE as defined in server/frontend.py.
@param tko: an instance of TKO as defined in server/frontend.py.
@param reimager: the Reimager to use to reimage DUTs.
@param suite_job_id: Job id that will act as parent id to all sub jobs.
Default: None
"""
    # Kicking off the reimage job is now async, so we immediately continue on.
tests_to_skip = []
if not spec.skip_reimage:
if not reimager.schedule(spec.build, spec.pool, spec.devserver,
spec.job.record_entry, spec.check_hosts, tests_to_skip,
spec.dependencies, num=spec.num, suite_job_id=suite_job_id):
# Tryjob will be marked with WARN or ABORT by the reimager if it
# fails to launch the reimaging.
return
# However, we can't do anything else until the devserver has finished
# downloading all of the artifacts (primarily autotest.tar).
try:
spec.devserver.finish_download(spec.build)
except dev_server.DevServerException as e:
# If we can't run the suite, there's no point in reimaging machines.
reimager.abort()
raise error.AsynchronousBuildFailure(e)
timestamp = datetime.datetime.now().strftime(job_status.TIME_FMT)
utils.write_keyval(
spec.job.resultdir,
{constants.ARTIFACT_FINISHED_TIME: timestamp})
# We need to do suite creation after devserver finishes downloading,
# as this is going to go poke at the control files which are
# only available after |finish_download| completes.
suite = Suite.create_from_name_and_blacklist(
spec.name, tests_to_skip, spec.build, spec.devserver,
afe=afe, tko=tko, pool=spec.pool,
results_dir=spec.job.resultdir,
max_runtime_mins=spec.max_runtime_mins,
version_prefix=reimager.version_prefix,
file_bugs=spec.file_bugs,
suite_job_id=suite_job_id)
    # Now we get to asynchronously schedule tests.
suite.schedule(spec.job.record_entry, spec.add_experimental)
# Unfortunately, due to having to be able to invalidate tests with
# DEPENDENCIES if the corresponding host fails to reimage, we do still
# need to synchronously wait for the reimage job to finish (or get killed).
if not spec.skip_reimage:
# It's also worthwhile to note that sbasi's tryjob timeout code is
# still in here, so if a HQE isn't fulfilled within a certain time,
# we will kill everything.
# This could lead to some weird interaction where we start running some
# tests because a machine becomes available, but then abort the suite
# because not enough machines became available.
reimage_successful = reimager.wait(
spec.build, spec.pool, spec.job.record_entry,
spec.check_hosts, tests_to_skip, spec.dependencies,
timeout_mins=spec.try_job_timeout_mins)
else:
reimage_successful = True
# Sit around and wait for some test results.
if reimage_successful:
suite.wait(spec.job.record_entry, spec.bug_template)
else:
suite.abort()