server/cros/dynamic_suite/dynamic_suite.py - mirrors/cros/chromiumos/third_party/autotest - Git at Google

 # Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 import datetime
 import logging

 import common

 from autotest_lib.client.common_lib import base_job
 from autotest_lib.client.common_lib import error
 from autotest_lib.client.common_lib import priorities
 from autotest_lib.client.common_lib import time_utils
 from autotest_lib.client.common_lib import utils
 from autotest_lib.client.common_lib.cros import dev_server
 from autotest_lib.server.cros import provision
 from autotest_lib.server.cros.dynamic_suite import constants
 from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
 from autotest_lib.server.cros.dynamic_suite import tools
 from autotest_lib.server.cros.dynamic_suite.suite import Suite
 from autotest_lib.tko import utils as tko_utils


 """CrOS dynamic test suite generation and execution module.

 This module implements runtime-generated test suites for CrOS.
 Design doc: http://goto.google.com/suitesv2

 Individual tests can declare themselves as a part of one or more
 suites, and the code here enables control files to be written
 that can refer to these "dynamic suites" by name.  We also provide
 support for reimaging devices with a given build and running a
 dynamic suite across all reimaged devices.

 The public API for defining a suite includes one method: reimage_and_run().
 A suite control file can be written by importing this module and making
 an appropriate call to this single method.  In normal usage, this control
 file will be run in a 'hostless' server-side autotest job, scheduling
 sub-jobs to do the needed reimaging and test running.

 Example control file:

 import common
 from autotest_lib.server.cros import provision
 from autotest_lib.server.cros.dynamic_suite import dynamic_suite

 dynamic_suite.reimage_and_run(
     build=build, board=board, name='bvt', job=job, pool=pool,
     check_hosts=check_hosts, add_experimental=True, num=num,
     devserver_url=devserver_url, version_prefix=provision.CROS_VERSION_PREFIX)

 This will -- at runtime -- find all control files that contain "bvt" in their
 "SUITE=" clause, schedule jobs to reimage |num| or fewer devices in the
 specified pool of the specified board with the specified build and, upon
 completion of those jobs, schedule and wait for jobs that run all the tests it
 discovered.

 Suites can be run by using the atest command-line tool:
   atest suite create -b <board> -i <build/name> <suite>
 e.g.
   atest suite create -b x86-mario -i x86-mario/R20-2203.0.0 bvt

 -------------------------------------------------------------------------
 Implementation details

 A Suite instance represents a single test suite, defined by some predicate
 run over all known control files.  The simplest example is creating a Suite
 by 'name'.

 create_suite_job() takes the parameters needed to define a suite run (board,
 build to test, machine pool, and which suite to run), ensures important
 preconditions are met, finds the appropriate suite control file, and then
 schedules the hostless job that will do the rest of the work.

 Note that we have more than one Dev server in our test lab architecture.
 We currently load balance per-build being tested, so one and only one dev
 server is used by any given run through the reimaging/testing flow.

 - create_suite_job()
 The primary role of create_suite_job() is to ensure that the required
 artifacts for the build to be tested are staged on the dev server.  This
 includes payloads required to autoupdate machines to the desired build, as
 well as the autotest control files appropriate for that build.  Then, the
 RPC pulls the control file for the suite to be run from the dev server and
 uses it to create the suite job with the autotest frontend.

      +----------------+
      | Google Storage |                                Client
      +----------------+                                   |
                | ^                                        | create_suite_job()
  payloads/     | |                                        |
  control files | | request                                |
                V |                                        V
        +-------------+   download request    +--------------------------+
        |             |<----------------------|                          |
        | Dev Server  |                       | Autotest Frontend (AFE)  |
        |             |---------------------->|                          |
        +-------------+  suite control file   +--------------------------+
                                                           |
                                                           V
                                                       Suite Job (hostless)

 - Reimage and Run
 The overall process is to schedule all the tests, and then wait for the tests
 to complete.

 - The Reimaging Process

 As an artifact of an old implementation, the number of machines to use
 is called the 'sharding_factor', and the default is defined in the [CROS]
 section of global_config.ini.  This can be overridden by passing a 'num=N'
 parameter to create_suite_job(), which is piped through to reimage_and_run()
 just like the 'build' and 'board' parameters are.  With provisioning, this
 machine accounting has been neither implemented nor removed; 'num' is still
 passed around, as it might be used one day.

 A test control file can specify a list of DEPENDENCIES, which are really just
 the set of labels a host needs to have in order for that test to be scheduled
 on it.  In the case of a dynamic_suite, many tests in the suite may have
 DEPENDENCIES specified.  All tests are scheduled with the DEPENDENCIES that
 they specify, along with any suite dependencies that were specified, and the
 scheduler will find and provision a host capable of running the test.

 - Scheduling Suites
 A Suite instance uses the labels specified in the suite dependencies to
 schedule tests across all the hosts in the pool.  It then waits for all these
 jobs.  As an optimization, the Dev server stages the payloads necessary to
 run a suite in the background _after_ it has completed all the things
 necessary for reimaging.  Before running a suite, reimage_and_run() calls out
 to the Dev server and blocks until it's completed staging all build artifacts
 needed to run test suites.

 Step by step:
 0) At instantiation time, find all appropriate control files for this suite
    that were included in the build to be tested.  To do this, we consult the
    Dev Server, where all these control files are staged.

           +------------+    control files?     +--------------------------+
           |            |<----------------------|                          |
           | Dev Server |                       | Autotest Frontend (AFE)  |
           |            |---------------------->|       [Suite Job]        |
           +------------+    control files!     +--------------------------+

 1) Now that the Suite instance exists, it schedules jobs for every control
    file it deemed appropriate, to be run on the hosts that were labeled
    by the provisioning.  We stuff keyvals into these jobs, indicating what
    build they were testing and which suite they were for.

    +--------------------------+ Job for VersLabel       +--------+
    |                          |------------------------>| Host 1 | VersLabel
    | Autotest Frontend (AFE)  |            +--------+   +--------+
    |       [Suite Job]        |----------->| Host 2 |
    +--------------------------+ Job for    +--------+
        |                ^       VersLabel        VersLabel
        |                |
        +----------------+
         One job per test
         {'build': build/name,
          'suite': suite_name}

 2) Now that all jobs are scheduled, they'll be doled out as labeled hosts
    finish their assigned work and become available again.

 - Waiting on Suites
 0) As we clean up each test job, we check to see if any crashes occurred.  If
    they did, we look at the 'build' keyval in the job to see which build's debug
    symbols we'll need to symbolicate the crash dump we just found.

 1) Using this info, we tell a special Crash Server to stage the required debug
    symbols. Once that's done, we ask the Crash Server to use those symbols to
    symbolicate the crash dump in question.

      +----------------+
      | Google Storage |
      +----------------+
           |     ^
  symbols! |     | symbols?
           V     |
       +------------+  stage symbols for build  +--------------------------+
       |            |<--------------------------|                          |
       |   Crash    |                           |                          |
       |   Server   |   dump to symbolicate     | Autotest Frontend (AFE)  |
       |            |<--------------------------|       [Suite Job]        |
       |            |-------------------------->|                          |
       +------------+    symbolicated dump      +--------------------------+

 2) As jobs finish, we record their success or failure in the status of the suite
    job.  We also record a 'job keyval' in the suite job for each test, noting
    the job ID and job owner.  This can be used to refer to test logs later.
 3) Once all jobs are complete, status is recorded for the suite job, and the
    job_repo_url host attribute is removed from all hosts used by the suite.

 """


 # Module-level default, computed once at import time by calling
 # tools.try_job_timeout_mins().  Presumably a timeout in minutes, going by
 # the name -- confirm against the tools module.
 DEFAULT_TRY_JOB_TIMEOUT_MINS = tools.try_job_timeout_mins()

 # Relevant CrosDynamicSuiteExceptions are defined in client/common_lib/error.py.

 class SuiteSpec(object):
     """
     This class contains the info that defines a suite run.

     Instances are built from the keyword arguments handed to
     reimage_and_run(); __init__ validates the required ones and stores
     normalized values as attributes.

     Currently required:
     @var build: the build to install, after translation by the devserver,
                 e.g. x86-alex-release/R18-1655.0.0-a1-b1584.
     @var board: which kind of devices to reimage, stored as the label
                 'board:<board>'.
     @var devserver: A dev_server.ImageServer instance created from the
                     devserver_url argument.
     @var name: a value of the SUITE control file variable to search for.
     @var job: an instance of client.common_lib.base_job representing the
                 currently running suite job.

     Currently supported optional fields:
     @var pool: the pool of machines to use for scheduling purposes, stored
                as the label 'pool:<pool>' when one was given.
                Default: None
     @var num: the maximum number of devices to reimage.
               Default in global_config
     @var check_hosts: require appropriate hosts to be available now.
     @var skip_reimage: always False.
     @var add_experimental: schedule experimental tests as well, or not.
                            Default: True
     @var dependencies: map of test names to dependency lists.
                        Initially {'': []}.
     @var suite_dependencies: A list of suite level dependencies, which act
                              just like test dependencies and are appended
                              to each test's set of dependencies at job
                              creation time.
     @var timeout_mins: the timeout_mins argument if it was given and
                        non-zero, otherwise timeout * 60.
     @var predicate: None, or a function mapping ControlData objects to True
                     if they should be included in suite.

     The remaining constructor arguments (file_bugs, file_experimental_bugs,
     max_runtime_mins, timeout, firmware_reimage, bug_template,
     version_prefix, priority, wait_for_results, job_retry, max_retries and
     offload_failures_only) are stored unchanged under the same names.
     """
     def __init__(self, build=None, board=None, name=None, job=None,
                  pool=None, num=None, check_hosts=True,
                  add_experimental=True, file_bugs=False,
                  file_experimental_bugs=False, max_runtime_mins=24*60,
                  timeout=24, timeout_mins=None, firmware_reimage=False,
                  suite_dependencies=None, version_prefix=None,
                  bug_template=None, devserver_url=None,
                  priority=priorities.Priority.DEFAULT, predicate=None,
                  wait_for_results=True, job_retry=False, max_retries=None,
                  offload_failures_only=False, **dargs):
         """
         Vets arguments for reimage_and_run() and populates self with supplied
         values.

         Currently required args:
         @param build: the build to install e.g.
                       x86-alex-release/R18-1655.0.0-a1-b1584.
         @param board: which kind of devices to reimage.
         @param name: a value of the SUITE control file variable to search for.
         @param job: an instance of client.common_lib.base_job representing the
                     currently running suite job.
         @param devserver_url: url to the selected devserver.

         Currently supported optional args:
         @param pool: specify the pool of machines to use for scheduling purposes
                      Default: None
         @param num: the maximum number of devices to reimage.
                     Default in global_config
         @param check_hosts: require appropriate hosts to be available now.
         @param add_experimental: schedule experimental tests as well, or not.
                                  Default: True
         @param file_bugs: File bugs when tests in this suite fail.
                           Default: False
         @param file_experimental_bugs: File bugs when experimental tests in
                                        this suite fail.
                                        Default: False
         @param max_runtime_mins: Max runtime in mins for each of the sub-jobs
                                  this suite will run.
         @param timeout: Max lifetime in hours for each of the sub-jobs that
                         this suite run.
         @param timeout_mins: Max lifetime in minutes for each of the sub-jobs.
                              When None (or 0), timeout * 60 is used instead.
         @param firmware_reimage: True if we should use FW_VERSION_PREFIX as
                                  the version_prefix.
                                  False if we should use CROS_VERSION_PREFIX as
                                  the version_prefix.
                                  (This flag has now been deprecated in favor of
                                   version_prefix.)
         @param suite_dependencies: A list of strings of suite level
                                    dependencies, which act just like test
                                    dependencies and are appended to each test's
                                    set of dependencies at job creation time.
                                    A string of comma separated labels is
                                    accepted for backwards compatibility.
                                    Default: None, meaning no suite level
                                    dependencies.
         @param bug_template: A template dictionary specifying the default bug
                              filing options for failures in this suite.
                              Default: None, meaning an empty template.
         @param version_prefix: A version prefix from provision.py that the
                                tests should be scheduled with.
         @param priority: Integer priority level.  Higher is more important.
         @param predicate: Optional argument. If present, should be a function
                           mapping ControlData objects to True if they should be
                           included in suite. If argument is absent, suite
                           behavior will default to creating a suite of based
                           on the SUITE field of control files.
         @param wait_for_results: Set to False to run the suite job without
                                  waiting for test jobs to finish. Default is
                                  True.
         @param job_retry: Set to True to enable job-level retry. Default is
                           False.
         @param max_retries: Maximum retry limit at suite level.
                             Regardless how many times each individual test
                             has been retried, the total number of retries
                             happening in the suite can't exceed _max_retries.
                             Default to None, no max.
         @param offload_failures_only: Only enable gs_offloading for failed
                                       jobs.
         @param **dargs: these arguments will be ignored.  This allows us to
                         deprecate and remove arguments in ToT while not
                         breaking branch builds.

         @raises SuiteArgumentException: if a required argument is missing,
                                         empty, or of the wrong type.
         """
         required_keywords = {'build': str,
                              'board': str,
                              'name': str,
                              'job': base_job.base_job,
                              'devserver_url': str}
         # Explicit mapping of the required arguments to the values received,
         # so validation does not depend on introspecting locals().
         supplied = {'build': build,
                     'board': board,
                     'name': name,
                     'job': job,
                     'devserver_url': devserver_url}
         for key, expected in required_keywords.items():
             value = supplied[key]
             if not value or not isinstance(value, expected):
                 raise error.SuiteArgumentException(
                     "reimage_and_run() needs %s=<%r>" % (key, expected))
         self.board = 'board:%s' % board
         self.devserver = dev_server.ImageServer(devserver_url)
         self.build = self.devserver.translate(build)
         self.name = name
         self.job = job
         if pool:
             self.pool = 'pool:%s' % pool
         else:
             self.pool = pool
         self.num = num
         self.check_hosts = check_hosts
         # There is no 'skip_reimage' argument (one passed by an old caller is
         # swallowed by **dargs), so the bare name would resolve to the
         # module-level skip_reimage() function and leave a truthy function
         # object here.  Store the value that function returns instead.
         self.skip_reimage = False
         self.add_experimental = add_experimental
         self.file_bugs = file_bugs
         self.file_experimental_bugs = file_experimental_bugs
         self.dependencies = {'': []}
         self.max_runtime_mins = max_runtime_mins
         self.timeout = timeout
         self.timeout_mins = timeout_mins or timeout * 60
         self.firmware_reimage = firmware_reimage
         if suite_dependencies is None:
             # A fresh list per instance: reimage_and_run() appends to this
             # attribute, so a shared default would leak one suite's
             # dependencies into the next.
             self.suite_dependencies = []
         elif isinstance(suite_dependencies, str):
             labels = [dep.strip(' ') for dep in suite_dependencies.split(',')]
             # ''.split(',') yields [''], so drop empty labels rather than
             # scheduling tests against a nameless dependency.
             self.suite_dependencies = [dep for dep in labels if dep]
         else:
             self.suite_dependencies = suite_dependencies
         # Same reasoning as above: never share one default dict between
         # instances.
         self.bug_template = {} if bug_template is None else bug_template
         self.version_prefix = version_prefix
         self.priority = priority
         self.predicate = predicate
         self.wait_for_results = wait_for_results
         self.job_retry = job_retry
         self.max_retries = max_retries
         self.offload_failures_only = offload_failures_only


 def skip_reimage(g):
     """
     Report whether reimaging should be skipped; the answer is always False.

     The argument is accepted but never consulted.

     @param g: The global variables dictionary (ignored).
     @return:  False, unconditionally.
     """
     return False


 def reimage_and_run(**dargs):
     """
     Backward-compatible API for dynamic_suite.

     Will re-image a number of devices (of the specified board) with the
     provided build, and then run the indicated test suite on them.
     Guaranteed to be compatible with any build from stable to dev.

     The arguments are vetted by building a SuiteSpec from them, the
     provisioning dependency (version_prefix joined with the build) is added
     to the suite dependencies, and the work is handed to
     _perform_reimage_and_run().

     @param dargs: Dictionary containing the arguments listed below.

     Currently required args:
     @param build: the build to install e.g.
                   x86-alex-release/R18-1655.0.0-a1-b1584.
     @param board: which kind of devices to reimage.
     @param name: a value of the SUITE control file variable to search for.
     @param job: an instance of client.common_lib.base_job representing the
                 currently running suite job.
     @param devserver_url: url to the selected devserver.

     Currently supported optional args:
     @param pool: specify the pool of machines to use for scheduling purposes.
                  Default: None
     @param num: the maximum number of devices to reimage.
                 Default in global_config
     @param check_hosts: require appropriate hosts to be available now.
     @param add_experimental: schedule experimental tests as well, or not.
                              Default: True
     @param file_bugs: automatically file bugs on test failures.
                       Default: False
     @param suite_dependencies: A list of strings, or a string with a comma
                                separated list, of suite level dependencies,
                                which act just like test dependencies and are
                                appended to each test's set of dependencies
                                at job creation time.
     @param version_prefix: A version prefix from provision.py that the tests
                            should be scheduled with.  Defaults to
                            provision.CROS_VERSION_PREFIX.
     @param firmware_reimage: Deprecated.  True forces version_prefix to
                              provision.FW_VERSION_PREFIX.
     @param predicate: Optional argument. If present, should be a function
                       mapping ControlData objects to True if they should be
                       included in suite. If argument is absent, suite
                       behavior will default to creating a suite of based
                       on the SUITE field of control files.
     @param job_retry: A bool value indicating whether jobs should be retried
                       on failure. If True, the field 'JOB_RETRIES' in control
                       files will be respected. If False, do not retry.
     @param max_retries: Maximum retry limit at suite level.
                         Regardless how many times each individual test
                         has been retried, the total number of retries
                         happening in the suite can't exceed _max_retries.
                         Default to None, no max.
     @param offload_failures_only: Only enable gs_offloading for failed jobs.
     @raises SuiteArgumentException: if a required argument is missing or of
                                     the wrong type.
     @raises AsynchronousBuildFailure: if there was an issue finishing staging
                                       from the devserver.
     @raises MalformedDependenciesException: if the dependency_info file for
                                             the required build fails to parse.
     """
     suite_spec = SuiteSpec(**dargs)

     # Horrible hacks to handle backwards compatibility, overall goal here is
     # firmware_reimage == True -> Firmware
     # firmware_reimage == False AND version_prefix == None -> OS
     # firmware_reimage == False AND version_prefix != None -> version_prefix
     # and once we've set version_prefix right, ignore that firmware_reimage
     # has ever existed...
     # Remove all this code and firmware_reimage once R31 falls off stable.
     if suite_spec.firmware_reimage:
         suite_spec.version_prefix = provision.FW_VERSION_PREFIX
         logging.warning("reimage_and_run |firmware_reimage=True| argument "
                 "has been deprecated. Please use "
                 "|version_prefix=provision.FW_VERSION_PREFIX| instead.")
     elif not suite_spec.version_prefix:
         suite_spec.version_prefix = provision.CROS_VERSION_PREFIX

     suite_spec.firmware_reimage = False
     # </backwards_compatibility_hacks>

     # version_prefix+build should make it into each test as a DEPENDENCY.  The
     # easiest way to do this is to tack it onto the suite_dependencies.
     if suite_spec.version_prefix:
         dependency = provision.join(suite_spec.version_prefix, suite_spec.build)
         # Rebind to a new list instead of appending in place: the existing
         # list may belong to the caller (or be a shared default), and must
         # not pick up this suite's build dependency.
         suite_spec.suite_dependencies = (
                 suite_spec.suite_dependencies + [dependency])

     afe = frontend_wrappers.RetryingAFE(timeout_min=30, delay_sec=10,
                                         user=suite_spec.job.user, debug=False)
     tko = frontend_wrappers.RetryingTKO(timeout_min=30, delay_sec=10,
                                         user=suite_spec.job.user, debug=False)

     # The suite job's own id becomes the parent id of every sub job; a tag
     # that does not yield an integer simply leaves the sub jobs parentless.
     try:
         my_job_id = int(tko_utils.get_afe_job_id(suite_spec.job.tag))
         logging.debug('Determined own job id: %d', my_job_id)
     except ValueError:
         my_job_id = None
         logging.warning('Could not determine own job id.')

     if suite_spec.predicate is None:
         predicate = Suite.name_in_tag_predicate(suite_spec.name)
     else:
         predicate = suite_spec.predicate

     _perform_reimage_and_run(suite_spec, afe, tko,
                              predicate, suite_job_id=my_job_id)

     logging.debug('Returning from dynamic_suite.reimage_and_run.')


 def _perform_reimage_and_run(spec, afe, tko, predicate, suite_job_id=None):
     """
     Stage the build's test artifacts, then schedule the suite and,
     if requested, wait for it.

     @param spec: a populated SuiteSpec object.
     @param afe: an instance of AFE as defined in server/frontend.py.
     @param tko: an instance of TKO as defined in server/frontend.py.
     @param predicate: A function mapping ControlData objects to True if they
                       should be included in the suite.
     @param suite_job_id: Job id that will act as parent id to all sub jobs.
                          Default: None
     @raises AsynchronousBuildFailure: if the devserver fails to stage the
                                       control files and test suites.
     """
     # Nothing can be scheduled before the devserver has the control files,
     # so staging comes first and a failure here is fatal for the suite.
     needed_artifacts = ['control_files', 'test_suites']
     try:
         spec.devserver.stage_artifacts(spec.build, needed_artifacts)
     except dev_server.DevServerException as staging_error:
         raise error.AsynchronousBuildFailure(staging_error)

     # Record when staging finished as a keyval in the suite job's results.
     finished_at = datetime.datetime.now().strftime(time_utils.TIME_FMT)
     staging_keyvals = {constants.ARTIFACT_FINISHED_TIME: finished_at}
     utils.write_keyval(spec.job.resultdir, staging_keyvals)

     suite = Suite.create_from_predicates(
         predicates=[predicate],
         name=spec.name,
         build=spec.build,
         board=spec.board,
         devserver=spec.devserver,
         afe=afe,
         tko=tko,
         pool=spec.pool,
         results_dir=spec.job.resultdir,
         max_runtime_mins=spec.max_runtime_mins,
         timeout_mins=spec.timeout_mins,
         file_bugs=spec.file_bugs,
         file_experimental_bugs=spec.file_experimental_bugs,
         suite_job_id=suite_job_id,
         extra_deps=spec.suite_dependencies,
         priority=spec.priority,
         wait_for_results=spec.wait_for_results,
         job_retry=spec.job_retry,
         max_retries=spec.max_retries,
         offload_failures_only=spec.offload_failures_only)

     # Scheduling is asynchronous; waiting is a separate, optional step.
     suite.schedule(spec.job.record_entry, spec.add_experimental)

     if not suite.wait_for_results:
         logging.info('wait_for_results is set to False, suite job will exit '
                      'without waiting for test jobs to finish.')
         return

     logging.debug('Waiting on suite.')
     suite.wait(spec.job.record_entry, spec.bug_template)
     logging.debug('Finished waiting on suite. '
                   'Returning from _perform_reimage_and_run.')