server/autoserv - mirrors/cros/chromiumos/third_party/autotest - Git at Google

 #!/usr/bin/python -u
 # Copyright 2007-2008 Martin J. Bligh <mbligh@google.com>, Google Inc.
 # Released under the GPL v2

 """
 Run a control file through the server side engine
 """

 import ast
 import datetime
 import getpass
 import logging
 import os
 import re
 import signal
 import socket
 import sys
 import traceback
 import time
 import urllib2

 import common

 from autotest_lib.client.common_lib import control_data
 from autotest_lib.client.common_lib import global_config
 from autotest_lib.client.common_lib import utils
 from autotest_lib.client.common_lib.cros.graphite import autotest_es
 from autotest_lib.client.common_lib.cros.graphite import autotest_stats
 try:
     from autotest_lib.puppylab import results_mocker
 except ImportError:
     results_mocker = None

 require_atfork = global_config.global_config.get_config_value(
         'AUTOSERV', 'require_atfork_module', type=bool, default=True)


 # Number of seconds to wait before returning if testing mode is enabled
 TESTING_MODE_SLEEP_SECS = 1

 try:
     import atfork
     atfork.monkeypatch_os_fork_functions()
     import atfork.stdlib_fixer
     # Fix the Python standard library for threading+fork safety with its
     # internal locks.  http://code.google.com/p/python-atfork/
     import warnings
     warnings.filterwarnings('ignore', 'logging module already imported')
     atfork.stdlib_fixer.fix_logging_module()
 except ImportError, e:
     from autotest_lib.client.common_lib import global_config
     if global_config.global_config.get_config_value(
             'AUTOSERV', 'require_atfork_module', type=bool, default=False):
         print >>sys.stderr, 'Please run utils/build_externals.py'
         print e
         sys.exit(1)

 from autotest_lib.server import frontend
 from autotest_lib.server import server_logging_config
 from autotest_lib.server import server_job, utils, autoserv_parser, autotest
 from autotest_lib.server import utils as server_utils
 from autotest_lib.site_utils import job_directories
 from autotest_lib.site_utils import job_overhead
 from autotest_lib.site_utils import lxc
 from autotest_lib.site_utils import lxc_utils
 from autotest_lib.client.common_lib import pidfile, logging_manager
 from autotest_lib.client.common_lib.cros.graphite import autotest_stats

 # Control segment to stage server-side package.
 STAGE_SERVER_SIDE_PACKAGE_CONTROL_FILE = server_job._control_segment_path(
         'stage_server_side_package')

 def log_alarm(signum, frame):
     logging.error("Received SIGALARM. Ignoring and continuing on.")
     sys.exit(1)


 def _get_machines(parser):
     """Get a list of machine names from command line arg -m or a file.

     @param parser: Parser for the command line arguments.

     @return: A list of machine names from command line arg -m or the
              machines file specified in the command line arg -M.
     """
     if parser.options.machines:
         machines = parser.options.machines.replace(',', ' ').strip().split()
     else:
         machines = []
     machines_file = parser.options.machines_file
     if machines_file:
         machines = []
         for m in open(machines_file, 'r').readlines():
             # remove comments, spaces
             m = re.sub('#.*', '', m).strip()
             if m:
                 machines.append(m)
         logging.debug('Read list of machines from file: %s', machines_file)
         logging.debug('Machines: %s', ','.join(machines))

     if machines:
         for machine in machines:
             if not machine or re.search('\s', machine):
                 parser.parser.error("Invalid machine: %s" % str(machine))
         machines = list(set(machines))
         machines.sort()
     return machines


 def _stage_ssp(parser):
     """Stage server-side package.

     This function calls a control segment to stage server-side package based on
     the job and autoserv command line option. The detail implementation could
     be different for each host type. Currently, only CrosHost has
     stage_server_side_package function defined.
     The script returns None if no server-side package is available. However,
     it may raise exception if it failed for reasons other than artifact (the
     server-side package) not found.

     @param parser: Command line arguments parser passed in the autoserv process.

     @return: url of the staged server-side package. Return None if server-
              side package is not found for the build.
     """
     # If test_source_build is not specified, default to use server-side test
     # code from build specified in --image.
     namespace = {'machines': _get_machines(parser),
                  'image': (parser.options.test_source_build or
                            parser.options.image),}
     script_locals = {}
     execfile(STAGE_SERVER_SIDE_PACKAGE_CONTROL_FILE, namespace, script_locals)
     return script_locals['ssp_url']


 def _run_with_ssp(container_name, job_id, results, parser, ssp_url):
     """Run the server job with server-side packaging.

     @param container_name: Name of the container to run the test.
     @param job_id: ID of the test job.
     @param results: Folder to store results. This could be different from
                     parser.options.results:
                     parser.options.results  can be set to None for results to be
                     stored in a temp folder.
                     results can be None for autoserv run requires no logging.
     @param parser: Command line parser that contains the options.
     @param ssp_url: url of the staged server-side package.
     """
     bucket = lxc.ContainerBucket()
     control = (parser.args[0] if len(parser.args) > 0 and parser.args[0] != ''
                else None)
     test_container = bucket.setup_test(container_name, job_id, ssp_url, results,
                                        control=control)
     args = sys.argv[:]
     args.remove('--require-ssp')
     # --parent_job_id is only useful in autoserv running in host, not in
     # container. Include this argument will cause test to fail for builds before
     # CL 286265 was merged.
     if '--parent_job_id' in args:
         index = args.index('--parent_job_id')
         args.remove('--parent_job_id')
         # Remove the actual parent job id in command line arg.
         del args[index]

     # A dictionary of paths to replace in the command line. Key is the path to
     # be replaced with the one in value.
     paths_to_replace = {}
     # Replace the control file path with the one in container.
     if control:
         container_control_filename = os.path.join(
                 lxc.CONTROL_TEMP_PATH, os.path.basename(control))
         paths_to_replace[control] = container_control_filename
     # Update result directory with the one in container.
     if parser.options.results:
         container_result_dir = os.path.join(lxc.RESULT_DIR_FMT % job_id)
         paths_to_replace[parser.options.results] = container_result_dir
     # Update parse_job directory with the one in container. The assumption is
     # that the result folder to be parsed is always the same as the results_dir.
     if parser.options.parse_job:
         container_parse_dir = os.path.join(lxc.RESULT_DIR_FMT % job_id)
         paths_to_replace[parser.options.parse_job] = container_result_dir

     args = [paths_to_replace.get(arg, arg) for arg in args]

     # Apply --use-existing-results, results directory is aready created and
     # mounted in container. Apply this arg to avoid exception being raised.
     if not '--use-existing-results' in args:
         args.append('--use-existing-results')

     # Make sure autoserv running in container using a different pid file.
     if not '--pidfile-label' in args:
         args.extend(['--pidfile-label', 'container_autoserv'])

     cmd_line = ' '.join(["'%s'" % arg if ' ' in arg else arg for arg in args])
     logging.info('Run command in container: %s', cmd_line)
     success = False
     try:
         test_container.attach_run(cmd_line)
         success = True
     finally:
         counter_key = '%s.%s' % (lxc.STATS_KEY,
                                  'success' if success else 'fail')
         autotest_stats.Counter(counter_key).increment()
         # metadata is uploaded separately so it can use http to upload.
         metadata = {'drone': socket.gethostname(),
                     'job_id': job_id,
                     'success': success}
         autotest_es.post(use_http=True,
                          type_str=lxc.CONTAINER_RUN_TEST_METADB_TYPE,
                          metadata=metadata)
         test_container.destroy()


 def correct_results_folder_permission(results):
     """Make sure the results folder has the right permission settings.

     For tests running with server-side packaging, the results folder has the
     owner of root. This must be changed to the user running the autoserv
     process, so parsing job can access the results folder.
     TODO(dshi): crbug.com/459344 Remove this function when test container can be
     unprivileged container.

     @param results: Path to the results folder.

     """
     if not results:
         return

     utils.run('sudo -n chown -R %s "%s"' % (os.getuid(), results))
     utils.run('sudo -n chgrp -R %s "%s"' % (os.getgid(), results))


 def run_autoserv(pid_file_manager, results, parser, ssp_url, use_ssp):
     """Run server job with given options.

     @param pid_file_manager: PidFileManager used to monitor the autoserv process
     @param results: Folder to store results.
     @param parser: Parser for the command line arguments.
     @param ssp_url: Url to server-side package.
     @param use_ssp: Set to True to run with server-side packaging.
     """
     if parser.options.warn_no_ssp:
         # Post a warning in the log.
         logging.warn('Autoserv is required to run with server-side packaging. '
                      'However, no drone is found to support server-side '
                      'packaging. The test will be executed in a drone without '
                      'server-side packaging supported.')

     # send stdin to /dev/null
     dev_null = os.open(os.devnull, os.O_RDONLY)
     os.dup2(dev_null, sys.stdin.fileno())
     os.close(dev_null)

     # Create separate process group
     os.setpgrp()

     # Container name is predefined so the container can be destroyed in
     # handle_sigterm.
     job_or_task_id = job_directories.get_job_id_or_task_id(
             parser.options.results)
     container_name = (lxc.TEST_CONTAINER_NAME_FMT %
                       (job_or_task_id, time.time(), os.getpid()))

     # Implement SIGTERM handler
     def handle_sigterm(signum, frame):
         logging.debug('Received SIGTERM')
         if pid_file_manager:
             pid_file_manager.close_file(1, signal.SIGTERM)
         logging.debug('Finished writing to pid_file. Killing process.')

         # Update results folder's file permission. This needs to be done ASAP
         # before the parsing process tries to access the log.
         if use_ssp and results:
             correct_results_folder_permission(results)

         # TODO (sbasi) - remove the time.sleep when crbug.com/302815 is solved.
         # This sleep allows the pending output to be logged before the kill
         # signal is sent.
         time.sleep(.1)
         if use_ssp:
             logging.debug('Destroy container %s before aborting the autoserv '
                           'process.', container_name)
             metadata = {'drone': socket.gethostname(),
                         'job_id': job_or_task_id,
                         'container_name': container_name,
                         'action': 'abort',
                         'success': True}
             try:
                 bucket = lxc.ContainerBucket()
                 container = bucket.get(container_name)
                 if container:
                     container.destroy()
                 else:
                     metadata['success'] = False
                     metadata['error'] = 'container not found'
                     logging.debug('Container %s is not found.', container_name)
             except:
                 metadata['success'] = False
                 metadata['error'] = 'Exception: %s' % sys.exc_info()
                 # Handle any exception so the autoserv process can be aborted.
                 logging.error('Failed to destroy container %s. Error: %s',
                               container_name, sys.exc_info())
             autotest_es.post(use_http=True,
                              type_str=lxc.CONTAINER_RUN_TEST_METADB_TYPE,
                              metadata=metadata)
             # Try to correct the result file permission again after the
             # container is destroyed, as the container might have created some
             # new files in the result folder.
             if results:
                 correct_results_folder_permission(results)

         os.killpg(os.getpgrp(), signal.SIGKILL)

     # Set signal handler
     signal.signal(signal.SIGTERM, handle_sigterm)

     # faulthandler is only needed to debug in the Lab and is not avaliable to
     # be imported in the chroot as part of VMTest, so Try-Except it.
     try:
         import faulthandler
         faulthandler.register(signal.SIGTERM, all_threads=True, chain=True)
         logging.debug('faulthandler registered on SIGTERM.')
     except ImportError:
         sys.exc_clear()

     # Ignore SIGTTOU's generated by output from forked children.
     signal.signal(signal.SIGTTOU, signal.SIG_IGN)

     # If we received a SIGALARM, let's be loud about it.
     signal.signal(signal.SIGALRM, log_alarm)

     # Server side tests that call shell scripts often depend on $USER being set
     # but depending on how you launch your autotest scheduler it may not be set.
     os.environ['USER'] = getpass.getuser()

     label = parser.options.label
     group_name = parser.options.group_name
     user = parser.options.user
     client = parser.options.client
     server = parser.options.server
     install_before = parser.options.install_before
     install_after = parser.options.install_after
     verify = parser.options.verify
     repair = parser.options.repair
     cleanup = parser.options.cleanup
     provision = parser.options.provision
     reset = parser.options.reset
     job_labels = parser.options.job_labels
     no_tee = parser.options.no_tee
     parse_job = parser.options.parse_job
     execution_tag = parser.options.execution_tag
     if not execution_tag:
         execution_tag = parse_job
     host_protection = parser.options.host_protection
     ssh_user = parser.options.ssh_user
     ssh_port = parser.options.ssh_port
     ssh_pass = parser.options.ssh_pass
     collect_crashinfo = parser.options.collect_crashinfo
     control_filename = parser.options.control_filename
     test_retry = parser.options.test_retry
     verify_job_repo_url = parser.options.verify_job_repo_url
     skip_crash_collection = parser.options.skip_crash_collection
     ssh_verbosity = int(parser.options.ssh_verbosity)
     ssh_options = parser.options.ssh_options
     no_use_packaging = parser.options.no_use_packaging

     # can't be both a client and a server side test
     if client and server:
         parser.parser.error("Can not specify a test as both server and client!")

     if provision and client:
         parser.parser.error("Cannot specify provisioning and client!")

     is_special_task = (verify or repair or cleanup or collect_crashinfo or
                        provision or reset)
     if len(parser.args) < 1 and not is_special_task:
         parser.parser.error("Missing argument: control file")

     if ssh_verbosity > 0:
         # ssh_verbosity is an integer between 0 and 3, inclusive
         ssh_verbosity_flag = '-' + 'v' * ssh_verbosity
     else:
         ssh_verbosity_flag = ''

     # We have a control file unless it's just a verify/repair/cleanup job
     if len(parser.args) > 0:
         control = parser.args[0]
     else:
         control = None

     machines = _get_machines(parser)
     if group_name and len(machines) < 2:
         parser.parser.error('-G %r may only be supplied with more than one '
                             'machine.' % group_name)

     kwargs = {'group_name': group_name, 'tag': execution_tag,
               'disable_sysinfo': parser.options.disable_sysinfo}
     if parser.options.parent_job_id:
         kwargs['parent_job_id'] = int(parser.options.parent_job_id)
     if control_filename:
         kwargs['control_filename'] = control_filename
     job = server_job.server_job(control, parser.args[1:], results, label,
                                 user, machines, client, parse_job,
                                 ssh_user, ssh_port, ssh_pass,
                                 ssh_verbosity_flag, ssh_options,
                                 test_retry, **kwargs)

     job.logging.start_logging()
     job.init_parser()

     # perform checks
     job.precheck()

     # run the job
     exit_code = 0
     try:
         try:
             if repair:
                 job.repair(host_protection, job_labels)
             elif verify:
                 job.verify(job_labels)
             elif provision:
                 job.provision(job_labels)
             elif reset:
                 job.reset(job_labels)
             elif cleanup:
                 job.cleanup(job_labels)
             else:
                 if use_ssp:
                     try:
                         _run_with_ssp(container_name, job_or_task_id, results,
                                       parser, ssp_url)
                     finally:
                         # Update the ownership of files in result folder.
                         correct_results_folder_permission(results)
                 else:
                     job.run(install_before, install_after,
                             verify_job_repo_url=verify_job_repo_url,
                             only_collect_crashinfo=collect_crashinfo,
                             skip_crash_collection=skip_crash_collection,
                             job_labels=job_labels,
                             use_packaging=(not no_use_packaging))
         finally:
             while job.hosts:
                 host = job.hosts.pop()
                 host.close()
     except:
         exit_code = 1
         traceback.print_exc()

     if pid_file_manager:
         pid_file_manager.num_tests_failed = job.num_tests_failed
         pid_file_manager.close_file(exit_code)
     job.cleanup_parser()

     sys.exit(exit_code)


 def record_autoserv(options, duration_secs):
     """Record autoserv end-to-end time in metadata db.

     @param options: parser options.
     @param duration_secs: How long autoserv has taken, in secs.
     """
     # Get machine hostname
     machines = options.machines.replace(
             ',', ' ').strip().split() if options.machines else []
     num_machines = len(machines)
     if num_machines > 1:
         # Skip the case where atomic group is used.
         return
     elif num_machines == 0:
         machines.append('hostless')

     # Determine the status that will be reported.
     s = job_overhead.STATUS
     task_mapping = {
             'reset': s.RESETTING, 'verify': s.VERIFYING,
             'provision': s.PROVISIONING, 'repair': s.REPAIRING,
             'cleanup': s.CLEANING, 'collect_crashinfo': s.GATHERING}
     match = filter(lambda task: getattr(options, task, False) == True,
                    task_mapping)
     status = task_mapping[match[0]] if match else s.RUNNING
     is_special_task = status not in [s.RUNNING, s.GATHERING]
     job_or_task_id = job_directories.get_job_id_or_task_id(options.results)
     job_overhead.record_state_duration(
             job_or_task_id, machines[0], status, duration_secs,
             is_special_task=is_special_task)


 def main():
     start_time = datetime.datetime.now()
     # White list of tests with run time measurement enabled.
     measure_run_time_tests_names = global_config.global_config.get_config_value(
                         'AUTOSERV', 'measure_run_time_tests', type=str)
     if measure_run_time_tests_names:
         measure_run_time_tests = [t.strip() for t in
                                   measure_run_time_tests_names.split(',')]
     else:
         measure_run_time_tests = []
     # grab the parser
     parser = autoserv_parser.autoserv_parser
     parser.parse_args()

     if len(sys.argv) == 1:
         parser.parser.print_help()
         sys.exit(1)

     # If the job requires to run with server-side package, try to stage server-
     # side package first. If that fails with error that autotest server package
     # does not exist, fall back to run the job without using server-side
     # packaging. If option warn_no_ssp is specified, that means autoserv is
     # running in a drone does not support SSP, thus no need to stage server-side
     # package.
     ssp_url = None
     ssp_url_warning = False
     if (not parser.options.warn_no_ssp and parser.options.require_ssp):
         ssp_url = _stage_ssp(parser)
         # The build does not have autotest server package. Fall back to not
         # to use server-side package. Logging is postponed until logging being
         # set up.
         ssp_url_warning = not ssp_url

     if parser.options.no_logging:
         results = None
     else:
         results = parser.options.results
         if not results:
             results = 'results.' + time.strftime('%Y-%m-%d-%H.%M.%S')
         results  = os.path.abspath(results)
         resultdir_exists = False
         for filename in ('control.srv', 'status.log', '.autoserv_execute'):
             if os.path.exists(os.path.join(results, filename)):
                 resultdir_exists = True
         if not parser.options.use_existing_results and resultdir_exists:
             error = "Error: results directory already exists: %s\n" % results
             sys.stderr.write(error)
             sys.exit(1)

         # Now that we certified that there's no leftover results dir from
         # previous jobs, lets create the result dir since the logging system
         # needs to create the log file in there.
         if not os.path.isdir(results):
             os.makedirs(results)

     # Server-side packaging will only be used if it's required and the package
     # is available. If warn_no_ssp is specified, it means that autoserv is
     # running in a drone does not have SSP supported and a warning will be logs.
     # Therefore, it should not run with SSP.
     use_ssp = (not parser.options.warn_no_ssp and parser.options.require_ssp
                and ssp_url)
     if use_ssp:
         log_dir = os.path.join(results, 'ssp_logs') if results else None
         if log_dir and not os.path.exists(log_dir):
             os.makedirs(log_dir)
     else:
         log_dir = results

     logging_manager.configure_logging(
             server_logging_config.ServerLoggingConfig(),
             results_dir=log_dir,
             use_console=not parser.options.no_tee,
             verbose=parser.options.verbose,
             no_console_prefix=parser.options.no_console_prefix)

     if ssp_url_warning:
         logging.warn(
                 'Autoserv is required to run with server-side packaging. '
                 'However, no server-side package can be found based on '
                 '`--image`, host attribute job_repo_url or host label of '
                 'cros-version. The test will be executed without '
                 'server-side packaging supported.')

     if results:
         logging.info("Results placed in %s" % results)

         # wait until now to perform this check, so it get properly logged
         if (parser.options.use_existing_results and not resultdir_exists and
             not utils.is_in_container()):
             logging.error("No existing results directory found: %s", results)
             sys.exit(1)

     logging.debug('autoserv is running in drone %s.', socket.gethostname())
     logging.debug('autoserv command was: %s', ' '.join(sys.argv))

     if parser.options.write_pidfile and results:
         pid_file_manager = pidfile.PidFileManager(parser.options.pidfile_label,
                                                   results)
         pid_file_manager.open_file()
     else:
         pid_file_manager = None

     autotest.BaseAutotest.set_install_in_tmpdir(
         parser.options.install_in_tmpdir)

     timer = None
     try:
         # Take the first argument as control file name, get the test name from
         # the control file. If the test name exists in the list of tests with
         # run time measurement enabled, start a timer to begin measurement.
         if (len(parser.args) > 0 and parser.args[0] != '' and
             parser.options.machines):
             try:
                 test_name = control_data.parse_control(parser.args[0],
                                                        raise_warnings=True).name
             except control_data.ControlVariableException:
                 logging.debug('Failed to retrieve test name from control file.')
                 test_name = None
             if test_name in measure_run_time_tests:
                 machines = parser.options.machines.replace(',', ' '
                                                            ).strip().split()
                 try:
                     afe = frontend.AFE()
                     board = server_utils.get_board_from_afe(machines[0], afe)
                     timer = autotest_stats.Timer('autoserv_run_time.%s.%s' %
                                                  (board, test_name))
                     timer.start()
                 except (urllib2.HTTPError, urllib2.URLError):
                     # Ignore error if RPC failed to get board
                     pass
     except control_data.ControlVariableException as e:
         logging.error(str(e))
     exit_code = 0
     # TODO(beeps): Extend this to cover different failure modes.
     # Testing exceptions are matched against labels sent to autoserv. Eg,
     # to allow only the hostless job to run, specify
     # testing_exceptions: test_suite in the shadow_config. To allow both
     # the hostless job and dummy_Pass to run, specify
     # testing_exceptions: test_suite,dummy_Pass. You can figure out
     # what label autoserv is invoked with by looking through the logs of a test
     # for the autoserv command's -l option.
     testing_exceptions = global_config.global_config.get_config_value(
             'AUTOSERV', 'testing_exceptions', type=list, default=[])
     test_mode = global_config.global_config.get_config_value(
             'AUTOSERV', 'testing_mode', type=bool, default=False)
     test_mode = (results_mocker and test_mode and not
                  any([ex in parser.options.label
                       for ex in testing_exceptions]))
     is_task = (parser.options.verify or parser.options.repair or
                parser.options.provision or parser.options.reset or
                parser.options.cleanup or parser.options.collect_crashinfo)
     try:
         try:
             if test_mode:
                 # The parser doesn't run on tasks anyway, so we can just return
                 # happy signals without faking results.
                 if not is_task:
                     machine = parser.options.results.split('/')[-1]

                     # TODO(beeps): The proper way to do this would be to
                     # refactor job creation so we can invoke job.record
                     # directly. To do that one needs to pipe the test_name
                     # through run_autoserv and bail just before invoking
                     # the server job. See the comment in
                     # puppylab/results_mocker for more context.
                     results_mocker.ResultsMocker(
                             test_name if test_name else 'unknown-test',
                             parser.options.results, machine
                             ).mock_results()
                 return
             else:
                 run_autoserv(pid_file_manager, results, parser, ssp_url,
                              use_ssp)
         except SystemExit as e:
             exit_code = e.code
             if exit_code:
                 logging.exception(e)
         except Exception as e:
             # If we don't know what happened, we'll classify it as
             # an 'abort' and return 1.
             logging.exception(e)
             exit_code = 1
     finally:
         if pid_file_manager:
             pid_file_manager.close_file(exit_code)
         if timer:
             timer.stop()
         # Record the autoserv duration time. Must be called
         # just before the system exits to ensure accuracy.
         duration_secs = (datetime.datetime.now() - start_time).total_seconds()
         record_autoserv(parser.options, duration_secs)
     sys.exit(exit_code)


 if __name__ == '__main__':
     main()