# Copyright 2013 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""
Test to generate the AFDO profile for a set of ChromeOS benchmarks.
This will run a pre-determined set of benchmarks on the DUT under
the monitoring of the linux "perf" tool. The resulting perf.data
file will then be copied to Google Storage (GS) where it can be
used by the AFDO optimized build.
Given that the telemetry benchmarks are quite unstable on ChromeOS at
this point, this test also supports a mode where the benchmarks are
executed outside of the telemetry framework. This is not the same as
executing the benchmarks under telemetry because no telemetry
measurements are taken, but for the purposes of profiling Chrome it
should be pretty close.
Example invocation:
/usr/bin/test_that --debug --board=chell <DUT IP>
--args="ignore_failures=True local=True gs_test_location=True"
telemetry_AFDOGenerate
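Another example, selecting the non-telemetry mode described above (board and
DUT IP are placeholders; see the minimal_telemetry option in _parse_args):
/usr/bin/test_that --debug --board=chell <DUT IP>
--args="minimal_telemetry=True local=True gs_test_location=True"
telemetry_AFDOGenerate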
"""
import bz2
from contextlib import contextmanager
from contextlib import ExitStack
import logging
import os
import time
from autotest_lib.client.common_lib import error
from autotest_lib.server import autotest
from autotest_lib.server import test
from autotest_lib.server import utils
from autotest_lib.server.cros import provision
from autotest_lib.server.cros import filesystem_util
from autotest_lib.server.cros import telemetry_runner
from autotest_lib.server.cros import telemetry_setup
from autotest_lib.site_utils import test_runner_utils
from typing import Optional
# These are arguments to the linux "perf" tool.
# The -e value is processor specific and comes from the Intel SDM vol 3b
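# For reference: r20c4 encodes BR_INST_RETIRED.NEAR_TAKEN (event 0xC4,
# umask 0x20), sampled every 150001 occurrences, with -b recording the
# branch stack (LBR) and -a sampling system-wide. The ARM command traces
# through the CoreSight ETM PMU (cs_etm, autofdo config, user space only)
# and -S captures the trace as snapshots triggered by SIGUSR2 (see
# _perf_on_dut below).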
INTEL_PROFILER_ARGS = 'record -a -e r20c4 -c 150001 -b'
ARM_PROFILER_ARGS = 'record -e cs_etm/autofdo/u -a -S'
ETM_STROBING_WINDOW = 1000
ETM_STROBING_PERIOD = 10000
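# Roughly speaking, strobing limits ETM tracing to a window out of each
# period so the trace (and the resulting perf.data) stays at a manageable
# size; _set_strobing() below writes these values into configfs.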
# In practice, it can take >2 min to copy the perf.data back from the DUT,
# so set this timeout to 600 secs to be safe.
WAIT_FOR_CMD_TIMEOUT_SECS = 600
# Time threshold of the test setup in seconds.
# The benchmarks running in this script typically take 60+ seconds.
# Setup time is expected to be 5-10 seconds + extra room.
# - If the test failed and its duration was below this limit, the benchmark
#   most likely never started.
# - If the test duration exceeded this limit, the benchmark likely ran but
#   failed in post-processing.
TEST_SETUP_DURATION_LIMIT = 30
RSA_KEY = '-i %s' % test_runner_utils.TEST_KEY_PATH
DUT_SCP_OPTIONS = ' '.join([
'-o StrictHostKeyChecking=no', '-o UserKnownHostsFile=/dev/null',
'-o BatchMode=yes', '-o ConnectTimeout=30',
'-o ServerAliveInterval=900', '-o ServerAliveCountMax=3',
'-o ConnectionAttempts=4', '-o Protocol=2'
])
DUT_CHROME_RESULTS_DIR = '/usr/local/telemetry/src/tools/perf'
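# As an illustration (host name and port are placeholders), _scp_perf_data()
# below ends up running a command along the lines of:
#   scp -o StrictHostKeyChecking=no ... -i <test key> -P 22 -v \
#       root@<dut>:/usr/local/telemetry/src/tools/perf/perf.data <profdir>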
_WAIT_CMD_TEMPLATE = """\
for _ in {1..%(timeout)d}; do \
ps %(pid)d >/dev/null || break; \
sleep 1; \
done; \
! ps %(pid)d >/dev/null \
"""
def _wait_for_process(host, pid, timeout=-1):
"""Waits for a process on the DUT to terminate.
@param host: A host object representing the DUT.
@param pid: The process ID (integer).
    @param timeout: Number of seconds to wait for the process to exit
                    (callers are expected to pass a positive value).
    @returns The exit status of the wait command: 0 if the process
             terminated within the timeout, non-zero otherwise.
    """
wait_cmd = _WAIT_CMD_TEMPLATE % {'pid': pid, 'timeout': timeout}
return host.run(wait_cmd, ignore_status=True).exit_status
# List of benchmarks to run to capture profile information. This is
# based on the "superhero" list and other telemetry benchmarks. Goal is
# to have a short list that is as representative as possible and takes a
# short time to execute. At this point the list of benchmarks is in flux.
TELEMETRY_AFDO_BENCHMARKS = (
{
'name':
'rendering.desktop',
'args': (
'--story-tag-filter=motionmark_fixed_2_seconds',
'--also-run-disabled-tests',
'--assert-gpu-compositing',
'--extra-browser-args=--disable-features=SkiaGraphite',
),
'archs': ('amd64', 'arm')
},
{
'name': 'system_health.common_desktop',
'args': ('--run-abridged-story-set', ),
'archs': ('amd64', 'arm')
},
{
'name': 'jetstream2',
'archs': ('amd64', 'arm')
},
{
'name': 'speedometer3',
'archs': ('amd64', 'arm')
},
)
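# Each entry above is consumed by run_once() roughly as follows (sketch):
#   if self._arch in benchmark_info['archs']:
#       self._run_test(tr, benchmark_info['name'],
#                      *benchmark_info.get('args', ()))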
# Supported <board>: <architecture>.
LLVM_BOARDS = {
'chell': 'amd64',
'hatch': 'amd64',
'trogdor': 'arm',
}
class telemetry_AFDOGenerate(test.test):
"""Telemetry tests wrapper to collect profiles for AFDO.
Run one or more telemetry benchmarks under the "perf" monitoring
tool, generate a "perf.data" file and upload to GS for comsumption
by the AFDO optimized build.
"""
version = 1
def _scp_perf_data(self, dut, host_dir):
"""Copy perf data from dut.
@param dut: The autotest host object representing DUT.
        @param host_dir: The directory on the host where the file is placed.
@returns status code for scp command.
"""
cmd = []
src = f'root@{dut.hostname}:{DUT_CHROME_RESULTS_DIR}/perf.data'
cmd.extend([
'scp', DUT_SCP_OPTIONS, RSA_KEY,
'-P %s' % str(dut.port) if dut.port else '', '-v', src,
host_dir
])
command = ' '.join(cmd)
logging.debug('Retrieving Perf Data: %s', command)
try:
result = utils.run(command, timeout=WAIT_FOR_CMD_TIMEOUT_SECS)
exit_code = result.exit_status
except Exception as e:
logging.error('Failed to retrieve results: %s', e)
raise
logging.debug('command return value: %d', exit_code)
return exit_code
@contextmanager
def _perf_on_dut(self):
"""Start and kill perf process on DUT."""
logging.info('Starting perf process in background.')
if self._is_arm():
profile_args = ARM_PROFILER_ARGS
perf_data = 'perf-etm.data'
else:
profile_args = INTEL_PROFILER_ARGS
perf_data = 'perf.data'
perf_cmd = (f'nohup perf {profile_args} '
f'-o {DUT_CHROME_RESULTS_DIR}/{perf_data}')
perf_pid = self._host.run_background(perf_cmd)
if self._is_arm():
            # Periodically (every 4 seconds) send SIGUSR2 to perf_pid to
            # trigger an ETM trace snapshot (perf runs with -S). The loop
            # exits automatically once the perf process terminates.
ping_cmd = f'while kill -USR2 {perf_pid} ; do sleep 4 ; done'
self._host.run_background(ping_cmd)
try:
# Use `kill -0` to check whether the perf process is alive
verify_cmd = f'kill -0 {perf_pid}'
if self._host.run(verify_cmd, ignore_status=True).exit_status != 0:
logging.error('Perf process not started correctly on DUT')
raise RuntimeError
logging.info('Perf PID: %s\nPerf command: %s', perf_pid, perf_cmd)
yield
finally:
            # If the perf process is still alive after the benchmark run,
            # stop it with SIGINT (-2).
kill_cmd = f'kill -0 {perf_pid} && killall -2 perf'
if self._host.run(kill_cmd, ignore_status=True).exit_status != 0:
logging.error('Perf process is not killed correctly on DUT.')
raise RuntimeError
# Perf process may not be terminated right after the kill command,
# wait until perf process finishes.
status = _wait_for_process(self._host, int(perf_pid),
WAIT_FOR_CMD_TIMEOUT_SECS)
if status != 0:
logging.error('Error waiting for perf process to be killed.')
raise RuntimeError
logging.info('Perf has been killed on DUT.')
if self._is_arm():
# Now we need to convert ETM data into Intel's LBR format
# which allows us to re-use the same AFDO pipeline.
perf_inject_cmd = ('perf inject --itrace=i1000il --strip '
f'-i {DUT_CHROME_RESULTS_DIR}/perf-etm.data '
f'-o {DUT_CHROME_RESULTS_DIR}/perf.data')
if self._host.run(perf_inject_cmd).exit_status != 0:
logging.error(
'Perf inject failed to convert ETM trace into LBR format.')
raise RuntimeError
status = self._scp_perf_data(self._host, self.profdir)
if status != 0:
logging.error('Cannot copy perf.data file to host.')
raise RuntimeError
@contextmanager
def _disable_cpuidle(self):
"""Disable CPU idle states in a context. See b/185490945."""
cpuidle_states = '/sys/devices/system/cpu/cpu*/cpuidle/state*/disable'
# Disable CPU Idle states to reduce ETM performance overhead.
disable_cmd = f'echo 1 | tee {cpuidle_states}'
if self._host.run(disable_cmd).exit_status != 0:
logging.error('Failed to disable CPU idle states before perf run.')
raise RuntimeError
try:
yield
finally:
# Re-enable CPU idle.
enable_cmd = f'echo 0 | tee {cpuidle_states}'
if self._host.run(enable_cmd).exit_status != 0:
logging.error(
'Failed to re-enable CPU idle states after perf run.')
raise RuntimeError
def _set_strobing(self, window, period):
"""Set ETM strobing settings."""
stat1 = self._host.run(
f'echo {window} > /sys/kernel/config/cs-syscfg/features/strobing/'
'params/window/value')
stat2 = self._host.run(
f'echo {period} > /sys/kernel/config/cs-syscfg/features/strobing/'
'params/period/value')
if stat1.exit_status != 0 or stat2.exit_status != 0:
logging.error(
'Failed to set up ETM strobing settings. '
                'Without strobing, perf profiles can grow ~100x in size.')
raise RuntimeError
# TODO(b/328620954): remove this once the deeper issue here is fixed.
# In particular, see comment #4 on the bug for some rationale/pointers.
def _inject_host_info_into_host(self, host):
"""Hack to inject info scraped from the device into `host`."""
# N.B., ignore_status defaults to False, so this is checked by default.
run_result = self._host.run('cat /etc/lsb-release')
stdout = run_result.stdout
logging.info("/etc/lsb-release contents:\n%s", stdout)
want_key = 'CHROMEOS_RELEASE_BUILDER_PATH'
want_key_eq = f'{want_key}='
builder_path_lines = [
x for x in stdout.splitlines() if x.startswith(want_key_eq)
]
if not builder_path_lines:
logging.info('No %s found; skip injection.', want_key)
return None
if len(builder_path_lines) > 1:
raise ValueError(
f'Want 1 {want_key} line in /etc/lsb-release; got '
f'{builder_path_lines}')
release_builder_path = builder_path_lines[0].split('=', 1)[1]
logging.info('Detected %s%s', want_key_eq, release_builder_path)
host_info = host.host_info_store.get()
host_info.set_version_label(provision.CROS_VERSION_PREFIX,
release_builder_path)
host.host_info_store.commit(host_info)
return release_builder_path
def run_once(self, host, args):
"""Run a set of telemetry benchmarks.
@param host: Host machine where test is run
@param args: A dictionary of the arguments that were passed
to this test.
@returns None.
"""
self._host = host
self._board = host.get_board().split(':')[1]
self._parse_args(args)
        # Make the rootfs writable up front. Otherwise the telemetry code
        # will try to remove write protection during run_benchmark, which
        # triggers a reboot and remount that we want to avoid.
filesystem_util.make_rootfs_writable(self._host)
builder_path = self._inject_host_info_into_host(host)
is_staging_run = (builder_path is not None
and builder_path.startswith("staging-"))
self._gs_staging_location = is_staging_run
setup_bucket_override = (telemetry_setup.STAGING_DEPS_BUCKET
if is_staging_run else None)
with ExitStack() as stack:
if self._is_arm():
self._set_strobing(ETM_STROBING_WINDOW, ETM_STROBING_PERIOD)
stack.enter_context(self._disable_cpuidle())
stack.enter_context(self._perf_on_dut())
if self._minimal_telemetry:
self._run_tests_minimal_telemetry()
else:
tr = stack.enter_context(
telemetry_runner.TelemetryRunnerFactory().get_runner(
self._host,
self._local,
telemetry_on_dut=False,
override_setup_gs_bucket=setup_bucket_override,
))
for benchmark_info in TELEMETRY_AFDO_BENCHMARKS:
if self._arch not in benchmark_info['archs']:
continue
                    benchmark = benchmark_info['name']
                    bench_args = benchmark_info.get('args', ())
                    self._run_test(tr, benchmark, *bench_args)
self._passed = True
def after_run_once(self):
"""After the profile information has been collected, compress it
        and upload it to GS.
"""
if not self._passed:
return
PERF_FILE = 'perf.data'
COMP_PERF_FILE = 'chromeos-chrome-{arch}-{ver}.perf.data'
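        # For illustration only: with arch 'amd64' and Chrome version
        # 100.0.4896.0 this yields chromeos-chrome-amd64-100.0.4896.0.perf.data,
        # which is uploaded compressed as
        # chromeos-chrome-amd64-100.0.4896.0.perf.data.bz2 along with a
        # chromeos-chrome-amd64-LATEST.perf.data.bz2 copy.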
perf_data = os.path.join(self.profdir, PERF_FILE)
comp_data = os.path.join(self.profdir, COMP_PERF_FILE.format(
arch=self._arch, ver=self._version))
compressed = self._compress_file(perf_data, comp_data)
self._gs_upload(compressed, os.path.basename(compressed))
        # Also create a copy of this file using "LATEST" as the version so
        # it can be found in case the builder is looking for a version
        # number that does not match. It is ok to use a slightly old
        # version of this file for the optimized build.
latest_data = COMP_PERF_FILE.format(arch=self._arch, ver='LATEST')
latest_compressed = self._get_compressed_name(latest_data)
self._gs_upload(compressed, latest_compressed)
        # Remove the local copies so they are not uploaded along with the logs.
os.remove(compressed)
os.remove(perf_data)
def _parse_args(self, args):
"""Parses and validates input arguments to this autotest.
@param args: Options->values dictionary.
@raises error.TestFail if a bad option is passed.
"""
# Set default values for the options.
# Architecture for which we are collecting afdo data
# is based on the board.
if self._board not in LLVM_BOARDS:
raise error.TestFail(
f'This test cannot be run on board {self._board}. '
f'Try one of {sorted(LLVM_BOARDS)}')
self._arch = LLVM_BOARDS[self._board]
# By default, write to production storage. You can upload to a
# more-accessible (and more temporary) location by passing
# 'gs_test_location=True'.
self._gs_test_location = False
# Ignore individual test failures unless they failed to start.
self._ignore_failures = True
# Use local copy of telemetry instead of using the dev server copy.
self._local = False
# Chrome version to which the AFDO data corresponds.
self._version, _ = self._host.get_chrome_version()
        # Use only the minimal support from Telemetry (see
        # _run_tests_minimal_telemetry). Off by default; enable with
        # 'minimal_telemetry=True' if the Telemetry benchmarks prove too flaky.
self._minimal_telemetry = False
        # Set to True when the telemetry tests pass.
self._passed = False
# Set to True if this should upload to a staging bucket instead of the
# production one.
self._gs_staging_location = False
        # Arguments supplied by the test infrastructure (servo and other
        # harness options) that this test ignores.
ignored_options = (
'buildartifactsurl',
'cache_endpoint',
'dut_servers',
'libs_server',
'servo_host',
'servo_port',
)
unknown_options = []
for option_name, value in args.items():
if option_name == 'arch':
# Verify board: arch.
if self._arch != value:
raise error.TestFail(
'Mismatch of the board and architecture: '
f'board: {self._board}, arch: {value}. '
f'Did you mean "arch={self._arch}"?')
elif option_name == 'gs_test_location':
self._gs_test_location = (value == 'True')
elif option_name == 'ignore_failures':
self._ignore_failures = (value == 'True')
elif option_name == 'local':
self._local = (value == 'True')
elif option_name == 'minimal_telemetry':
self._minimal_telemetry = (value == 'True')
elif option_name == 'version':
self._version = value
elif option_name not in ignored_options:
unknown_options.append(option_name)
if unknown_options:
raise error.TestFail(f'Unknown options passed: {unknown_options}')
def _is_arm(self):
"""Return true if arch is arm."""
return self._arch == 'arm'
def _check_status(
self,
benchmark: str,
ex: Optional[error.TestBaseException],
status: str,
duration: float,
) -> Optional[error.TestBaseException]:
"""Convert benchmark status into the afdo generate exception or None.
Checks the test status based on return status, duration and
'ignore_failures option. Log any errors including suppressed.
@param benchmark: Name.
@param status: Result status from telemetry_runner.
@param ex: None, if no exception otherwise TestBaseException.
@param duration: Test duration.
@return error.TestFail or None
"""
cant_ignore_format_message = (
'"%s" failed after %ds.\n'
'Can\'t ignore the failure with "ignore_failures" option set'
' because the benchmark failed to start.\n'
'Fix setup or disable the benchmark if the problem persists.\n'
)
if ex:
logging.warning(
'Got exception from Telemetry benchmark %s '
'after %f seconds. Exception: %s', benchmark, duration, ex)
if not self._ignore_failures:
return ex
# 'ignore_failures' option is set.
if duration >= TEST_SETUP_DURATION_LIMIT:
# Ignore failures.
logging.info('Ignoring failure from benchmark %s.', benchmark)
return None
# Premature failure, can't ignore.
logging.error(cant_ignore_format_message, benchmark, duration)
return ex
else:
logging.info(
'Completed Telemetry benchmark %s in %f seconds with status: %s',
benchmark,
duration,
status,
)
if status == telemetry_runner.SUCCESS_STATUS:
return None
err = error.TestFail(
f'An error occurred while executing benchmark: {benchmark}'
)
if not self._ignore_failures:
return err
# 'ignore_failures' option is set.
if duration >= TEST_SETUP_DURATION_LIMIT:
# Benchmark was launched, failed and ignored.
logging.info(
"Ignoring failure with status '%s' returned by %s"
" (due to set 'ignore_failures' option)",
status,
benchmark,
)
return None
# Failed prematurely.
logging.error(cant_ignore_format_message, benchmark, duration)
return err
def _run_test(self, tr, benchmark, *args):
"""Run the benchmark using Telemetry.
@param tr: Instance of the TelemetryRunner subclass.
@param benchmark: Name of the benchmark to run.
@param args: Additional arguments to pass to the telemetry execution
script.
        @raises error.TestBaseException if
        - execution of the test failed and "ignore_failures" is not set;
        - "ignore_failures" is set but the test duration is below
          TEST_SETUP_DURATION_LIMIT, meaning that the test wasn't started.
"""
logging.info('Starting Telemetry benchmark %s', benchmark)
start_time = time.time()
try:
result = tr.run_telemetry_benchmark(benchmark, None, *args)
duration = time.time() - start_time
err = self._check_status(benchmark, None, result.status, duration)
if err:
raise err
except error.TestBaseException as e:
duration = time.time() - start_time
err = self._check_status(benchmark, e,
telemetry_runner.FAILED_STATUS, duration)
if err:
raise err
def _run_tests_minimal_telemetry(self):
"""Run the benchmarks using the minimal support from Telemetry.
The benchmarks are run using a client side autotest test. This test
will control Chrome directly using the chrome.Chrome support and it
will ask Chrome to display the benchmark pages directly instead of
using the "page sets" and "measurements" support from Telemetry.
In this way we avoid using Telemetry benchmark support which is not
stable on ChromeOS yet.
"""
AFDO_GENERATE_CLIENT_TEST = 'telemetry_AFDOGenerateClient'
# Execute the client side test.
client_at = autotest.Autotest(self._host)
client_at.run_test(AFDO_GENERATE_CLIENT_TEST, args='')
@staticmethod
def _get_compressed_name(name):
"""Given a file name, return bz2 compressed name.
@param name: Name of uncompressed file.
@returns name of compressed file.
"""
return name + '.bz2'
@staticmethod
def _compress_file(unc_file, com_file):
"""Compresses specified file with bz2.
@param unc_file: name of file to compress.
        @param com_file: base name of the compressed file ('.bz2' is appended).
        @raises error.TestFail if compression failed.
@returns Name of compressed file.
"""
dest = ''
with open(unc_file, 'rb') as inp:
dest = telemetry_AFDOGenerate._get_compressed_name(com_file)
with bz2.BZ2File(dest, 'wb') as out:
for data in inp:
out.write(data)
if not dest or not os.path.isfile(dest):
raise error.TestFail(f'Could not compress {unc_file}')
return dest
def _gs_upload(self, local_file, remote_basename):
"""Uploads file to google storage specific location.
@param local_file: name of file to upload.
@param remote_basename: basename of remote file.
@raises error.TestFail if upload failed.
@returns nothing.
"""
GS_LLVM_DEST = ('gs://chromeos-toolchain-artifacts/afdo/unvetted/'
f'benchmark/{remote_basename}')
GS_STAGING_DEST = ('gs://staging-chromeos-toolchain-artifacts/afdo/'
f'unvetted/benchmark/{remote_basename}')
GS_TEST_DEST = ('gs://chromeos-throw-away-bucket/afdo-job/canonicals/'
f'{remote_basename}')
GS_ACL = 'project-private'
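        # As an illustration, a production (LLVM_BOARDS) upload for amd64
        # lands at gs://chromeos-toolchain-artifacts/afdo/unvetted/benchmark/
        # chromeos-chrome-amd64-<version>.perf.data.bz2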
if self._gs_test_location:
remote_file = GS_TEST_DEST
elif self._gs_staging_location:
remote_file = GS_STAGING_DEST
elif self._board in LLVM_BOARDS:
remote_file = GS_LLVM_DEST
else:
raise error.TestFail(
f'This test cannot be run on board {self._board}')
logging.info('About to upload to GS: %s', remote_file)
if not utils.gs_upload(
local_file, remote_file, GS_ACL, result_dir=self.resultsdir):
logging.info('Failed upload to GS: %s', remote_file)
raise error.TestFail(
f'Unable to gs upload {local_file} to {remote_file}')
        logging.info('Successful upload to GS: %s', remote_file)