blob: 0d9d8dc01a257e8593f84a9e3b071a7d61f4ee42 [file] [log] [blame]
#!/usr/bin/python2
#
# Copyright (c) 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import logging
import os
import json
import math
import re
from autotest_lib.server import test
from autotest_lib.server.cros import telemetry_runner
from autotest_lib.client.common_lib import error
# This test detects issues with low-throughput latency-sensitive workloads
# caused by entering idle state.
#
# Such loads sleep regularly but also need to wake up and hit deadlines. We've
# observed on some systems that if idle-state is enabled, we miss a lot of
# deadlines (even though the compute capacity is sufficient).
#
# This test runs top_25_smooth with idle-state both enabled and disabled, and
# looks for a discrepancy in the results. This workload is quite noisy, so
# we run multiple times and take N * stdev as the threshold for flagging an
# issue.
#
# In testing, this approach seemed quite robust, if the parameters (repetitions
# and threshold) are set appropriately. Increasing page-set repetitions helped a
# lot (reduces noise), as did selecting a good value for N (which trades off
# false positives vs. false negatives).
#
# Based on testing, we found good results by using 5 indicative pages, setting
# pageset-repetitions to 7, and taking the mean - 2 * stddev as the estimate
# for "we can be confident that the true regression is not worse than this".
#
# This results in under-estimating the regression (typically by around 2 with
# a healthy system), so false alarms should be rare or non-existent. In testing
# 50 iterations with a good and bad system, this identified 100% of regressions
# and non-regressions correctly (in fact mean - 1 * stddev would also have done
# so, but this seems a bit marginal).
# Repeat each page given number of times
PAGESET_REPEAT = 7
# PAGES can be set to a subset of pages to run for a shorter test, or None to
# run all pages in top_25_smooth.
# Simpler pages emphasise the issue more, as the system is more likely to enter
# idle state.
#
# These were selected by running all pages many times (on a system which
# exhibits the issue), and choosing the 5 pages which have the highest values
# for mean_regression - 2 * stddev - i.e. give the clearest indication of a
# regression.
PAGES = ['games.yahoo', 'Blogger', 'LinkedIn', 'cats', 'booking']
# Path to sysfs control file for disabling idle state
DISABLE_PATH = '/sys/devices/system/cpu/cpu{}/cpuidle/state{}/disable'
class kernel_IdlePerf(test.test):
"""
Server side regression test for performance impact of idle-state.
This test runs some smoothness tests with and without sleep enabled, to
check that the impact of enabling sleep is not significant.
"""
version = 1
_cleanup_required = False
def _check_sysfs(self, host):
# First check that we are on a suitable DUT which offers the ability to
# disable the idle state
arch = host.run_output('uname -m')
if arch != 'aarch64':
# Idle states differ between CPU architectures, so this test would
# need further development to support other platforms.
raise error.TestNAError('Test only supports Arm aarch64 CPUs')
if not host.path_exists(DISABLE_PATH.format(0, 1)):
logging.error('sysfs path absent: cannot disable idle state')
raise error.TestError('Cannot disable idle state')
# Identify available idle states. state0 is running state; other states
# should be disabled when disabling idle.
self.states = []
state_dirs = host.run_output(
'ls -1 /sys/devices/system/cpu/cpu0/cpuidle/')
for state in state_dirs.split('\n'):
if re.match('state[1-9][0-9]*$', state):
# Look for dirnames like 'state1' (but exclude 'state0')
self.states.append(int(state[5:]))
logging.info('Found idle states: {}'.format(self.states))
self.cpu_count = int(host.run_output('nproc --all'))
logging.info('Found {} cpus'.format(self.cpu_count))
logging.info('Idle enabled = {}'.format(self._is_idle_enabled(host)))
# From this point on we expect the test to be able to run, so we will
# need to ensure that the idle state is restored when the test exits
self._cleanup_required = True
self._enable_idle(host, False)
if self._is_idle_enabled(host):
logging.error('Failed to disable idle state')
raise error.TestError('Cannot disable idle state')
self._enable_idle(host, True)
if not self._is_idle_enabled(host):
logging.error('Failed to re-enable idle state')
raise error.TestError('Cannot disable idle state')
def _is_idle_enabled(self, host):
return host.run_output('cat ' + DISABLE_PATH.format(0, 1)) == '0'
def _enable_idle(self, host, enable):
logging.info('Setting idle enabled to {}'.format(enable))
x = '0' if enable else '1'
for cpu in range(0, self.cpu_count):
for state in self.states:
path = DISABLE_PATH.format(cpu, state)
host.run_output('echo {} > {}'.format(x, path))
def _parse_results_file(self, path):
def _mean(values):
return sum(values) / float(len(values))
with open(path) as fp:
histogram_json = json.load(fp)
scores = {}
# list of % smooth scores for each page and for each pageset-repetition
for page in histogram_json['charts']['percentage_smooth']:
if page == 'summary':
continue
page_result = histogram_json['charts']['percentage_smooth'][page]
scores[page] = {'percentage_smooth': _mean(page_result['values']),
'std': page_result['std']
}
return scores
def _compare_results(self, idle_enabled, idle_disabled):
results = {
'passed': True
}
for page in idle_enabled:
diff = (idle_disabled[page]['percentage_smooth']
- idle_enabled[page]['percentage_smooth'])
diff_std = (math.sqrt(idle_enabled[page]['std'] ** 2
+ idle_disabled[page]['std'] ** 2))
passed = (idle_enabled[page]['percentage_smooth'] >
(idle_disabled[page]['percentage_smooth'] - diff_std * 2))
key = re.sub('\W', '_', page)
results[key] = {
'idle_enabled': idle_enabled[page],
'idle_disabled': idle_disabled[page],
'difference': diff,
'difference_std': diff_std,
'passed': passed
}
results['passed'] = results['passed'] and passed
return results
def _run_telemetry(self, host, telemetry, enable):
logging.info('Running telemetry with idle enabled = {}'.format(enable))
self._enable_idle(host, enable)
args = ['--pageset-repeat={}'.format(PAGESET_REPEAT)]
if PAGES:
stories = r'\|'.join(r'\(' + p + r'\)' for p in PAGES)
story_filter = '--story-filter={}'.format(stories)
args.append(story_filter)
logging.info('Running telemetry with args: {}'.format(args))
result = telemetry.run_telemetry_benchmark(
'smoothness.top_25_smooth', self, *args)
if result.status != telemetry_runner.SUCCESS_STATUS:
raise error.TestFail('Failed to run benchmark')
# ensure first run doesn't get overwritten by second run
default_path = os.path.join(self.resultsdir, 'results-chart.json')
if enable:
unique_path = os.path.join(self.resultsdir,
'results-chart-idle-enabled.json')
else:
unique_path = os.path.join(self.resultsdir,
'results-chart-idle-disabled.json')
os.rename(default_path, unique_path)
return self._parse_results_file(unique_path)
def run_once(self, host=None, args={}):
"""Run the telemetry scrolling benchmark.
@param host: host we are running telemetry on.
"""
logging.info('Checking sysfs')
self._check_sysfs(host)
local = args.get('local') == 'True'
telemetry = telemetry_runner.TelemetryRunner(
host, local, telemetry_on_dut=False)
logging.info('Starting test')
results_idle = self._run_telemetry(host, telemetry, True)
results_noidle = self._run_telemetry(host, telemetry, False)
# Score is the regression in percentage of smooth frames caused by
# enabling CPU idle.
logging.info('Processing results')
results = self._compare_results(results_idle, results_noidle)
self.write_perf_keyval(results)
if not results['passed']:
raise error.TestFail('enabling CPU idle significantly '
'regresses scrolling performance')
def cleanup(self, host):
"""Cleanup of the test.
@param host: host we are running telemetry on.
"""
if self._cleanup_required:
logging.info('Restoring idle to enabled')
self._enable_idle(host, True)