| # Copyright (c) 2011 The Chromium OS Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| import logging, os, time |
| |
| from autotest_lib.client.common_lib import error |
| from autotest_lib.client.cros.crash_test import CrashTest as CrashTestDefs |
| from autotest_lib.server import test |
| from autotest_lib.client.bin import utils |
| |
class platform_KernelErrorPaths(test.test):
    """Server-side test that kernel error paths reboot the client.

    For every entry of the table in run_once() the client is crashed by
    writing an action string into /proc/breakme.  After the client has
    restarted, the kernel crash report in the system crash directory is
    searched for the message expected for that action.
    """
    version = 1

    def breakme(self, text, cpu):
        """Verify the panic sysctls, then write `text` to /proc/breakme.

        Args:
            text: action string to send into /proc/breakme on the client.
            cpu: index of the CPU the write is pinned to with taskset, or
                None to perform the write without pinning.
        """
        # This test is ensuring that the machine will reboot on any
        # type of kernel panic.  If the sysctls below are not set
        # correctly, the machine will not reboot.  After verifying
        # that the machine has the proper sysctl state, we make it
        # reboot by writing to a /proc/breakme.
        #
        # 2011.03.09: ARM machines will currently fail due to
        #             'preserved RAM' not being enabled.
        self.client.run('sysctl kernel.panic|grep "kernel.panic = -1"')
        self.client.run('sysctl kernel.panic_on_oops|'
                        'grep "kernel.panic_on_oops = 1"')

        if cpu is not None:
            # Run on a specific CPU using taskset
            command = "echo %s | taskset -c %d tee /proc/breakme" % (text, cpu)
        else:
            # Run normally
            command = "echo %s > /proc/breakme" % text

        logging.info("KernelErrorPaths: executing '%s' on %s",
                     command, self.client.hostname)
        try:
            # Simple sending text into /proc/breakme resets the target
            # immediately, leaving files unsaved to disk and the master ssh
            # connection wedged for a long time. The sequence below borrowed
            # from logging_KernelCrashServer.py makes sure that the test
            # proceeds smoothly.
            self.client.run(
                'sh -c "sync; sleep 1; %s" >/dev/null 2>&1 &' % command)
        except error.AutoservRunError:
            # It is expected that this will cause a non-zero exit status.
            pass

    def _exists_on_client(self, f):
        """Return True if `ls` of path `f` succeeds on the client."""
        return self.client.run('ls "%s"' % f,
                               ignore_status=True).exit_status == 0

    def _enable_consent(self):
        """Enable consent so that crashes get stored in /var/spool/crash.

        Each consent file that already exists on the client is first moved
        to "<name>.autotest_backup".  The file is then either copied from
        client/cros under self.autodir or created empty, and chown'ed to
        the listed owner.
        """
        # (destination on client, source file name or None, owner)
        consent_files = [
            (CrashTestDefs._PAUSE_FILE, None, 'chronos'),
            (CrashTestDefs._CONSENT_FILE, None, 'chronos'),
            (CrashTestDefs._POLICY_FILE, 'mock_metrics_on.policy', 'root'),
            (CrashTestDefs._OWNER_KEY_FILE, 'mock_metrics_owner.key', 'root'),
        ]
        # Only files whose backup step has completed are recorded, so that
        # a failure part-way through never makes cleanup delete a client
        # file that was not backed up first.
        self._consent_files = []
        for dst, src, owner in consent_files:
            if self._exists_on_client(dst):
                self.client.run('mv "%s" "%s.autotest_backup"' % (dst, dst))
            self._consent_files.append((dst, src, owner))
            if src:
                full_src = os.path.join(self.autodir, 'client/cros', src)
                self.client.send_file(full_src, dst)
            else:
                self.client.run('touch "%s"' % dst)
            self.client.run('chown "%s" "%s"' % (owner, dst))

    def _restore_consent_files(self):
        """Restore consent files to their previous values.

        Removes every file recorded by _enable_consent() and moves its
        ".autotest_backup" copy, when one exists, back into place.
        """
        # cleanup() may run although _enable_consent() never recorded
        # anything (e.g. run_once() was not reached); nothing to restore
        # in that case.
        for f, _, _ in getattr(self, '_consent_files', ()):
            self.client.run('rm -f "%s"' % f)
            backup = '%s.autotest_backup' % f
            if self._exists_on_client(backup):
                self.client.run('mv "%s" "%s"' % (backup, f))

    def cleanup(self):
        """Put the client's consent files back, then run the base cleanup."""
        self._restore_consent_files()
        test.test.cleanup(self)

    def _crash_and_verify(self, action, text, timeout, cpu, pin_to_cpu):
        """Crash the client once and check the resulting crash report.

        Args:
            action: string to send into /proc/breakme.
            text: string that must appear in the kernel crash report.
            timeout: value passed as down_timeout and down_warning when
                waiting for the client to restart.
            cpu: index of the current iteration; used in the failure
                message and, when pin_to_cpu is set, as the CPU to pin to.
            pin_to_cpu: whether the write is pinned to `cpu`.

        Raises:
            error.TestFail: `text` is missing from the crash report.
            error.AutoservShutdownError: re-raised from wait_for_restart().
        """
        crash_log_dir = CrashTestDefs._SYSTEM_CRASH_DIR

        # Delete crash results, if any
        self.client.run('rm -f %s/*' % crash_log_dir)
        boot_id = self.client.get_boot_id()
        # This should cause target reset.
        # Run on a specific cpu if we're running on all of them,
        # otherwise run normally
        self.breakme(action, cpu if pin_to_cpu else None)
        try:
            self.client.wait_for_restart(
                down_timeout=timeout,
                down_warning=timeout,
                old_boot_id=boot_id,
                # Extend the default reboot timeout as some targets take
                # longer than normal before ssh is available again.
                timeout=self.client.DEFAULT_REBOOT_TIMEOUT * 4)
        except error.AutoservShutdownError:
            # Capture the client's process list in the logs before
            # propagating the failure.
            self.client.run('ps alx')
            raise

        # give the crash_reporter some time to log the crash
        time.sleep(5)
        result = self.client.run('cat %s/kernel.*.kcrash' % crash_log_dir)
        if text not in result.stdout:
            raise error.TestFail(
                "No '%s' in the log after sending '%s' on cpu %d"
                % (text, action, cpu))

    def run_once(self, host=None):
        """Crash the client through each error path and verify the report.

        Args:
            host: host object of the client under test; stored as
                self.client and used for every remote command.
        """
        self.client = host
        self._enable_consent()

        # Each tuple consists of four components:
        #   1. the 'breakme' string to send into /proc/breakme on the target,
        #   2. the crash report string to look for in the crash dump after
        #      the target restarts,
        #   3. the timeout to wait for the target to go down,
        #   4. whether the test is run once per CPU or only once.
        # Some tests take less time to run than others (null pointer and
        # panic), so those are run on all the CPUs as it doesn't add that
        # much time to the total.
        # TODO(vbendeb): add the following breakme strings after fixing kernel
        # bugs:
        # 'deadlock' (has to be sent twice), 'irqlockup'
        test_tuples = (
            ('softlockup', 'BUG: soft lockup', 25, False),
            ('bug', 'kernel BUG at', 10, False),
            ('hungtask', 'hung_task: blocked tasks', 300, False),
            ('nmiwatchdog', 'Watchdog detected hard LOCKUP', 50, False),
            ('nullptr',
             # x86 gives "BUG: unable to" while ARM gives "Unable to".
             'nable to handle kernel NULL pointer dereference at', 10,
             True),
            ('panic', 'Kernel panic - not syncing:', 10, True),
        )

        # Find out how many cpus we have.  Always run on at least one cpu,
        # even if no "processor" line could be counted.
        client_no_cpus = max(1, int(
            self.client.run('cat /proc/cpuinfo | grep processor | wc -l')
            .stdout.strip()))

        for action, text, timeout, all_cpu in test_tuples:
            if action == "nmiwatchdog":
                # ARM systems do not (presently) have NMI, so skip them for now.
                arch = self.client.get_arch()
                if arch.startswith('arm'):
                    logging.info("Skipping %s on architecture %s.",
                                 action, arch)
                    continue
                # 3.2 kernels use "nmilockup" rather than "nmiwatchdog".
                # NOTE(review): this only renames the action when
                # compare_versions() reports exact equality with "3.2";
                # confirm that get_kernel_ver() returns a value for which
                # that holds on 3.2.x kernels.
                ver = self.client.get_kernel_ver()
                if utils.compare_versions(ver, "3.2") == 0:
                    action = "nmilockup"

            no_cpus = client_no_cpus if all_cpu else 1
            for cpu in range(no_cpus):
                self._crash_and_verify(action, text, timeout, cpu, all_cpu)