# blob: 9174d7ed098a8efb713858c862cfd9df5215549d [file] [log] [blame]
# Copyright 2016 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import contextlib
import datetime
import logging
import pprint
import time
import common
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import utils as client_utils
from autotest_lib.client.common_lib.cros.network import ap_constants
from autotest_lib.client.common_lib.cros.network import iw_runner
from autotest_lib.server import hosts
from autotest_lib.server import site_linux_system
from autotest_lib.server.cros import host_lock_manager
from autotest_lib.server.cros.ap_configurators import ap_batch_locker
from autotest_lib.server.cros.ap_configurators \
import ap_configurator_factory
from autotest_lib.server.cros.network import chaos_clique_utils as utils
from autotest_lib.server.cros.network import wifi_client
# Webdriver master hostname: ChaosRunner.run() opens an SSH host to it (as
# user 'chaosvmmaster') and passes that host to the helpers that check,
# power off and power on the webdriver VM.
MASTERNAME = 'chromeos3-chaosvmmaster.cros.corp.google.com'
# Port used on both ends of the SSH tunnel created to the webdriver VM when
# the VM is in the lab zone; also the port probed with lsof after the tunnel
# is created.
WEBDRIVER_PORT = 9515
class ChaosRunner(object):
    """Object to run a network_WiFi_ChaosXXX test."""


    def __init__(self, test, host, spec, broken_pdus=None):
        """Stores the test configuration and logs server and DUT times.

        @param test: a string, test name.
        @param host: an Autotest host object, device under test.
        @param spec: an APSpec object.
        @param broken_pdus: list of offline PDUs, or None for a fresh empty
                            list. A list passed in is stored as-is (not
                            copied) and handed to the AP helpers.

        """
        self._test = test
        self._host = host
        self._ap_spec = spec
        # A fresh list per instance: a shared default list would leak PDU
        # state from one ChaosRunner into the next.
        self._broken_pdus = [] if broken_pdus is None else broken_pdus
        # Log server and DUT times
        dt = datetime.datetime.now()
        logging.info('Server time: %s', dt.strftime('%a %b %d %H:%M:%S %Y'))
        logging.info('DUT time: %s', self._host.run('date').stdout.strip())


    def run(self, job, batch_size=10, tries=10, capturer_hostname=None,
            conn_worker=None, work_client_hostname=None,
            disabled_sysinfo=False):
        """Executes Chaos test.

        Allocates a packet capturer, verifies that the number of visible
        networks is below ap_constants.MAX_SSID_COUNT, brings up a clean
        webdriver VM, then locks APs in batches and runs the test once
        against every AP that passes the health checks. Finally releases
        the VM and turns off all routers.

        @param job: an Autotest job object.
        @param batch_size: an integer, max number of APs to lock in one batch.
        @param tries: an integer, number of iterations to run per AP.
        @param capturer_hostname: a string or None, hostname or IP of capturer.
        @param conn_worker: ConnectionWorkerAbstract or None, to run extra
                            work after successful connection.
        @param work_client_hostname: a string or None, hostname of work client
        @param disabled_sysinfo: a bool, disable collection of logs from DUT.

        @raises TestError: the packet capturer never answered a scan, too
                           many networks are visible, or nothing is listening
                           on WEBDRIVER_PORT after the tunnel was created.

        """
        lock_manager = host_lock_manager.HostLockManager()
        webdriver_master = hosts.SSHHost(MASTERNAME, user='chaosvmmaster')
        host_prefix = self._host.hostname.split('-')[0]
        with host_lock_manager.HostsLockedBy(lock_manager):
            capture_host = utils.allocate_packet_capturer(
                    lock_manager, hostname=capturer_hostname,
                    prefix=host_prefix)
            # Cleanup and reboot packet capturer before the test.
            utils.sanitize_client(capture_host)
            capturer = site_linux_system.LinuxSystem(capture_host, {},
                                                     'packet_capturer')

            # Run iw scan and abort if more than allowed number of APs are up.
            iw_command = iw_runner.IwRunner(capture_host)
            start_time = time.time()
            logging.info('Performing a scan with a max timeout of 30 seconds.')
            capture_interface = 'wlan0'
            capturer_info = capture_host.run('cat /etc/lsb-release',
                                             ignore_status=True,
                                             timeout=5).stdout
            if 'whirlwind' in capturer_info:
                # Use the dual band aux radio for scanning networks.
                capture_interface = 'wlan2'
            while time.time() - start_time <= ap_constants.MAX_SCAN_TIMEOUT:
                networks = iw_command.scan(capture_interface)
                if networks is None:
                    # The scan failed; retry until the timeout elapses.
                    continue
                if len(networks) >= ap_constants.MAX_SSID_COUNT:
                    raise error.TestError(
                            'Probably someone is already running a '
                            'chaos test?!')
                break
            else:
                # Reached only when the time budget ran out without a single
                # successful scan (every other exit is a break or a raise).
                raise error.TestError(
                        'Packet capturer is not responding to scans. Check '
                        'device and re-run test')

            if conn_worker is not None:
                work_client_machine = utils.allocate_packet_capturer(
                        lock_manager, hostname=work_client_hostname)
                conn_worker.prepare_work_client(work_client_machine)

            # Lock VM. If on, power off; always power on. Then create a tunnel.
            webdriver_instance = utils.allocate_webdriver_instance(lock_manager)

            if utils.is_VM_running(webdriver_master, webdriver_instance):
                logging.info('VM %s was on; powering off for a clean instance',
                             webdriver_instance)
                utils.power_off_VM(webdriver_master, webdriver_instance)
                logging.info('Allow VM time to gracefully shut down')
                time.sleep(5)

            logging.info('Starting up VM %s', webdriver_instance)
            utils.power_on_VM(webdriver_master, webdriver_instance)
            logging.info('Allow VM time to power on before creating a tunnel.')
            time.sleep(30)

            # Only set when a tunnel is actually created; the cleanup below
            # checks it, so it must be bound on every path.
            webdriver_tunnel = None
            if not client_utils.host_is_in_lab_zone(
                    webdriver_instance.hostname):
                self._ap_spec._webdriver_hostname = webdriver_instance.hostname
            else:
                # If in the lab then port forwarding must be done so webdriver
                # connection will be over localhost.
                self._ap_spec._webdriver_hostname = 'localhost'
                webdriver_tunnel = webdriver_instance.create_ssh_tunnel(
                        WEBDRIVER_PORT, WEBDRIVER_PORT)
                logging.info('Wait for tunnel to be created.')
                # NOTE(review): the check below relies on the truthiness of
                # whatever client_utils.run() returns - confirm that it is
                # falsy when nothing is listening on the port.
                results = None
                for _ in range(3):
                    time.sleep(10)
                    results = client_utils.run('lsof -i:%s' % WEBDRIVER_PORT,
                                               ignore_status=True)
                    if results:
                        break
                if not results:
                    raise error.TestError(
                            'Unable to listen to WEBDRIVER_PORT: %s' % results)

            batch_locker = ap_batch_locker.ApBatchLocker(
                    lock_manager, self._ap_spec,
                    ap_test_type=ap_constants.AP_TEST_TYPE_CHAOS)

            while batch_locker.has_more_aps():
                # Work around for CrOS devices only:crbug.com/358716
                utils.sanitize_client(self._host)
                healthy_dut = True

                with contextlib.closing(wifi_client.WiFiClient(
                    hosts.create_host(
                            {
                                    'hostname' : self._host.hostname,
                                    'afe_host' : self._host._afe_host,
                                    'host_info_store':
                                            self._host.host_info_store,
                            },
                            host_class=self._host.__class__,
                    ),
                    './debug',
                    False,
                )) as client:

                    aps = batch_locker.get_ap_batch(batch_size=batch_size)
                    if not aps:
                        logging.info('No more APs to test.')
                        break

                    # Power down all of the APs because some can get grumpy
                    # if they are configured several times and remain on.
                    # Power-downs and configurations are issued for the
                    # whole batch at once.
                    utils.power_down_aps(aps, self._broken_pdus)
                    utils.configure_aps(aps, self._ap_spec, self._broken_pdus)

                    aps = utils.filter_quarantined_and_config_failed_aps(
                            aps, batch_locker, job, self._broken_pdus)

                    for ap in aps:
                        # http://crbug.com/306687
                        if ap.ssid is None:
                            logging.error('The SSID was not set for the AP:%s',
                                          ap)

                        healthy_dut = utils.is_dut_healthy(client, ap)

                        if not healthy_dut:
                            logging.error('DUT is not healthy, rebooting.')
                            batch_locker.unlock_and_reclaim_aps()
                            break

                        networks = utils.return_available_networks(
                                ap, capturer, job, self._ap_spec)

                        if networks is None:
                            # If scan returned no networks, iw scan failed.
                            # Reboot the packet capturer device and
                            # reconfigure the capturer.
                            batch_locker.unlock_and_reclaim_ap(ap.host_name)
                            logging.error('Packet capture is not healthy, '
                                          'rebooting.')
                            capturer.host.reboot()
                            capturer = site_linux_system.LinuxSystem(
                                    capture_host, {}, 'packet_capturer')
                            continue
                        if not networks:
                            # Packet capturer did not find the SSID in scan or
                            # there was a security mismatch.
                            utils.release_ap(ap, batch_locker,
                                             self._broken_pdus)
                            continue

                        assoc_params = ap.get_association_parameters()

                        if not utils.is_conn_worker_healthy(
                                conn_worker, ap, assoc_params, job):
                            utils.release_ap(
                                    ap, batch_locker, self._broken_pdus)
                            continue

                        kernel_ver = self._host.get_kernel_ver()
                        firmware_ver = utils.get_firmware_ver(self._host)
                        if not firmware_ver:
                            firmware_ver = "Unknown"

                        # Passed to the test as a preformatted string through
                        # the debug_info argument.
                        debug_dict = {'+++PARSE DATA+++': '+++PARSE DATA+++',
                                      'SSID': ap._ssid,
                                      'DUT': client.wifi_mac,
                                      'AP Info': ap.name,
                                      'kernel_version': kernel_ver,
                                      'wifi_firmware_version': firmware_ver}
                        debug_string = pprint.pformat(debug_dict)

                        logging.info('Waiting %d seconds for the AP dhcp '
                                     'server', ap.dhcp_delay)
                        time.sleep(ap.dhcp_delay)

                        job.run_test(self._test,
                                     capturer=capturer,
                                     capturer_frequency=networks[0].frequency,
                                     capturer_ht_type=networks[0].width,
                                     host=self._host,
                                     assoc_params=assoc_params,
                                     client=client,
                                     tries=tries,
                                     debug_info=debug_string,
                                     # Copy all logs from the system
                                     disabled_sysinfo=disabled_sysinfo,
                                     conn_worker=conn_worker,
                                     tag=ap.ssid if conn_worker is None else
                                         '%s.%s' % (conn_worker.name, ap.ssid))

                        utils.release_ap(ap, batch_locker, self._broken_pdus)

                        if conn_worker is not None:
                            conn_worker.cleanup()

                    if not healthy_dut:
                        # The APs were already unlocked and reclaimed above;
                        # start over with the next pass of the outer loop.
                        continue

                batch_locker.unlock_aps()

            if webdriver_tunnel:
                webdriver_instance.disconnect_ssh_tunnel(webdriver_tunnel)
                webdriver_instance.close()
            capturer.close()
            logging.info('Powering off VM %s', webdriver_instance)
            utils.power_off_VM(webdriver_master, webdriver_instance)
            lock_manager.unlock(webdriver_instance.hostname)

            if self._broken_pdus:
                logging.info('PDU is down!!!\nThe following PDUs are down:\n')
                pprint.pprint(self._broken_pdus)

            factory = ap_configurator_factory.APConfiguratorFactory(
                    ap_constants.AP_TEST_TYPE_CHAOS)
            factory.turn_off_all_routers(self._broken_pdus)