| #!/bin/bash |
| # |
| # Copyright (c) 2012 The Chromium OS Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
# Stop at the first command that fails outside of a guarded context.
set -e

# This hook only acts on test machines that run in the lab; everywhere else
# it exits successfully without doing anything.
# See autotest/server/hosts/site_host.py for more information.
[ -f /mnt/stateful_partition/.labmachine ] || exit 0

# While this path exists (and is not a dangling symlink), shill is not
# restarted by restart_connection_manager().
SHILL_START_LOCK_PATH="/var/lock/shill-start.lock"

# File that tests create and flock to pause the ethernet check.
# See code in src/thirdparty/autotest/files/client/cros/power_suspend.py
PAUSE_FILE="/run/autotest_pause_ethernet_hook"

# Age of the current pause request in seconds; set by pause_check_ethernet().
paused_time=0
| |
| |
# Critical messages should be sent to /var/log/messages. Other messages should
# be sent via echo to be harvested by recover_duts.py.
#
# TODO(tbroch) Relocate this to common hook library if/when there's more than
# one hook.

# Reports a critical message.
# Arguments: the words of the message.
# Outputs:   "<timestamp>  <message>" on stderr; the same message is also
#            handed to logger(1), tagged with this script's file name.
# Returns:   the exit status of logger.
critical_msg() {
  # Quote the timestamp and join the arguments explicitly so that neither is
  # subject to word splitting or pathname expansion.
  printf '%s  %s\n' "$(date --rfc-3339=seconds)" "$*" >&2
  logger -t "${0##*/}" -- "$@"
}
| |
# Reports an informational message.
# Arguments: the words of the message.
# Outputs:   "<timestamp>  <message>" on stderr; nothing on stdout.
info_msg() {
  # Quote the timestamp and join the arguments explicitly so that neither is
  # subject to word splitting or pathname expansion.
  printf '%s  %s\n' "$(date --rfc-3339=seconds)" "$*" >&2
}
| |
# Prints the default gateway.
#
# The gateway is taken to be the next hop ("via" address) that
# `ip route get 1.0.0.0` reports. Nothing is printed when the lookup fails or
# when the selected route has no next hop, so callers never mistake another
# field (e.g. the device name of a directly connected route) for a gateway.
#
# Outputs: the gateway address on stdout, or nothing.
# Returns: 0.
get_default_gateway() {
  local ip_route

  # A failed lookup simply means "no gateway"; it must not abort the script.
  ip_route="$(ip route get 1.0.0.0)" || ip_route=

  awk '{
    for (i = 1; i < NF; i++) {
      if ($i == "via") {
        print $(i + 1)
        exit
      }
    }
  }' <<< "${ip_route}"
}
| |
# Unbinds and re-binds every entry matching <N>-0:1.0 under the USB hub
# driver directory in sysfs, forcing the hub driver to scan it again.
# Each rescanned entry is reported through critical_msg.
rescan_usb_hubs() {
  local port
  local portnum
  local usbhub=/sys/bus/usb/drivers/hub

  for port in "${usbhub}"/[0-9]*-0:1.0; do
    # When nothing matches, the shell leaves the pattern itself in the list;
    # skip it instead of writing the pattern to unbind/bind.
    [ -e "${port}" ] || continue

    critical_msg "Rescanning ${port}"
    portnum="${port##*/}"
    echo "${portnum}" > "${usbhub}/unbind"
    sleep 1
    echo "${portnum}" > "${usbhub}/bind"
    # USB needs a bit of time to scan the "hub"
    sleep 3
  done
}
| |
# Prints, one per line, the names of the network interfaces under
# /sys/class/net that look like Ethernet devices.
#
# An interface is reported when its "duplex" attribute can be read, or when
# it is not named wwan*/lo, has addr_assign_type 0 and is not known to iw.
search_devices() {
  local device_path
  local device

  for device_path in /sys/class/net/*; do
    device="${device_path##*/}"

    # lab interconnect is full-duplex and tells us so.
    # devices w/o link won't get listed here
    if cat "${device_path}/duplex" > /dev/null 2>&1; then
      echo "${device}"
      continue
    fi

    # ignore "wwan*" devices (they are 3G/4G/LTE modems) and the loopback
    # interface.
    case "${device}" in
      wwan*|lo) continue ;;
    esac

    # ignore virtual devices (e.g. lxcbr0 or arcbr0). A read error ends up in
    # the compared string as well, so unreadable entries are skipped too.
    if [ "$(cat "${device_path}/addr_assign_type" 2>&1)" != "0" ]; then
      continue
    fi

    # ignore "wlan" devices.
    if iw "${device}" info > /dev/null 2>&1; then
      continue
    fi

    echo "${device}"
  done
}
| |
# Prints the Ethernet interfaces found on the system as a single
# space-separated line. If the first search comes back empty, the USB hubs
# are rescanned and the search is run one more time.
find_ethernet_interfaces() {
  local found=$(search_devices)

  if [ -n "${found}" ]; then
    # Left unquoted on purpose: joins the one-per-line list into one line.
    echo ${found}
    return
  fi

  # No eth device showed up. Some possible causes are:
  #   crbug.com/452686 RT8153 (USB3-GigE) dongle coming up as USB-storage
  #     (Fixed: "mist" will reset back to ethernet device)
  #   crbug.com/733425 USB enumeration fails with
  #     "device not accepting address 2, error -71"
  info_msg "No ethernet found: Rescanning USB hubs"
  rescan_usb_hubs

  # Second and last attempt.
  found=$(search_devices)
  echo ${found}
}
| |
| |
# Pings the given IP address through each ethernet device in turn, stopping
# at the first device over which the ping succeeds.
# Arguments: $1 - IP address to ping.
# Returns:   0 as soon as one ping succeeds, 1 if none does.
do_ping() {
  local ip_addr=$1
  local eth

  # The interface list is split on whitespace on purpose; every value handed
  # to ping is quoted so it is passed through unchanged.
  for eth in $(find_ethernet_interfaces); do
    ping -q -I "${eth}" -c 9 "${ip_addr}" && return 0
  done

  return 1
}
| |
# Prints the remote IP address of the first established SSH connection
# (a netstat TCP line mentioning port 22 in state ESTABLISHED) whose peer is
# not 127.0.0.1. Prints nothing when there is no such connection.
find_ssh_client() {
  # The peer address is column 5 ("addr:port"); keep the part before the
  # first colon. Addresses for which that part is empty are skipped, and the
  # scan stops at the first usable one.
  netstat -lanp | awk '
    /tcp.*:22 .*ESTABLISHED.*/ {
      split($5, a, ":")
      if (a[1] != "" && a[1] != "127.0.0.1") {
        print a[1]
        exit
      }
    }'
}
| |
# Checks connectivity by pinging the default gateway and, if that does not
# succeed, the first connected SSH client (our autotest server).
# Returns: 0 if either ping succeeds, 1 otherwise.
ping_controlling_server() {
  local default_gateway
  local -a ssh_clients=()

  # Declared separately from the assignment so the fallback can actually run
  # when the lookup fails.
  default_gateway="$(get_default_gateway)" || default_gateway=
  if [ -n "${default_gateway}" ]; then
    do_ping "${default_gateway}" && return 0
  fi

  # find_ssh_client may report several addresses; split them on whitespace
  # (without pathname expansion) and use the first one only. read returns
  # non-zero on reaching end of input, which is expected here.
  read -r -d '' -a ssh_clients <<< "$(find_ssh_client)" || true
  if [ -n "${ssh_clients[0]:-}" ]; then
    do_ping "${ssh_clients[0]}" && return 0
  fi
  return 1
}
| |
# Recovery method: runs reload_network_device for every ethernet interface
# that find_ethernet_interfaces reports.
# Returns: 0 if at least one interface was processed, 1 if there was none.
reload_ethernet_drivers() {
  local nic
  local reloaded=0

  for nic in $(find_ethernet_interfaces); do
    info_msg "Reload driver for interface ${nic}"
    reload_network_device "${nic}"
    reloaded=$((reloaded + 1))
  done

  if [ "${reloaded}" -eq 0 ]; then
    return 1
  fi
  return 0
}
| |
# Recovery method: for every entry in /sys/class/net whose port directory has
# a writable "authorized" attribute, writes 0 and then 1 to that attribute.
# Returns: 0 if at least one port was toggled, 1 otherwise.
toggle_usb_ports() {
  local net_dev
  local port_dir
  local status=1

  for net_dev in /sys/class/net/*; do
    # The resolved path is expected to look like:
    #   /sys/devices/pci0000:00/0000:00:14.0/usb2/2-2/2-2:1.0/net/eth1
    # Dropping the trailing "/<interface>/net/<name>" leaves the port:
    #   /sys/devices/pci0000:00/0000:00:14.0/usb2/2-2
    port_dir=$(readlink -f "${net_dev}") || true
    port_dir=${port_dir%/*/net/*}

    # Only USB devices have "authorized" field in /sys
    [ -w "${port_dir}/authorized" ] || continue

    status=0
    # disable port: sort of like unplugging/plugging the dongle
    echo 0 > "${port_dir}/authorized"
    sleep 2
    echo 1 > "${port_dir}/authorized"
    sleep 1
  done

  return ${status}
}
| |
# Restarts shill through initctl, unless the shill start lock is held.
# Reads the global SHILL_START_LOCK_PATH.
restart_connection_manager() {
  local lock_holder

  # NB: -e will fail on a dangling symlink. That's deliberate. The
  # symlink should point at /proc/<locker's PID>. And if that path is
  # gone, the locker has exited, and the lock is stale.
  if [ -e "${SHILL_START_LOCK_PATH}" ]; then
    # readlink fails when the lock is not a symlink; that must not abort the
    # script, so fall back to a placeholder for the log message.
    lock_holder=$(readlink "${SHILL_START_LOCK_PATH}") || lock_holder="unknown"
    info_msg "Ignoring restart request; lock held by ${lock_holder}"
    return 0
  fi

  initctl stop shill || info_msg "Shill was not running."
  initctl start shill
}
| |
# Tells whether a test has asked for the ethernet check to be paused.
#
# power_SuspendStress tests requires many minutes of network timeout
# tolerance since the SSH connection to the autotest server will be
# disrupted. *** See http://crbug.com/334951 ***
# Any test can request a pause by creating and flocking PAUSE_FILE; the
# caller decides how long such a request is honoured.
#
# Globals: PAUSE_FILE (read, possibly removed), paused_time (written: age of
#          the pause request in seconds, 0 when there is none).
# Returns: 0 if a pause is in effect, 1 otherwise.
pause_check_ethernet() {
  local now
  local pause_start

  paused_time=0

  if flock -xn "${PAUSE_FILE}" -c : ; then
    # Nobody holds the lock - can try to recover ethernet link.
    rm -f "${PAUSE_FILE}"
    return 1
  fi

  now="$(date +%s)" || true
  # The file's change time marks the start of the pause.
  pause_start=$(stat -c%Z "${PAUSE_FILE}") || true
  [ -n "${pause_start}" ] || return 1

  paused_time=$((now - pause_start))
  return 0
}
| |
# Checks the lab network link and, while it is down, works through the
# recovery methods (toggle_usb_ports, then reload_ethernet_drivers).
# If the link is still down after the last method, a reboot is scheduled.
#
# Globals: paused_time (read, set by pause_check_ethernet), PAUSE_FILE.
# Returns: 0 if the check is paused or the controlling server answers,
#          1 after scheduling the reboot.
main() {
  local recovery_method
  local now
  local start_time
  local method_timeout
  local max_pause=$((30*60))

  for recovery_method in toggle_usb_ports reload_ethernet_drivers; do
    # pause_check_ethernet() sets up $paused_time.
    if pause_check_ethernet; then
      if [ ${paused_time} -le ${max_pause} ]; then
        info_msg "Ethernet Check Pause started ${paused_time} seconds ago."
        return 0
      fi
      critical_msg "Pause request exceeded 30 minutes. Checking lab network link."
      rm -f "${PAUSE_FILE}"
    fi

    # Attempt to ping our controlling autotest server over ethernet.
    ping_controlling_server && return 0

    critical_msg "Attempting recovery method \"${recovery_method}\""

    # A recovery method returns success only when it actually did something
    # that makes it worth re-checking connectivity. Otherwise go straight on
    # to the next method.
    if ! "${recovery_method}"; then
      continue
    fi

    now="$(date +%s)"
    start_time="${now}"
    method_timeout=$(( now + 30 ))

    if ! initctl status shill | grep -q running ; then
      restart_connection_manager
    fi

    # poll "controlling_server" until timeout
    # NB: Our Lab DHCP servers must respond in < 30 seconds.
    until [ ${now} -ge ${method_timeout} ]; do
      if ping_controlling_server; then
        critical_msg "${recovery_method} successful after $(( now - start_time )) seconds."
        return 0
      fi
      sleep 1
      now=$(date +%s)
    done
  done

  critical_msg "All ethernet recovery methods have failed. Rebooting."
  sync
  # Reboot from the background after a short delay so that this function can
  # still return.
  (sleep 5 && reboot) &
  return 1
}
| |
# Script entry point. main is the last command, so its return status becomes
# the exit status of the script.
main