blob: 6ed17b7f697b088f8d45bd44575bd4b4501b6dd3 [file] [log] [blame]
#!/bin/bash
#
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
set -e

# This hook only does work on test machines that run in the lab; the marker
# file tested below is what identifies them.  Anywhere else the script ends
# immediately with a success status.
# See autotest/server/hosts/site_host.py for more information.
[ -f /mnt/stateful_partition/.labmachine ] || exit 0
# Whitespace-separated list of kernel driver names that
# is_non_ethernet_driver reports as "not Ethernet".
readonly NON_ETHERNET_DRIVERS="cdc_ether"
# Path tested by restart_connection_manager before it restarts shill.
readonly SHILL_START_LOCK_PATH="/var/lock/shill-start.lock"
# See code in src/thirdparty/autotest/files/client/cros/power_suspend.py
readonly PAUSE_FILE="/run/autotest_pause_ethernet_hook"
# Critical messages should be sent to /var/log/messages. Other messages should
# be sent via echo to be harvested by recover_duts.py.
#
# TODO(tbroch) Relocate this to common hook library if/when there's more than
# one hook.
#
# Echoes the message on stdout and also passes it to logger(1), tagged with
# the base name of this script.
# Arguments: $@ - the words of the message.
critical_msg() {
  echo "$@"
  # ${0##*/} strips the directory part without word-splitting $0.
  logger -t "${0##*/}" -- "$@"
}
# Prints the default gateway: the next hop the kernel reports for reaching
# the address 1.0.0.0.  Prints nothing when the lookup fails or the reported
# route has no "via" next hop.
get_default_gateway() {
  # Take the word that follows "via" rather than a fixed column: on a route
  # without a next hop the third column is the device name, not an address.
  ip route get 1.0.0.0 \
    | awk '{ for (i = 1; i < NF; i++) if ($i == "via") { print $(i + 1); exit } }'
}
# Tests whether a driver name appears in NON_ETHERNET_DRIVERS.
# Arguments: $1 - driver name to look up.
# Returns:   0 when the name is listed, 1 when it is not.
is_non_ethernet_driver() {
  local wanted="$1"
  local candidate
  # NON_ETHERNET_DRIVERS is deliberately unquoted so it splits into words.
  for candidate in ${NON_ETHERNET_DRIVERS}; do
    [ "${candidate}" != "${wanted}" ] || return 0
  done
  return 1
}
# Tests whether the link of a network device is connected, judged by the
# LOWER_UP flag in the output of "ip link show".
# Arguments: $1 - interface name.
# Returns:   0 when LOWER_UP is present, non-zero otherwise.
is_connected() {
  case "$(ip link show "$1")" in
    *LOWER_UP*) return 0 ;;
    *) return 1 ;;
  esac
}
# Prints, one per line, the Ethernet interfaces found on the system.
#
# Candidates are the /sys/class/net entries whose names start with eth, wan,
# lan or usbeth and which have a driver bound.  A candidate is printed when
# its driver is not listed as non-Ethernet, or when its link is connected.
find_ethernet_interfaces() {
  local device_path
  local driver_path
  local device
  local driver
  for device_path in /sys/class/net/{eth,wan,lan,usbeth}*; do
    # A missing driver link also covers glob patterns that matched nothing.
    driver_path="${device_path}/device/driver"
    [ -e "${driver_path}" ] || continue
    device="${device_path##*/}"
    # The driver name is the last component of the resolved driver link.
    driver="$(readlink -f "${driver_path}")"
    driver="${driver##*/}"
    if ! is_non_ethernet_driver "${driver}" || is_connected "${device}"; then
      echo "${device}"
    fi
  done
}
# Prints, one per line, the USB-Ethernet interfaces found on the system:
# the /sys/class/net entries named eth*, wan*, lan* or usbeth* whose resolved
# sysfs path contains "usb".
find_usb_ethernet_interfaces() {
  # Keep the loop variable out of the caller's scope.
  local device_path
  for device_path in /sys/class/net/{eth,wan,lan,usbeth}*; do
    # Skip glob patterns that matched nothing.
    [ -e "${device_path}" ] || continue
    case "$(readlink -f "${device_path}")" in
      *usb*) echo "${device_path##*/}" ;;
    esac
  done
}
# Pings the given IP address through each wired Ethernet device in turn,
# stopping at the first device for which ping succeeds.
# Arguments: $1 - IP address to ping.
# Returns:   0 if some device reached the address, 1 otherwise.
do_ping() {
  local ip_addr="$1"
  local eth
  # Word splitting of the interface list is intentional.
  for eth in $(find_ethernet_interfaces); do
    ping -q -I "${eth}" -c 9 "${ip_addr}" && return 0
  done
  return 1
}
# Prints the remote IP address of the first established SSH connection to
# this machine (local port 22), or nothing if there is none.  Connections
# from the loopback address are ignored.
find_ssh_client() {
  # netstat columns: proto, recv-q, send-q, local, foreign, state, ...
  netstat -lanp | awk '
    $1 ~ /^tcp/ && $4 ~ /:22$/ && $6 == "ESTABLISHED" {
      addr = $5
      sub(/:[0-9]+$/, "", addr)   # drop the remote port
      sub(/^::ffff:/, "", addr)   # IPv4-mapped IPv6 -> plain IPv4
      if (addr != "" && addr != "127.0.0.1" && addr != "::1") {
        print addr
        exit
      }
    }'
}
# Checks connectivity by pinging the default gateway and, if that does not
# answer, the connected SSH client(s) (our autotest server).
# Returns: 0 as soon as one ping succeeds, 1 if none does.
ping_controlling_server() {
  local default_gateway
  local ssh_clients
  local ssh_client

  # Declared separately from the assignment so the "||" fallback can
  # actually observe a failing helper.
  default_gateway="$(get_default_gateway)" || default_gateway=""
  if [ -n "${default_gateway}" ]; then
    do_ping "${default_gateway}" && return 0
  fi

  ssh_clients="$(find_ssh_client)" || ssh_clients=""
  # Word splitting is intentional: every address reported is tried in turn.
  for ssh_client in ${ssh_clients}; do
    do_ping "${ssh_client}" && return 0
  done
  return 1
}
# Runs reload_network_device on every USB-Ethernet interface.
# Returns: 0 if at least one interface was processed, 1 if none was found.
reload_usb_ethernet_devices() {
  local status=1
  local iface
  for iface in $(find_usb_ethernet_interfaces); do
    printf 'Reload interface %s\n' "${iface}"
    reload_network_device "${iface}"
    status=0
  done
  return "${status}"
}
# Brings every Ethernet interface down and back up with ifconfig.
# Returns: 0 if at least one interface was bounced, 1 if none was found.
toggle_ethernet_interfaces() {
  local status=1
  local iface
  for iface in $(find_ethernet_interfaces); do
    printf 'Bounce interface %s\n' "${iface}"
    ifconfig "${iface}" down
    ifconfig "${iface}" up
    status=0
  done
  return "${status}"
}
# De-authorizes and re-authorizes the port behind every network device whose
# sysfs parent exposes a writable "authorized" file.
# Returns: 0 if at least one port was toggled, 1 otherwise.
toggle_usb_interfaces() {
  local net_path
  local port_path
  local status=1
  for net_path in /sys/class/net/*; do
    # "|| true" keeps a readlink failure from mattering, as before.
    port_path="$(readlink -f "${net_path}")" || true
    # The resolved path is expected to look like:
    #   /sys/devices/pci0000:00/0000:00:14.0/usb2/2-2/2-2:1.0/net/eth1
    # Stripping the trailing "/<interface>/net/<name>" leaves the port:
    #   /sys/devices/pci0000:00/0000:00:14.0/usb2/2-2
    port_path="${port_path%/*/net/*}"
    # Only USB devices have "authorized" field in /sys
    [ -w "${port_path}/authorized" ] || continue
    status=0
    # Disable then re-enable the port: sort of like unplugging and plugging
    # the dongle back in.
    echo 0 > "${port_path}/authorized"
    sleep 3
    echo 1 > "${port_path}/authorized"
  done
  return "${status}"
}
# Stops (if running) and starts the shill job via initctl, unless the shill
# start lock is currently held.
# Returns: 1 when the restart was skipped because of the lock, otherwise the
#          status of "initctl start shill".
restart_connection_manager() {
  local lock_holder
  # NB: -e will fail on a dangling symlink. That's deliberate. The
  # symlink should point at /proc/<locker's PID>. And if that path is
  # gone, the locker has exited, and the lock is stale.
  if [ -e "${SHILL_START_LOCK_PATH}" ]; then
    lock_holder="$(readlink "${SHILL_START_LOCK_PATH}")"
    echo "Ignoring restart request; lock held by ${lock_holder}"
    return 1
  fi
  initctl stop shill || echo "Shill was not running."
  initctl start shill
}
# Restarts the connection manager only when "initctl status shill" does not
# report it as running.
# Returns: 1 when shill is already running (nothing was done), otherwise the
#          status of restart_connection_manager.
ensure_connection_manager_is_running() {
  case "$(initctl status shill)" in
    *running*) return 1 ;;
  esac
  restart_connection_manager
}
# Decides whether the Ethernet check is currently paused.
#
# power_SuspendStress tests require many minutes of network timeout
# tolerance since the SSH connection to the autotest server will be
# disrupted. *** See http://crbug.com/334951 ***
#
# The check is paused for up to 30 minutes at the request of any test that
# creates and flocks PAUSE_FILE.
#
# Globals: PAUSE_FILE (read; the file is removed once nobody locks it).
# Returns: 0 if the check is paused and recovery should be skipped,
#          1 if recovery may proceed.
pause_check_ethernet() {
  local now
  local start_time
  local paused_time

  # Without the file nobody has asked for a pause.
  if [ ! -e "${PAUSE_FILE}" ]; then
    return 1
  fi

  # flock(1) only treats its argument as a file name when a command follows
  # it, hence the trailing "true".  Getting the lock without blocking means
  # no test holds it any more.
  if flock -xn "${PAUSE_FILE}" true; then
    # file wasn't locked - can try to recover ethernet link.
    rm -f "${PAUSE_FILE}"
    return 1
  fi

  now="$(date +%s)"
  start_time="$(stat -c%Z "${PAUSE_FILE}")" || start_time=""
  if [ -n "${start_time}" ]; then
    paused_time=$((now - start_time))
    if [ "${paused_time}" -gt 1800 ]; then
      # check link despite PAUSE_FILE locked.
      critical_msg "PAUSE_TIME (${paused_time}) exceeded 1800 seconds."
      return 1
    fi
  fi
  return 0
}

# Same check under the name used by recover_network and main.
pause_ethernet_check() {
  pause_check_ethernet "$@"
}
# Tries each recovery method in order until connectivity is back.
#
# After a method reports that it did something, waits 30 seconds, dumps the
# interface state and re-checks connectivity.
# Returns: 0 if a method restored connectivity or the check got paused,
#          1 if every method was tried without success (results in reboot).
recover_network() {
  # Keep the loop variable out of the caller's scope.
  local recovery_method
  for recovery_method in \
    ensure_connection_manager_is_running \
    toggle_ethernet_interfaces \
    toggle_usb_interfaces \
    reload_usb_ethernet_devices \
    restart_connection_manager
  do
    critical_msg "Attempting recovery method \"${recovery_method}\""
    # A success return from the recovery method implies that it successfully
    # performed some action that makes it worth re-checking to see whether
    # our connectivity was remediated. Otherwise, we move on to the next
    # recovery method without delay.
    "${recovery_method}" || continue
    sleep 30
    ifconfig -a
    if ping_controlling_server; then
      critical_msg "Recovery method \"${recovery_method}\" successful"
      return 0
    fi
    if pause_ethernet_check; then
      critical_msg "Pausing ethernet recovery."
      return 0
    fi
  done
  # results in reboot
  return 1
}
# Timeout in minutes, and the same value converted to seconds.
readonly TIMEOUT_MINUTES=15
readonly TIMEOUT=$(( TIMEOUT_MINUTES * 60 ))
# Checks connectivity and, if it is lost, attempts recovery.
# Returns: 0 if the controlling server answers, the check is paused, or
#          recovery succeeded; otherwise schedules a reboot and returns 1.
main() {
  # Attempt to ping our controlling autotest server over ethernet.
  if ping_controlling_server; then
    return 0
  fi
  if pause_ethernet_check; then
    return 0
  fi
  if recover_network; then
    return 0
  fi
  critical_msg "All efforts to recover ethernet have been exhausted. Rebooting."
  sync
  # Reboot from the background after a short delay so this function can
  # still return.
  (sleep 5 && reboot) &
  return 1
}
# Entry point; the script's arguments are handed to main.
main "$@"