blob: a0209a4163f00a6bc20c8708f876e49a65201330 [file] [log] [blame]
#!/bin/sh
# shellcheck disable=SC2039
#
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Abort on the first unhandled command failure.
set -e

# Only run this script on test machines that run in the lab; on any machine
# without the marker file below, exit successfully without doing anything.
# See autotest/server/hosts/site_host.py for more information.
[ -f /mnt/stateful_partition/.labmachine ] || exit 0

# Lock consulted by restart_connection_manager before restarting shill.
SHILL_START_LOCK_PATH='/var/lock/shill-start.lock'

# File that tests create and flock to pause this check.
# See code in src/thirdparty/autotest/files/client/cros/power_suspend.py
PAUSE_FILE='/run/autotest_pause_ethernet_hook'

# Age in seconds of the current pause request; set by pause_check_ethernet.
paused_time=0
# Critical messages should be sent to /var/log/messages. Other messages should
# be sent via echo to be harvested by recover_duts.py.
#
# TODO(tbroch) Relocate this to common hook library if/when there's more than
# one hook.

# Reports a critical message: a timestamped copy goes to stderr and the bare
# text is handed to logger, tagged with this script's base name.
# $* - message text.
critical_msg() {
  local text="$*"
  echo "$(date --rfc-3339=seconds) ${text}" 1>&2
  logger -t "$(basename "$0")" -- "${text}"
}
# Writes an informational message to stderr, prefixed with the current
# timestamp. Nothing is sent to logger.
# $* - message text.
info_msg() {
  local text="$*"
  echo "$(date --rfc-3339=seconds) ${text}" 1>&2
}
# Prints the default gateway.
#
# The address is the token following "via" on the first line of
# `ip route get 1.0.0.0`. Nothing is printed when that line has no "via"
# (for example a route that goes straight out of a device) or when no route
# is reported, so callers can treat an empty result as "no gateway" instead
# of receiving an unrelated field such as the device name.
get_default_gateway() {
  local ip_route
  ip_route="$(ip route get 1.0.0.0)"
  echo "${ip_route}" |
    awk 'NR == 1 {
      for (i = 1; i < NF; i++) {
        if ($i == "via") {
          print $(i + 1)
          exit
        }
      }
    }'
}
# Unbinds and rebinds every interface named <digits>*-0:1.0 under the USB
# "hub" driver (presumably the root hubs -- confirm) so that the devices
# behind it are enumerated again.
#
# Each rescan is announced through critical_msg. A hub is given one second
# between unbind and bind, and three seconds afterwards to finish scanning.
rescan_usb_hubs() {
  local port
  local portnum
  local usbhub=/sys/bus/usb/drivers/hub
  for port in "${usbhub}"/[0-9]*-0:1.0; do
    # When nothing matches, the shell leaves the pattern itself in ${port}.
    # Skip it rather than logging it, writing it to unbind/bind and sleeping.
    [ -e "${port}" ] || continue
    critical_msg "Rescanning ${port}"
    portnum="$(basename "${port}")"
    echo "${portnum}" > "${usbhub}"/unbind
    sleep 1
    echo "${portnum}" > "${usbhub}"/bind
    # USB needs a bit of time to scan the "hub"
    sleep 3
  done
}
# Prints, one per line, the entries of /sys/class/net that look like Ethernet
# interfaces.
#
# An interface whose "duplex" attribute can be read is listed immediately.
# Any other interface is listed only if it survives all the filters below.
search_devices() {
  local sysfs_entry
  local ifname
  for sysfs_entry in /sys/class/net/*; do
    ifname="$(basename "${sysfs_entry}")"
    # lab interconnect is full-duplex and tells us so; devices w/o link fail
    # this read and fall through to the filters.
    if ! cat "${sysfs_entry}/duplex" > /dev/null 2>&1 ; then
      # "wwan*" devices are 3G/4G/LTE modems; "lo" is skipped as well.
      case "${ifname}" in
        wwan*|lo) continue ;;
      esac
      # Virtual devices (e.g. lxcbr0 or arcbr0) report a non-zero
      # addr_assign_type. A failed read also counts as "not 0" because the
      # error text is captured too.
      [ "$(cat "${sysfs_entry}/addr_assign_type" 2>&1)" = "0" ] || continue
      # Anything iw can describe is a "wlan" device.
      if iw "${ifname}" info > /dev/null 2>&1 ; then
        continue
      fi
    fi
    echo "${ifname}"
  done
}
# Prints the Ethernet interfaces found on the system, one per line.
#
# When the first scan finds nothing, the USB hubs are rescanned and the scan is
# repeated once; whatever the second scan yields (possibly nothing) is printed.
find_ethernet_interfaces() {
  local found
  found=$(search_devices)
  [ -n "${found}" ] || {
    # didn't find any eth devices.
    # Some possible causes are:
    #   crbug.com/452686 RT8153 (USB3-GigE) dongle coming up as USB-storage
    #     (Fixed: "mist" will reset back to ethernet device)
    #   crbug.com/733425 USB enumeration fails with
    #     "device not accepting address 2, error -71"
    info_msg "No ethernet found: Rescanning USB hubs"
    rescan_usb_hubs
    # check again
    found=$(search_devices)
  }
  echo "${found}"
}
# Pings the given address through each Ethernet interface in turn, stopping at
# the first interface for which the ping command succeeds.
# $1 - IP address to ping.
# Returns 0 as soon as one ping succeeds, 1 if none does (or if there are no
# interfaces).
do_ping() {
  local target=$1
  local nic
  for nic in $(find_ethernet_interfaces); do
    if ping -q -I "${nic}" -c 9 "${target}"; then
      return 0
    fi
  done
  return 1
}
# Prints the remote IP address of the first established SSH connection.
#
# netstat lines that mention tcp, ":22 " and ESTABLISHED are considered; the
# address is the part of the fifth column before the first ":". Connections
# from 127.0.0.1 and entries whose address part is empty are skipped. Prints
# nothing when no suitable connection exists.
find_ssh_client() {
  # The whole awk program has to be one shell word: closing the quote at the
  # end of a line hands awk a truncated program and runs the rest as a
  # separate command.
  netstat -lanp | awk '
    /tcp.*:22 .*ESTABLISHED.*/ {
      split($5, a, ":")
      if (a[1] != "" && a[1] != "127.0.0.1") {
        print a[1]
        exit
      }
    }'
}
# Prints the IPv4 address of one neighbor seen on an Ethernet interface.
#
# Interfaces are examined in order; on each, the first neighbor entry in
# state REACHABLE, DELAY or STALE is taken.
# Returns 0 after printing an address, 1 when no interface has such a neighbor.
find_ethernet_neighbor() {
  local nic
  local peer
  for nic in $(find_ethernet_interfaces); do
    peer=$(ip -4 neigh show dev "${nic}" |
      awk '/REACHABLE|DELAY|STALE/ {print $1; exit}')
    if [ -n "${peer}" ]; then
      echo "${peer}" && return 0
    fi
  done
  return 1
}
# Tries to reach some host over Ethernet, ideally our autotest server.
#
# Candidate addresses are looked up lazily, in this order, and the next lookup
# only runs if the previous candidate could not be pinged:
#   1. the default gateway,
#   2. a connected SSH client,
#   3. last attempt: any recently seen neighbor.
# A lookup that fails or prints nothing is skipped.
# Returns 0 on the first successful ping, 1 if no candidate answers.
ping_controlling_server() {
  local lookup
  local candidate
  for lookup in get_default_gateway find_ssh_client find_ethernet_neighbor; do
    candidate="$("${lookup}")" || candidate=
    if [ -n "${candidate}" ]; then
      # Unquoted on purpose to keep the existing behavior: do_ping only reads
      # its first argument, so when a lookup prints several addresses only the
      # first one is pinged.
      # shellcheck disable=SC2086
      do_ping ${candidate} && return 0
    fi
  done
  return 1
}
# Recovery method: reloads the driver of every Ethernet interface by calling
# reload_network_device on it, announcing each one through info_msg.
# Returns 0 if at least one interface was processed, 1 if none was found.
# The result of reload_network_device itself is not inspected here.
reload_ethernet_drivers() {
  local nic
  local outcome=1
  for nic in $(find_ethernet_interfaces); do
    info_msg "Reload driver for interface ${nic}"
    reload_network_device "${nic}"
    outcome=0
  done
  return "${outcome}"
}
# Recovery method: de-authorizes and re-authorizes the USB port behind every
# USB network device, sort of like unplugging/plugging the dongle.
# Returns 0 if at least one port was toggled, 1 otherwise.
toggle_usb_ports() {
  local netdev
  local port_dir
  local toggled=1
  for netdev in /sys/class/net/*; do
    port_dir=$(readlink -f "${netdev}")
    # Example of what the resolved path is expected to look like:
    #   /sys/devices/pci0000:00/0000:00:14.0/usb2/2-2/2-2:1.0/net/eth1
    #
    # But we want the port for that device, which is what remains after
    # stripping the trailing "/<interface>/net/<name>":
    #   /sys/devices/pci0000:00/0000:00:14.0/usb2/2-2
    port_dir=${port_dir%/*/net/*}
    # Only USB devices have "authorized" field in /sys
    [ -w "${port_dir}/authorized" ] || continue
    toggled=0
    echo 0 > "${port_dir}/authorized"
    sleep 2
    echo 1 > "${port_dir}/authorized"
    sleep 1
  done
  return ${toggled}
}
# Restarts shill (stop, then start) unless the shill start lock is held.
#
# Reads SHILL_START_LOCK_PATH. When the lock is held, the request is ignored,
# the holder is reported through info_msg and the function returns 0.
restart_connection_manager() {
  # Keep the holder out of the global namespace.
  local lock_holder
  # NB: -e will fail on a dangling symlink. That's deliberate. The
  # symlink should point at /proc/<locker's PID>. And if that path is
  # gone, the locker has exited, and the lock is stale.
  if [ -e "${SHILL_START_LOCK_PATH}" ]; then
    # readlink fails when the lock exists but is not a symlink; fall back to
    # an empty holder so that "set -e" does not abort the whole hook here.
    lock_holder=$(readlink "${SHILL_START_LOCK_PATH}") || lock_holder=
    info_msg "Ignoring restart request; lock held by ${lock_holder}"
    return 0
  fi
  initctl stop shill || info_msg "Shill was not running."
  initctl start shill
}
# Tells whether a test has asked for the ethernet check to be paused.
#
# power_SuspendStress tests requires many minutes of network timeout
# tolerance since the SSH connection to the autotest server will be
# disrupted.  *** See http://crbug.com/334951 ***
#
# "Pause" the ethernet check for up to 30 minutes at the
# request of any test that creates and flocks PAUSE_FILE.
#
# Sets the global paused_time to the number of seconds since PAUSE_FILE's
# last status change when a pause is in effect, and to 0 otherwise.
# Returns 0 when a pause is in effect, 1 when it is not (the caller may then
# try to recover the link).
pause_check_ethernet() {
  local current
  local locked_since
  paused_time=0
  if ! flock -xn "${PAUSE_FILE}" -c : ; then
    # Somebody holds the lock: work out for how long.
    current="$(date +%s)"
    locked_since=$(stat -c%Z "${PAUSE_FILE}") || true
    [ -n "${locked_since}" ] || return 1
    paused_time=$((current - locked_since))
    return 0
  fi
  # file wasn't locked - can try to recover ethernet link.
  rm -f "${PAUSE_FILE}"
  return 1
}
# Checks the lab Ethernet link and tries to bring it back when it is down.
#
# For each recovery method (toggle_usb_ports, then reload_ethernet_drivers):
#   - honor a pause request that is at most 30 minutes old by returning 0;
#     an older request is discarded and the check proceeds;
#   - return 0 if the controlling server can be pinged;
#   - otherwise run the method and, if it did something, poll for up to
#     30 seconds for connectivity to return.
# When every method has failed, the machine is rebooted and 1 is returned.
main() {
  local recovery_method
  local now
  local start_time
  local method_timeout

  for recovery_method in toggle_usb_ports reload_ethernet_drivers; do
    if pause_check_ethernet ; then
      # pause_check_ethernet() sets up $paused_time.
      if ! [ "${paused_time}" -gt $((30*60)) ] ; then
        info_msg "Ethernet Check Pause started ${paused_time} seconds ago."
        return 0
      fi
      critical_msg "Pause request exceeded 30 minutes. Checking lab network link."
      rm -f "${PAUSE_FILE}"
    fi

    # Attempt to ping our controlling autotest server over ethernet.
    if ping_controlling_server; then
      return 0
    fi

    critical_msg "Attempting recovery method \"${recovery_method}\""

    # A success return from the recovery method implies that it successfully
    # performed some action that makes it worth re-checking to see whether
    # our connectivity was remediated.  Otherwise, we move on to the next
    # recovery method without delay.
    "${recovery_method}" || continue

    now="$(date +%s)"
    start_time="${now}"
    method_timeout=$((now+30))

    # Bring shill back if it is not reported as running.
    if ! initctl status shill | grep -q running ; then
      restart_connection_manager
    fi

    # poll "controlling_server" until timeout
    # NB: Our Lab DHCP servers must respond in < 30 seconds.
    while [ "${now}" -lt "${method_timeout}" ]; do
      if ping_controlling_server; then
        critical_msg "${recovery_method} successful after $((now-start_time)) seconds."
        return 0
      fi
      sleep 1
      now="$(date +%s)"
    done
  done

  critical_msg "All ethernet recovery methods have failed. Rebooting."
  sync

  # Give powerd a chance to reboot via the standard path (and log messages that
  # are helpful for debugging) before calling 'reboot' directly.
  dbus-send --system --type=method_call --dest=org.chromium.PowerManager \
    /org/chromium/PowerManager org.chromium.PowerManager.RequestRestart \
    int32:2 string:'recover_duts check_ethernet hook failed' &
  sleep 30
  reboot
  return 1
}
# Run the hook. This is the last command, so main's return value becomes the
# script's exit status.
main