blob: 8e35c34656bfb7609a8b77df881ccc5d1a570cb5 [file] [log] [blame]
#!/bin/sh
# shellcheck disable=SC2039
#
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Abort the script as soon as an unchecked command fails.
set -e

# Lock path consulted by restart_connection_manager() before restarting shill.
SHILL_START_LOCK_PATH='/run/lock/shill-start.lock'

# main() skips the ethernet check entirely when this file exists.
WIFI_CRED='/usr/local/etc/wifi_cred'

# See do_suspend() in src/third_party/autotest/files/client/cros/power/sys_power.py
PAUSE_FILE='/run/autotest_pause_ethernet_hook'

paused_time=0
# Critical messages should be sent to /var/log/messages. Other messages should
# be sent via echo to be harvested by recover_duts.py.
#
# TODO(tbroch) Relocate this to common hook library if/when there's more than
# one hook.
# Reports a critical message twice: timestamped on stderr, and through
# logger(1) tagged with the basename of this script.
# $* - message words; they are joined into a single string.
critical_msg() {
  local text="$*"
  echo "$(date --rfc-3339=seconds) ${text}" >&2
  logger -t "$(basename "$0")" -- "${text}"
}
# Writes a timestamped informational message to stderr only.
# $* - message words; they are joined into a single string.
info_msg() {
  local text="$*"
  echo "$(date --rfc-3339=seconds) ${text}" >&2
}
# Prints the gateway the kernel would use to reach 1.0.0.0, i.e. the token
# that follows "via" on the first line of `ip route get 1.0.0.0`.
# Prints nothing when that route has no "via" hop (for example an on-link
# route of the form "1.0.0.0 dev eth0 ..."), so callers never mistake an
# interface name for an address.
get_default_gateway() {
  local ip_route
  ip_route="$(ip route get 1.0.0.0)"
  echo "${ip_route}" | awk '
    NR == 1 {
      for (i = 1; i < NF; i++) {
        if ($i == "via") {
          print $(i + 1)
          exit
        }
      }
    }'
}
# Prints, one name per line, the Ethernet interfaces found under
# /sys/class/net.
# $1 - optional; when non-empty, only interfaces whose operstate reads "up"
#      are printed.
search_devices() {
  local only_up="$1"
  local sysfs_entry
  local name

  for sysfs_entry in /sys/class/net/*; do
    name="$(basename "${sysfs_entry}")"

    # Caller asked for operational links only: drop everything not "up".
    if [ -n "${only_up}" ]; then
      [ "$(cat "${sysfs_entry}/operstate")" = "up" ] || continue
    fi

    # Loopback is not Ethernet, and "wwan*" devices are 3G/4G/LTE modems.
    case "${name}" in
      lo|wwan*) continue ;;
    esac

    # Ignore virtual devices (e.g. lxcbr0 or arcbr0). stderr is folded into
    # the comparison, so an unreadable attribute also fails the "0" test.
    [ "$(cat "${sysfs_entry}/addr_assign_type" 2>&1)" = "0" ] || continue

    # Anything "iw" can report on is a "wlan" device: ignore it.
    if iw "${name}" info > /dev/null 2>&1; then
      continue
    fi

    echo "${name}"
  done
}
# Pings an address through each operational ethernet device in turn.
# $1 - IP address to ping.
# Returns 0 as soon as one device gets a successful ping, 1 if none does
# (including when there is no operational device at all).
do_ping() {
  local target=$1
  local iface
  for iface in $(search_devices 1); do
    if ping -q -I "${iface}" -c 9 "${target}"; then
      return 0
    fi
  done
  return 1
}
# Prints the remote IP address of the first established SSH connection.
#
# Scans `netstat -lanp` for ESTABLISHED tcp rows that mention port 22 and
# takes the part of the foreign address (column 5) before the first ":".
# Loopback peers (127.0.0.1) are skipped, as are rows where that part is
# empty (e.g. a foreign address beginning with "::"). Only one address is
# ever printed; nothing is printed when no row qualifies.
find_ssh_client() {
  netstat -lanp | awk '
    /tcp.*:22 .*ESTABLISHED.*/ {
      split($5, a, ":")
      if (a[1] != "" && a[1] != "127.0.0.1") {
        print a[1]
        exit
      }
    }'
}
# Prints the IPv4 address of one neighbor known on an operational ethernet
# device: the first neighbor-table entry whose line matches REACHABLE, DELAY
# or STALE, on the first device that has such an entry.
# Returns 0 after printing an address, 1 when no device has a usable entry.
find_ethernet_neighbor() {
  local iface
  local candidate
  for iface in $(search_devices 1); do
    candidate=$(ip -4 neigh show dev "${iface}" |
      awk '/REACHABLE|DELAY|STALE/ {print $1; exit}')
    if [ -n "${candidate}" ] && echo "${candidate}"; then
      return 0
    fi
  done
  return 1
}
# Tries to reach something that proves the wired network works. Candidates
# are looked up lazily, in this order, and each is pinged before the next
# one is even queried:
#   1. the default gateway,
#   2. a connected SSH client (our autotest server),
#   3. last attempt: any recently seen ethernet neighbor.
# A lookup that fails or prints nothing is skipped.
# Returns 0 on the first successful ping, 1 when every candidate fails.
ping_controlling_server() {
  local lookup
  local candidate
  for lookup in get_default_gateway find_ssh_client find_ethernet_neighbor; do
    candidate="$("${lookup}")" || candidate=
    if [ -z "${candidate}" ]; then
      continue
    fi
    # Left unquoted exactly as before: a multi-word result is split and
    # do_ping only looks at its first argument.
    do_ping ${candidate} && return 0
  done
  return 1
}
# Recovery method: runs reload_network_device on every Ethernet interface
# reported by search_devices (operational or not).
# Returns 0 when at least one interface was processed, 1 when none was found.
# The result of reload_network_device itself is not reflected in the status.
reload_ethernet_drivers() {
  local iface

  # Reuse the function's own positional parameters as the device list.
  set -- $(search_devices)
  if [ "$#" -eq 0 ]; then
    return 1
  fi

  for iface in "$@"; do
    info_msg "Reload driver for interface ${iface}"
    reload_network_device "${iface}"
  done
  return 0
}
# Recovery method: de-authorizes and re-authorizes the port behind every
# network interface whose port directory has a writable "authorized" file.
# Returns 0 when at least one port was toggled, 1 otherwise.
toggle_usb_ports() {
  local netdev
  local port_dir
  local status=1

  for netdev in /sys/class/net/*; do
    port_dir=$(readlink -f "${netdev}")
    # Example of what the resolved path is expected to look like:
    #   /sys/devices/pci0000:00/0000:00:14.0/usb2/2-2/2-2:1.0/net/eth1
    #
    # But we want the port for that device, which is what remains after
    # stripping the trailing "/<interface>/net/<name>" components:
    #   /sys/devices/pci0000:00/0000:00:14.0/usb2/2-2
    port_dir=${port_dir%/*/net/*}

    # Only USB devices have an "authorized" field in /sys.
    [ -w "${port_dir}/authorized" ] || continue

    status=0
    # Disable the port, then enable it again: sort of like unplugging and
    # re-plugging the dongle.
    echo 0 > "${port_dir}/authorized"
    sleep 2
    echo 1 > "${port_dir}/authorized"
    sleep 1
  done
  return ${status}
}
# Recovery method: if there are no operational Ethernet devices, rescan all
# USB hubs matching [0-9]*-0:1.0 by unbinding and re-binding them to the
# "hub" driver.
# Returns 1 immediately when an operational interface already exists (no
# need to reset anything). Otherwise returns 0 if an operational interface
# shows up within five 3-second polls after the rescan, else 1.
rescan_usb_hubs() {
  local hub_driver=/sys/bus/usb/drivers/hub
  local hub_path
  local hub_id
  local attempt

  if [ -n "$(search_devices 1)" ]; then
    return 1
  fi

  # Didn't find any eth devices.
  # Some possible causes are:
  #   crbug.com/452686 RT8153 (USB3-GigE) dongle coming up as USB-storage
  #     (Fixed: "mist" will reset back to ethernet device)
  #   crbug.com/733425 USB enumeration fails with
  #     "device not accepting address 2, error -71"
  info_msg "No ethernet found: Rescanning USB hubs"
  for hub_path in "${hub_driver}"/[0-9]*-0:1.0; do
    # An unmatched glob is left as the literal pattern; skip it.
    if [ ! -e "${hub_path}" ]; then
      continue
    fi
    critical_msg "Rescanning ${hub_path}"
    hub_id="$(basename "${hub_path}")"
    echo "${hub_id}" > "${hub_driver}"/unbind
    sleep 1
    echo "${hub_id}" > "${hub_driver}"/bind
  done

  # Return status: *now* do we have any devices?
  for attempt in 1 2 3 4 5; do
    # USB needs a bit of time to scan the "hub".
    sleep 3
    if [ -n "$(search_devices 1)" ]; then
      return 0
    fi
  done
  return 1
}
# Restarts shill through initctl, unless SHILL_START_LOCK_PATH shows that a
# shill start is already in progress, in which case the request is logged
# and ignored.
restart_connection_manager() {
  local lock_holder

  # NB: -e will fail on a dangling symlink. That's deliberate. The
  # symlink should point at /proc/<locker's PID>. And if that path is
  # gone, the locker has exited, and the lock is stale.
  if [ -e "${SHILL_START_LOCK_PATH}" ]; then
    # readlink fails when the lock exists but is not a symlink; without the
    # fallback that failure would abort the whole script under `set -e`.
    lock_holder=$(readlink "${SHILL_START_LOCK_PATH}") || lock_holder="unknown"
    info_msg "Ignoring restart request; lock held by ${lock_holder}"
    return
  fi

  initctl stop shill || info_msg "Shill was not running."
  initctl start shill
}
# Opens PAUSE_FILE on file descriptor 9 and tries to take an exclusive,
# non-blocking flock on it. The descriptor is opened with `exec`, so it
# stays open in this shell after the function returns.
# Returns 0 when the lock was acquired, 1 (after logging) when the file
# could not be opened or the lock could not be taken.
try_pause_lock() {
  # Append, to avoid changing the mtime of an existing lock if we don't
  # acquire it.
  if exec 9>>"${PAUSE_FILE}"; then
    if flock -xn 9; then
      return 0
    fi
    critical_msg "Failed to acquire ${PAUSE_FILE}"
    return 1
  fi
  critical_msg "Failed to open ${PAUSE_FILE}"
  return 1
}
# Clears any old lock by deleting PAUSE_FILE, then tries to grab a fresh
# one. Does not block; if the fresh lock cannot be taken the whole script
# exits with status 1.
force_pause_lock() {
  critical_msg "Clobbering lock (${PAUSE_FILE})"
  rm -f "${PAUSE_FILE}"
  if ! try_pause_lock; then
    exit 1
  fi
}
# Decides whether the ethernet check must be paused.
# Returns 0 if we need to pause (abort). Returns non-zero, holding the
# "pause" lock, if we can continue.
#
# power_SuspendStress tests require many minutes of network timeout
# tolerance since the SSH connection to the autotest server will be
# disrupted. *** See http://crbug.com/334951 ***
#
# The check is "paused" for up to 30 minutes at the request of any test
# that creates and flocks PAUSE_FILE. The age of the request is measured
# from the time reported by `stat -c%Z` on that file.
pause_check_ethernet() {
  local now
  local lock_time
  local elapsed
  local limit=$((30*60))

  # Acquire the lock and hold it until exit, if possible.
  if try_pause_lock; then
    # File wasn't locked - no need to pause.
    return 1
  fi

  now="$(date +%s)"
  lock_time=$(stat -c%Z "${PAUSE_FILE}") || true
  if [ -n "${lock_time}" ]; then
    elapsed=$((now - lock_time))
    if [ "${elapsed}" -le "${limit}" ]; then
      info_msg "Ethernet Check Pause started ${elapsed} seconds ago."
      return 0
    fi
    critical_msg "Pause request exceeded 30 minutes. Checking lab network link."
  fi

  # Either the lock time could not be determined or the pause has lasted
  # too long: clobber the lock and carry on with the check.
  force_pause_lock
  return 1
}
# Checks wired connectivity and, when it is missing, walks through the
# recovery methods; reboots the machine if none of them helps.
# Returns 0 when no check is needed, the check is paused, or connectivity
# is (or becomes) available.
main() {
  local method
  local now
  local began
  local deadline

  # Special check for devices running power autotests that connect to moblab
  # via WiFi. This check saves devices from rebooting even though they don't
  # have wired ethernet connection.
  if [ -f "${WIFI_CRED}" ]; then
    info_msg "${WIFI_CRED} found. No need to check ethernet."
    return 0
  fi

  pause_check_ethernet && return 0

  # Attempt to ping our controlling autotest server over ethernet.
  ping_controlling_server && return 0

  for method in rescan_usb_hubs toggle_usb_ports reload_ethernet_drivers; do
    critical_msg "Attempting recovery method \"${method}\""

    # A success return from the recovery method implies that it successfully
    # performed some action that makes it worth re-checking to see whether
    # our connectivity was remediated. Otherwise, we move on to the next
    # recovery method without delay.
    if ! "${method}"; then
      continue
    fi

    began="$(date +%s)"
    now="${began}"
    deadline=$((began+30))

    if ! initctl status shill | grep -q running ; then
      restart_connection_manager
    fi

    # Poll the controlling server until the 30 second deadline.
    # NB: Our Lab DHCP servers must respond in < 30 seconds.
    while [ "${now}" -lt "${deadline}" ]; do
      if ping_controlling_server; then
        critical_msg "${method} successful after $((now-began)) seconds."
        return 0
      fi
      sleep 1
      now="$(date +%s)"
    done
  done

  critical_msg "All ethernet recovery methods have failed. Rebooting."
  sync

  # Give powerd a chance to reboot via the standard path (and log messages
  # that are helpful for debugging) before calling 'reboot' directly.
  dbus-send --system --type=method_call --dest=org.chromium.PowerManager \
    /org/chromium/PowerManager org.chromium.PowerManager.RequestRestart \
    int32:2 string:'recover_duts check_ethernet hook failed' &
  sleep 30
  reboot
  return 1
}
# Script entry point: run the check defined in main(). No arguments are
# forwarded; main() does not read any.
main