| #!/bin/sh |
| # shellcheck disable=SC2039 |
| # |
| # Copyright (c) 2012 The Chromium OS Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
# Abort on the first unhandled command failure.
set -e

# Lock consulted by restart_connection_manager() before restarting shill.
readonly SHILL_START_LOCK_PATH="/run/lock/shill-start.lock"
# When this file exists, main() skips the ethernet check entirely.
readonly WIFI_CRED="/usr/local/etc/wifi_cred"

# See do_suspend() in src/third_party/autotest/files/client/cros/power/sys_power.py
readonly PAUSE_FILE="/run/autotest_pause_ethernet_hook"
# NOTE(review): nothing in this file reads this global; pause_check_ethernet()
# declares its own local of the same name. Left writable for that reason.
paused_time=0
| |
| |
# Critical messages should be sent to /var/log/messages. Other messages should
# go through info_msg() to be harvested by recover_duts.py.
#
# Writes "<RFC 3339 timestamp> <message>" to stderr and also hands the message
# to logger(1), tagged with the basename of this script.
# Arguments: $* - words of the message (joined by the first character of IFS,
#                 a space by default).
#
# TODO(tbroch) Relocate this to common hook library if/when there's more than
# one hook.
critical_msg() {
  # printf instead of echo: how echo treats backslashes (and leading dashes)
  # varies between shells, and this script runs under /bin/sh.
  printf '%s %s\n' "$(date --rfc-3339=seconds)" "$*" >&2
  logger -t "$(basename "$0")" -- "$*"
}
| |
# Writes "<RFC 3339 timestamp> <message>" to stderr only; unlike
# critical_msg(), nothing is passed to logger.
# Arguments: $* - words of the message (joined by the first character of IFS,
#                 a space by default).
info_msg() {
  # printf instead of echo: how echo treats backslashes varies between
  # shells, and this script runs under /bin/sh.
  printf '%s %s\n' "$(date --rfc-3339=seconds)" "$*" >&2
}
| |
# Prints the gateway used to reach 1.0.0.0: the address that follows "via" on
# the first line of `ip route get 1.0.0.0`.
# Outputs: the gateway address on stdout, or nothing when that line has no
#          "via" hop (e.g. an on-link route). Picking a fixed column would
#          hand callers an unrelated field, such as the device name, in that
#          case.
get_default_gateway() {
  local ip_route
  ip_route="$(ip route get 1.0.0.0)"
  printf '%s\n' "${ip_route}" |
    awk 'NR == 1 {
      for (i = 1; i < NF; i++) {
        if ($i == "via") {
          print $(i + 1)
          exit
        }
      }
      exit
    }'
}
| |
# Lists candidate Ethernet interfaces, one name per line on stdout.
# Every entry under /sys/class/net is considered, then filtered: "lo",
# "wwan*" modems, devices whose addr_assign_type is not "0", and devices for
# which `iw <dev> info` succeeds are all dropped.
# Arguments: $1 - optional; when non-empty, also drop interfaces whose
#                 operstate is not "up".
search_devices() {
  local only_up="$1"
  local sysfs_entry
  local ifname

  for sysfs_entry in /sys/class/net/*; do
    ifname="$(basename "${sysfs_entry}")"

    # The caller asked for operational links only.
    if [ -n "${only_up}" ]; then
      [ "$(cat "${sysfs_entry}/operstate")" = "up" ] || continue
    fi

    # "wwan*" devices are 3G/4G/LTE modems; "lo" is the loopback.
    case "${ifname}" in
      lo|wwan*) continue ;;
    esac

    # Virtual devices (e.g. lxcbr0 or arcbr0) do not report "0" here. A read
    # error is folded into the output, so it never compares equal to "0".
    [ "$(cat "${sysfs_entry}/addr_assign_type" 2>&1)" = "0" ] || continue

    # Anything iw can describe is a "wlan" device, not Ethernet.
    iw "${ifname}" info > /dev/null 2>&1 && continue

    echo "${ifname}"
  done
}
| |
# Pings an address through each operational ethernet device in turn.
# Arguments: $1 - IP address to ping.
# Returns:   0 as soon as ping succeeds on one device, 1 if it succeeds on
#            none (including when search_devices lists no device).
do_ping() {
  local target=$1
  local iface

  # Word splitting is intended: search_devices prints one name per line.
  for iface in $(search_devices 1); do
    if ping -q -I "${iface}" -c 9 "${target}"; then
      return 0
    fi
  done

  return 1
}
| |
# Prints the remote IP address of the first established SSH connection whose
# peer is not a loopback address.
# Outputs: at most one address on stdout; nothing when no connection matches.
find_ssh_client() {
  netstat -lanp | awk '
    /tcp.*:22 .*ESTABLISHED.*/ {
      peer = $5
      # Drop the trailing ":port" instead of splitting on every colon, so
      # addresses that themselves contain colons survive.
      sub(/:[0-9]+$/, "", peer)
      # Unwrap IPv4-mapped IPv6 peers such as ::ffff:10.0.0.7.
      sub(/^::[fF][fF][fF][fF]:/, "", peer)
      if (peer == "" || peer == "127.0.0.1" || peer == "::1") next
      print peer
      # Only the first match is wanted.
      exit
    }'
}
| |
# Prints the IPv4 address of one neighbor reachable via ethernet.
# Walks the operational ethernet devices in order and reports the first
# neighbor-table entry whose line mentions REACHABLE, DELAY or STALE.
# Returns: 0 after printing an address, 1 when no device has such an entry.
find_ethernet_neighbor() {
  local iface
  local peer

  for iface in $(search_devices 1); do
    peer=$(ip -4 neigh show dev "${iface}" |
      awk '/REACHABLE|DELAY|STALE/ {print $1; exit}')
    if [ -n "${peer}" ]; then
      echo "${peer}"
      return 0
    fi
  done
  return 1
}
| |
# Tries to reach the machine controlling this DUT over ethernet.
# Candidates are looked up and pinged in this order, stopping at the first one
# that answers:
#   1. the default gateway,
#   2. a connected SSH client (our autotest server),
#   3. last attempt: any recently seen ethernet neighbor.
# A lookup that fails or prints nothing is skipped.
# Returns: 0 as soon as do_ping succeeds for a candidate, 1 otherwise.
ping_controlling_server() {
  local lookup
  local candidate

  for lookup in get_default_gateway find_ssh_client find_ethernet_neighbor; do
    candidate="$("${lookup}")" || candidate=
    [ -n "${candidate}" ] || continue

    # Deliberately unquoted: a lookup may print several whitespace-separated
    # addresses, and do_ping only uses its first argument.
    # shellcheck disable=SC2086
    do_ping ${candidate} && return 0
  done
  return 1
}
| |
# Recovery method: runs reload_network_device for every Ethernet interface
# that search_devices lists (operational or not).
# Returns: 0 if the loop ran for at least one interface, 1 if no interface was
#          listed. The exit status of reload_network_device is not inspected
#          here.
reload_ethernet_drivers() {
  local iface
  local attempted=

  for iface in $(search_devices); do
    info_msg "Reload driver for interface ${iface}"
    reload_network_device "${iface}"
    attempted=yes
  done

  if [ -n "${attempted}" ]; then
    return 0
  fi
  return 1
}
| |
# Recovery method: de-authorizes and re-authorizes the USB port behind every
# USB network device, which is sort of like unplugging/plugging the dongle.
# Returns: 0 if at least one port was toggled, 1 if no network device had a
#          writable "authorized" file.
toggle_usb_ports() {
  local netdev
  local port_dir
  local status=1

  for netdev in /sys/class/net/*; do
    # The resolved path is expected to look like:
    #   /sys/devices/pci0000:00/0000:00:14.0/usb2/2-2/2-2:1.0/net/eth1
    # and the port of that device is what remains once the trailing
    # "/<interface>/net/<name>" is stripped:
    #   /sys/devices/pci0000:00/0000:00:14.0/usb2/2-2
    port_dir=$(readlink -f "${netdev}")
    port_dir=${port_dir%/*/net/*}

    # Only USB devices have "authorized" field in /sys
    [ -w "${port_dir}/authorized" ] || continue

    status=0
    echo 0 > "${port_dir}/authorized"
    sleep 2
    echo 1 > "${port_dir}/authorized"
    sleep 1
  done
  return "${status}"
}
| |
# Recovery method: if there are no operational Ethernet devices, rescan all
# USB hubs by unbinding and rebinding each "<N>-0:1.0" entry of the hub driver.
# Returns: 1 right away if an operational interface already exists (nothing
#          to do); otherwise 0 as soon as one shows up after the rescan, or 1
#          if none appears within 5 checks spaced 3 seconds apart.
rescan_usb_hubs() {
  local hub_driver=/sys/bus/usb/drivers/hub
  local hub_path
  local hub_id
  local tries_left=5

  # If there's an operational Ethernet interface, no need to reset any hubs.
  [ -z "$(search_devices 1)" ] || return 1

  # Didn't find any eth devices.
  # Some possible causes are:
  #   crbug.com/452686 RT8153 (USB3-GigE) dongle coming up as USB-storage
  #       (Fixed: "mist" will reset back to ethernet device)
  #   crbug.com/733425 USB enumeration fails with
  #       "device not accepting address 2, error -71"
  info_msg "No ethernet found: Rescanning USB hubs"

  for hub_path in "${hub_driver}"/[0-9]*-0:1.0; do
    # An unmatched glob stays literal; skip it.
    [ -e "${hub_path}" ] || continue

    critical_msg "Rescanning ${hub_path}"
    hub_id="$(basename "${hub_path}")"
    echo "${hub_id}" > "${hub_driver}"/unbind
    sleep 1
    echo "${hub_id}" > "${hub_driver}"/bind
  done

  # Return status: *now* do we have any devices?
  while [ "${tries_left}" -gt 0 ]; do
    # USB needs a bit of time to scan the "hub".
    sleep 3
    [ -n "$(search_devices 1)" ] && return 0
    tries_left=$((tries_left - 1))
  done
  return 1
}
| |
# Restarts shill (initctl stop, then start) unless the shill start lock is
# currently held, in which case the request is logged and ignored.
# Returns: the status of `initctl start shill`, or that of info_msg when the
#          request is ignored.
restart_connection_manager() {
  local lock_holder

  # NB: -e will fail on a dangling symlink. That's deliberate. The
  # symlink should point at /proc/<locker's PID>. And if that path is
  # gone, the locker has exited, and the lock is stale.
  if [ -e "${SHILL_START_LOCK_PATH}" ]; then
    # readlink fails when the path is not a symlink or disappeared after the
    # check above. Fall back to a placeholder so that failure cannot abort
    # the whole script under `set -e`.
    lock_holder=$(readlink "${SHILL_START_LOCK_PATH}") ||
      lock_holder="(unknown)"
    info_msg "Ignoring restart request; lock held by ${lock_holder}"
    return
  fi
  initctl stop shill || info_msg "Shill was not running."
  initctl start shill
}
| |
# Opens PAUSE_FILE on file descriptor 9 and tries to take an exclusive,
# non-blocking flock on it. The descriptor is opened with exec, so it stays
# open in this shell after the function returns.
# Returns: 0 when the lock was acquired, 1 (after logging a critical message)
#          when the file could not be opened or the lock is held elsewhere.
try_pause_lock() {
  # Append, to avoid changing the mtime of an existing lock if we don't acquire
  # it.
  exec 9>>"${PAUSE_FILE}" || {
    critical_msg "Failed to open ${PAUSE_FILE}"
    return 1
  }

  flock -xn 9 || {
    critical_msg "Failed to acquire ${PAUSE_FILE}"
    return 1
  }

  return 0
}
| |
# Clears an old lock and tries to grab a fresh one: logs the takeover, deletes
# PAUSE_FILE, then calls try_pause_lock. Does not block.
# Exits the whole script with status 1 if the lock still cannot be taken.
force_pause_lock() {
  critical_msg "Clobbering lock (${PAUSE_FILE})"
  rm -f "${PAUSE_FILE}"
  if ! try_pause_lock; then
    exit 1
  fi
}
| |
# Decides whether the ethernet check must be paused.
#
# power_SuspendStress tests requires many minutes of network timeout
# tolerance since the SSH connection to the autotest server will be
# disrupted. *** See http://crbug.com/334951 ***
#
# Any test may "pause" the ethernet check for up to 30 minutes by creating
# and flocking PAUSE_FILE.
#
# Returns: 0 if we need to pause (abort). Non-zero, with the "pause" lock
#          grabbed (and held until exit, if possible), if we can continue.
pause_check_ethernet() {
  local now
  local locked_since
  local elapsed
  local max_pause=$((30*60))

  # File wasn't locked - we hold it now, no need to pause.
  try_pause_lock && return 1

  now="$(date +%s)"
  locked_since=$(stat -c%Z "${PAUSE_FILE}") || true

  if [ -n "${locked_since}" ]; then
    elapsed=$((now - locked_since))
    if [ "${elapsed}" -gt "${max_pause}" ]; then
      critical_msg "Pause request exceeded 30 minutes. Checking lab network link."
    else
      info_msg "Ethernet Check Pause started ${elapsed} seconds ago."
      return 0
    fi
  fi

  # Either the lock time is unknown or the pause ran too long: clobber it.
  force_pause_lock
  return 1
}
| |
# Entry point: checks that the controlling server is reachable over ethernet
# and, if not, works through the recovery methods before rebooting.
# Returns: 0 when the check is skipped or paused, or when connectivity is
#          confirmed (possibly after a recovery); 1 after requesting a reboot.
main() {
  local method
  local now
  local started_at
  local deadline

  # Special check for devices running power autotests that connect to moblab via
  # WiFi. This check saves devices from rebooting even though they don't have
  # wired ethernet connection.
  if [ -f "${WIFI_CRED}" ]; then
    info_msg "${WIFI_CRED} found. No need to check ethernet."
    return 0
  fi

  # A test asked for the check to be paused.
  pause_check_ethernet && return 0

  # Attempt to ping our controlling autotest server over ethernet.
  ping_controlling_server && return 0

  for method in rescan_usb_hubs toggle_usb_ports reload_ethernet_drivers; do
    critical_msg "Attempting recovery method \"${method}\""

    # A success return from the recovery method implies that it successfully
    # performed some action that makes it worth re-checking to see whether
    # our connectivity was remediated. Otherwise, we move on to the next
    # recovery method without delay.
    if ! "${method}"; then
      continue
    fi

    started_at="$(date +%s)"
    now="${started_at}"
    deadline=$((started_at+30))

    if ! initctl status shill | grep -q running ; then
      restart_connection_manager
    fi

    # poll "controlling_server" until timeout
    # NB: Our Lab DHCP servers must respond in < 30 seconds.
    while [ "${now}" -lt "${deadline}" ]; do
      if ping_controlling_server; then
        critical_msg "${method} successful after $((now-started_at)) seconds."
        return 0
      fi
      sleep 1
      now="$(date +%s)"
    done
  done

  critical_msg "All ethernet recovery methods have failed. Rebooting."
  sync

  # Give powerd a chance to reboot via the standard path (and log messages that
  # are helpful for debugging) before calling 'reboot' directly.
  dbus-send --system --type=method_call --dest=org.chromium.PowerManager \
    /org/chromium/PowerManager org.chromium.PowerManager.RequestRestart \
    int32:2 string:'recover_duts check_ethernet hook failed' &
  sleep 30
  reboot

  return 1
}
| |
# Run the hook, forwarding any command-line arguments to main().
main "$@"