#!/bin/bash
#
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Abort on the first unguarded command failure.
set -e

# This hook is only meant for test machines that run in the lab; anywhere
# else, stop right away and report success.
# See autotest/server/hosts/site_host.py for more information.
[ -f /mnt/stateful_partition/.labmachine ] || exit 0

# Whitespace-separated list of drivers that is_non_ethernet_driver reports as
# non-Ethernet.
NON_ETHERNET_DRIVERS="cdc_ether"

# Critical messages should be sent to /var/log/messages. Other messages should
# be sent via echo to be harvested by recover_duts.py.
#
# Logs a critical message through logger(1), tagged with this script's file
# name.
# Arguments: $@ - words of the message to log.
#
# TODO(tbroch) Relocate this to common hook library if/when there's more than
# one hook.
critical_msg() {
  # ${0##*/} strips the directory part without word-splitting a script path
  # that contains spaces (and without forking basename).
  logger -t "${0##*/}" -- "$@"
}

# Prints the gateway used to reach 1.0.0.0, i.e. the address following "via"
# in the output of `ip route get`. Prints nothing when the lookup fails or the
# route has no gateway.
get_default_gateway() {
  local ip_route
  # Assigned separately so that `local` does not mask the status of `ip`; a
  # failed lookup simply leaves the route empty.
  ip_route="$(ip route get 1.0.0.0)" || ip_route=""
  # Search the first line for the token after "via" rather than assuming it is
  # always the third field: on a route without a gateway the third field is
  # the device name, not an address.
  printf '%s\n' "${ip_route}" \
    | awk 'NR == 1 { for (i = 1; i < NF; i++) if ($i == "via") { print $(i + 1); exit } }'
}

# Checks a driver name against the NON_ETHERNET_DRIVERS list.
# Arguments: $1 - driver name to look up.
# Returns:   0 if $1 is listed in NON_ETHERNET_DRIVERS, 1 otherwise.
is_non_ethernet_driver() {
  local candidate="$1"
  local listed

  # NON_ETHERNET_DRIVERS is deliberately unquoted: it is a whitespace-separated
  # list and each word is compared in turn.
  for listed in ${NON_ETHERNET_DRIVERS}; do
    [ "${candidate}" != "${listed}" ] || return 0
  done
  return 1
}

# Shows the list of Ethernet interfaces found on the system, one name per line.
# An eth* entry is listed when it has a device/driver entry and that driver is
# not rejected by is_non_ethernet_driver.
# Arguments: $1 - (optional) directory to scan; defaults to /sys/class/net.
find_ethernet_interfaces() {
  local net_dir="${1:-/sys/class/net}"
  local device_path
  local driver_path
  local driver

  for device_path in "${net_dir}"/eth*; do
    driver_path="${device_path}/device/driver"
    # This check also skips the literal pattern left behind when the glob
    # matches nothing.
    if [ -e "${driver_path}" ]; then
      # Quoted so that a resolved driver path containing whitespace stays a
      # single argument to basename.
      driver="$(basename "$(readlink -f "${driver_path}")")"
      if ! is_non_ethernet_driver "${driver}"; then
        basename "${device_path}"
      fi
    fi
  done
}

# Pings the given IP address through each wired ethernet device in turn.
# Arguments: $1 - IP address to ping.
# Returns:   0 as soon as a ping through one interface succeeds, 1 if none
#            succeeds (including when no interface is found).
do_ping() {
  local ip_addr="$1"
  # Keep the loop variable local so a caller's own "eth" is not clobbered.
  local eth
  for eth in $(find_ethernet_interfaces); do
    ping -q -I "${eth}" -c 9 "${ip_addr}" && return 0
  done
  return 1
}

# Restart all our ethernet devices and restart shill, then wait 30 seconds and
# dump the interface state with `ifconfig -a`.
#
# Every step is best-effort: a failing command is reported via echo and the
# remaining steps still run. Left unguarded, a single failure would abort the
# whole script under `set -e` before the rest of the recovery is attempted.
recover_ethernet_devices() {
  local eth
  critical_msg "Gateway still unreachable; restarting ethernet interfaces"
  for eth in $(find_ethernet_interfaces); do
    echo "Bounce interface ${eth}"
    ifconfig "${eth}" down || echo "Failed to bring ${eth} down."
    ifconfig "${eth}" up || echo "Failed to bring ${eth} up."
  done
  initctl stop shill || echo "Shill was not running."
  initctl start shill || echo "Failed to start shill."
  sleep 30
  ifconfig -a || echo "Failed to list network interfaces."
}

# Prints the remote IP address of the first established TCP connection to the
# local SSH port (22), or nothing if there is none.
find_ssh_client() {
  # netstat columns: $1 protocol, $4 local address, $5 remote address,
  # $6 state. The local port is matched exactly so that ports such as :2200,
  # or outgoing connections to a remote port 22, are not mistaken for an SSH
  # client. awk exits after the first hit so at most one address is printed.
  netstat -lanp | awk '
    $1 ~ /^tcp/ && $4 ~ /:22$/ && $6 == "ESTABLISHED" {
      addr = $5
      sub(/:[0-9]+$/, "", addr)  # Drop the remote port.
      sub(/^::ffff:/, "", addr)  # Unwrap an IPv4-mapped IPv6 address.
      print addr
      exit
    }'
}

# Checks connectivity by pinging the default gateway and, if that does not
# answer, a connected SSH client (our autotest server).
# Returns: 0 if either host answers a ping, 1 otherwise.
ping_controlling_server() {
  local default_gateway
  local ssh_client

  # Assigned separately from `local`: combined, `local` always succeeds and
  # the `||` fallback could never run.
  default_gateway="$(get_default_gateway)" || default_gateway=""
  if [ -n "${default_gateway}" ]; then
    do_ping "${default_gateway}" && return 0
  fi

  ssh_client="$(find_ssh_client)" || ssh_client=""
  # Keep only the first line so that exactly one address is pinged even if
  # several clients are reported.
  ssh_client=${ssh_client%%$'\n'*}
  if [ -n "${ssh_client}" ]; then
    do_ping "${ssh_client}" && return 0
  fi
  return 1
}

# Checks connectivity to the controlling autotest server and escalates through
# retrying, bouncing the ethernet devices and finally rebooting.
# Returns: 0 if the server answered (possibly after retries) without any
#          recovery action, 1 if recovery action had to be taken.
main() {
  # Loop counter; kept local so it does not leak into the global scope.
  local i

  # Attempt to ping our controlling autotest server over ethernet.  We guarantee
  # a minimum of 12 minutes network timeout tolerance for tests that disrupt
  # connectivity with the SSH connection from the autotest server.  This timeout
  # is 15 minutes to make sure it can never fail before that SSH session does.
  if ping_controlling_server; then
    return 0
  fi
  critical_msg "Gateway unreachable; will retry for 15 minutes"
  # 30 attempts, 30 seconds apart.
  for i in {1..30}; do
    sleep 30
    if ping_controlling_server; then
      critical_msg "Gateway now answering; returning success"
      return 0
    fi
  done

  # We can't reach our controlling server through any ethernet devices.
  recover_ethernet_devices

  # Attempt to ping again.  If successful, return 1 so that we log the fact
  # that we needed to take action to recover the dut.
  if ping_controlling_server; then
    return 1
  fi

  # Last chance - reboot if we can't get any connectivity.
  critical_msg "All efforts to recover ethernet have been exhausted. Rebooting."
  sync
  # Reboot from a delayed background job so this function can still return.
  (sleep 5 && reboot) &
  return 1
}

# Script entry point; main takes no arguments, so forwarding "$@" is harmless.
main "$@"