#!/bin/bash
#
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

set -e

# This hook is only meant for test machines that run in the lab; do nothing
# (and report success) anywhere else.
# See autotest/server/hosts/site_host.py for more information.
[ -f /mnt/stateful_partition/.labmachine ] || exit 0

# Lock consulted by restart_connection_manager() before shill is restarted.
SHILL_START_LOCK_PATH="/var/lock/shill-start.lock"

# File that tests create and flock to pause this check.
# See code in src/thirdparty/autotest/files/client/cros/power_suspend.py
PAUSE_FILE="/run/autotest_pause_ethernet_hook"

# Age of the current pause request in seconds; set by pause_check_ethernet().
paused_time=0


# Critical messages should be sent to /var/log/messages.  Other messages should
# be sent via echo to be harvested by recover_duts.py.
#
# TODO(tbroch) Relocate this to common hook library if/when there's more than
# one hook.
#
# Reports a critical message: a timestamped copy is written to stderr and the
# message itself is handed to logger(1), tagged with this script's base name.
# Arguments: $@ - the words of the message.
critical_msg() {
  # Two spaces separate the timestamp from the message.
  printf '%s  %s\n' "$(date --rfc-3339=seconds)" "$*" >&2
  # "$0" is quoted so that a script path containing whitespace still yields
  # the right tag instead of being split into several basename arguments.
  logger -t "$(basename "$0")" -- "$@"
}

# Writes an informational message to stderr, prefixed with an RFC 3339
# timestamp.  Nothing is sent to the system log.
# Arguments: $@ - the words of the message.
info_msg() {
  # Two spaces separate the timestamp from the message.
  printf '%s  %s\n' "$(date --rfc-3339=seconds)" "$*" >&2
}

# Prints the gateway used to reach 1.0.0.0 (i.e. the default gateway).
# Outputs: the gateway address on stdout, or nothing when the lookup fails
#          or the selected route has no gateway.
# Returns: 0 in all of those cases; callers test the output for emptiness.
get_default_gateway() {
  local ip_route
  # A failed lookup (e.g. network unreachable) simply means "no gateway".
  ip_route="$(ip route get 1.0.0.0)" || true

  # The first line normally reads
  #   1.0.0.0 via <gateway> dev <interface> src <address> ...
  # but an on-link route has no "via" part.  Print the word that follows
  # "via" rather than a fixed column, so that an interface name is never
  # mistaken for a gateway address.
  awk 'NR == 1 {
         for (i = 1; i < NF; i++) {
           if ($i == "via") { print $(i + 1); exit }
         }
       }' <<< "${ip_route}"
}

# Unbinds and re-binds every entry named <N>-0:1.0 under the USB hub driver
# so that the devices behind it are enumerated again.  Each hub is logged via
# critical_msg before it is touched.
rescan_usb_hubs() {
  local port
  local portnum
  local usbhub=/sys/bus/usb/drivers/hub

  for port in "${usbhub}"/[0-9]*-0:1.0; do
    # When nothing matches, the pattern is left unexpanded; never write
    # that literal pattern to the driver's unbind/bind files.
    [ -e "${port}" ] || continue

    critical_msg "Rescanning  $port"
    portnum=$(basename "${port}")
    echo "${portnum}" > "${usbhub}/unbind"
    sleep 1
    echo "${portnum}" > "${usbhub}/bind"
    # USB needs a bit of time to scan the "hub"
    sleep 3
  done
}

# Prints, one per line, the network interfaces that look like wired Ethernet.
#
# An interface whose "duplex" attribute can be read is reported right away.
# Otherwise it is reported only if it is neither "lo" nor "wwan*", its
# addr_assign_type reads exactly "0", and iw(8) does not know it.
search_devices() {
  local entry
  local iface
  local assign_type

  for entry in /sys/class/net/*; do
    iface=${entry##*/}

    # lab interconnect is full-duplex and tells us so.
    # devices w/o link won't get listed here
    if cat "${entry}/duplex" > /dev/null 2>&1; then
      echo "${iface}"
      continue
    fi

    # Skip loopback and "wwan*" devices (3G/4G/LTE modems).
    case "${iface}" in
      lo|wwan*) continue ;;
    esac

    # Skip virtual devices (e.g. lxcbr0 or arcbr0).  Any read error ends up
    # in the captured text, which then differs from "0" as well.
    assign_type="$(cat "${entry}/addr_assign_type" 2>&1)" || true
    [ "${assign_type}" = "0" ] || continue

    # Skip "wlan" devices.
    iw "${iface}" info > /dev/null 2>&1 && continue

    echo "${iface}"
  done
}

# Prints the Ethernet interfaces reported by search_devices() on a single,
# space-separated line.  If the first search finds nothing, the USB hubs are
# rescanned once and the search is repeated; the output may still be empty.
find_ethernet_interfaces() {
  local found=$(search_devices)

  if [ -n "${found}" ]; then
    # Deliberately unquoted: joins the one-per-line list into one line.
    echo ${found}
    return
  fi

  # didn't find any eth devices.
  # Some possible causes are:
  #   crbug.com/452686 RT8153 (USB3-GigE) dongle coming up as USB-storage
  #                    (Fixed: "mist" will reset back to ethernet device)
  #   crbug.com/733425 USB enumeration fails with
  #                    "device not accepting address 2, error -71"
  info_msg "No ethernet found: Rescanning USB hubs"
  rescan_usb_hubs

  # Second and last attempt.
  found=$(search_devices)
  echo ${found}
}


# Pings an address through each Ethernet interface in turn (9 probes each).
# Arguments: $1 - IP address to ping.
# Returns: 0 as soon as ping succeeds on one interface, 1 if it never does
#          (including when there is no interface at all).
do_ping() {
  local ip_addr=$1
  local iface

  for iface in $(find_ethernet_interfaces); do
    if ping -q -I "${iface}" -c 9 "${ip_addr}"; then
      return 0
    fi
  done

  return 1
}

# Prints the remote IPv4 address of the first established SSH connection
# listed by netstat whose peer is not 127.0.0.1.  Prints nothing when there
# is no such connection.
find_ssh_client() {
  # Column 5 of the netstat output is the foreign "address:port".  Only a
  # dotted-quad address is accepted: splitting an IPv6 peer on ":" would
  # yield an empty or truncated token that is useless as a ping target.
  # Only the first acceptable address is printed.
  netstat -lanp | awk '
    /tcp.*:22 .*ESTABLISHED.*/ && !found {
      split($5, a, ":")
      if (a[1] ~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/ && a[1] != "127.0.0.1") {
        print a[1]
        found = 1
      }
    }'
}

# Prints the IPv4 address of one neighbor known on an Ethernet interface.
# Interfaces are tried in order; on each one the first neighbor entry whose
# line mentions REACHABLE, DELAY or STALE is taken.
# Returns: 0 when an address was printed, 1 otherwise.
find_ethernet_neighbor() {
  local iface
  local candidate

  for iface in $(find_ethernet_interfaces); do
    candidate=$(ip -4 neigh show dev "${iface}" \
                | awk '/REACHABLE|DELAY|STALE/ {print $1; exit}')
    if [ -n "${candidate}" ] && echo "${candidate}"; then
      return 0
    fi
  done
  return 1
}

# Checks lab connectivity by pinging, in this order: the default gateway, the
# peer of an established SSH connection (our autotest server), and finally
# any recently seen Ethernet neighbor.  A candidate that cannot be determined
# is skipped; later candidates are only looked up if the earlier ones failed.
# Returns: 0 as soon as one candidate answers, 1 if none does.
ping_controlling_server() {
  local finder
  local target

  for finder in get_default_gateway find_ssh_client find_ethernet_neighbor; do
    # Whatever the finder printed is used, regardless of its exit status.
    target="$("${finder}")" || true
    [ -n "${target}" ] || continue

    # Deliberately unquoted: if several addresses were printed, do_ping
    # only looks at the first one.
    do_ping ${target} && return 0
  done
  return 1
}

# Recovery method: runs reload_network_device (not defined in this file) on
# every Ethernet interface, announcing each one via info_msg.
# Returns: 0 when at least one interface was processed, 1 when none was
#          found.  The status of reload_network_device is not inspected here.
reload_ethernet_drivers() {
  local -a ifaces=( $(find_ethernet_interfaces) )
  local iface

  [ "${#ifaces[@]}" -gt 0 ] || return 1

  for iface in "${ifaces[@]}"; do
    info_msg "Reload driver for interface ${iface}"
    reload_network_device "${iface}"
  done
  return 0
}

# Recovery method: de-authorizes and re-authorizes the USB device behind
# every network interface that has one, which is sort of like unplugging and
# re-plugging the dongle.
# Returns: 0 when at least one device was toggled, 1 otherwise.
toggle_usb_ports() {
  local netdev
  local port_dir
  local status=1

  for netdev in /sys/class/net/*; do
    # The resolved path is expected to look like:
    #    /sys/devices/pci0000:00/0000:00:14.0/usb2/2-2/2-2:1.0/net/eth1
    # Dropping the trailing "/<interface>/net/<name>" leaves the device:
    #    /sys/devices/pci0000:00/0000:00:14.0/usb2/2-2
    port_dir=$(readlink -f "${netdev}") || true
    port_dir=${port_dir%/*/net/*}

    # Only USB devices have "authorized" field in /sys
    [ -w "${port_dir}/authorized" ] || continue

    status=0
    echo 0 > "${port_dir}/authorized"
    sleep 2
    echo 1 > "${port_dir}/authorized"
    sleep 1
  done
  return ${status}
}

# Restarts shill through initctl, unless the shill start lock is currently
# held, in which case the request is logged and ignored.
# Globals: SHILL_START_LOCK_PATH (read).
restart_connection_manager() {
  # Kept local so the holder's name does not leak into the global scope.
  local lock_holder

  # NB: -e will fail on a dangling symlink. That's deliberate. The
  # symlink should point at /proc/<locker's PID>. And if that path is
  # gone, the locker has exited, and the lock is stale.
  if [ -e "${SHILL_START_LOCK_PATH}" ]; then
    # readlink fails if the lock is not a symlink; that must not abort the
    # whole script under "set -e".
    lock_holder=$(readlink "${SHILL_START_LOCK_PATH}") || lock_holder="unknown"
    info_msg "Ignoring restart request; lock held by ${lock_holder}"
    return
  fi
  initctl stop shill || info_msg "Shill was not running."
  initctl start shill
}

# Tells whether a test has asked for the Ethernet check to be paused.
#
# power_SuspendStress tests requires many minutes of network timeout
# tolerance since the SSH connection to the autotest server will be
# disrupted.   *** See http://crbug.com/334951 ***
# Such a test creates and flocks PAUSE_FILE; main() honours the request for
# up to 30 minutes.
#
# Globals: PAUSE_FILE (read; removed when not locked), paused_time (written).
# Returns: 0 when paused, with paused_time set to the number of seconds since
#          PAUSE_FILE's last status change; 1 otherwise, with paused_time 0.
pause_check_ethernet() {
  local current
  local lock_ctime

  paused_time=0

  # Being able to take the exclusive lock without waiting means nobody holds
  # it: there is no pause request and the link may be recovered.
  if flock -xn "${PAUSE_FILE}" -c : ; then
    rm -f "${PAUSE_FILE}"
    return 1
  fi

  current="$(date +%s)" || true
  lock_ctime=$(stat -c%Z "${PAUSE_FILE}") || true

  # Without a timestamp the age of the request cannot be computed.
  [ -n "${lock_ctime}" ] || return 1

  paused_time=$((current - lock_ctime))
  return 0
}

# Entry point of the hook.
#
# For each recovery method in turn (toggle_usb_ports, then
# reload_ethernet_drivers): honour a pause request that is at most 30 minutes
# old, stop if the controlling server can be pinged, otherwise apply the
# method and poll for connectivity for 30 seconds.  When every method has
# failed, the machine is rebooted.
#
# Returns: 0 when paused or when connectivity is (or becomes) available;
#          1 after the reboot has been requested.
main() {
  local recovery_method
  local pause_limit=$((30*60))
  local now
  local began
  local deadline

  for recovery_method in toggle_usb_ports reload_ethernet_drivers; do
    if pause_check_ethernet; then
      # pause_check_ethernet() sets up $paused_time.
      if ! [ "${paused_time}" -gt "${pause_limit}" ]; then
        info_msg "Ethernet Check Pause started ${paused_time} seconds ago."
        return 0
      fi
      critical_msg "Pause request exceeded 30 minutes. Checking lab network link."
      rm -f "${PAUSE_FILE}"
    fi

    # Nothing to do when the controlling autotest server answers over
    # ethernet.
    if ping_controlling_server; then
      return 0
    fi

    critical_msg "Attempting recovery method \"${recovery_method}\""

    # A recovery method reports success when it actually did something that
    # makes re-checking connectivity worthwhile.  Otherwise go straight to
    # the next method.
    "${recovery_method}" || continue

    now="$(date +%s)" || true
    began="${now}"
    deadline=$(( now + 30 ))

    if ! initctl status shill | grep -q running ; then
      restart_connection_manager
    fi

    # Poll the controlling server until the deadline.
    # NB: Our Lab DHCP servers must respond in < 30 seconds.
    while [ "${now}" -lt "${deadline}" ]; do
      if ping_controlling_server; then
        critical_msg "${recovery_method} successful after $(( now - began )) seconds."
        return 0
      fi
      sleep 1
      now=$(date +%s)
    done
  done

  critical_msg "All ethernet recovery methods have failed. Rebooting."
  sync

  # Give powerd a chance to reboot via the standard path (and log messages that
  # are helpful for debugging) before calling 'reboot' directly.
  dbus-send --system --type=method_call --dest=org.chromium.PowerManager \
      /org/chromium/PowerManager org.chromium.PowerManager.RequestRestart \
      int32:2 string:'recover_duts check_ethernet hook failed' &
  sleep 30
  reboot

  return 1
}

# Run the hook; main() takes no arguments of its own.
main "$@"
