#!/bin/sh
# shellcheck disable=SC2039
#
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Abort the hook as soon as any command whose status is not explicitly
# checked fails.
set -e

# Lock consulted by restart_connection_manager(): while this path exists
# (i.e. the symlink still resolves), requests to restart shill are ignored.
SHILL_START_LOCK_PATH="/run/lock/shill-start.lock"
# When this file exists, main() skips the ethernet check entirely.
WIFI_CRED=/usr/local/etc/wifi_cred

# See do_suspend() in src/third_party/autotest/files/client/cros/power/sys_power.py
# File that is opened on fd 9 and flock'ed by try_pause_lock(); a lock held by
# someone else asks pause_check_ethernet() to pause the check.
PAUSE_FILE=/run/autotest_pause_ethernet_hook
# NOTE(review): pause_check_ethernet() declares its own local paused_time, so
# this global looks unused within this file - confirm before removing.
paused_time=0


# Critical messages should be sent to /var/log/messages.  Other messages should
# be sent via echo to be harvested by recover_duts.py.
#
# TODO(tbroch) Relocate this to common hook library if/when there's more than
# one hook.
critical_msg() {
  # All arguments are joined into a single message.  It is written to stderr
  # with a timestamp prefix and also handed to syslog, tagged with the file
  # name of this script.
  local text="$*"

  { echo "$(date --rfc-3339=seconds)  ${text}"; } 1>&2
  logger -t "$(basename "$0")" -- "${text}"
}

info_msg() {
  # Informational messages only go to stderr (with a timestamp prefix);
  # unlike critical_msg nothing is sent to syslog.
  local text="$*"

  echo "$(date --rfc-3339=seconds)  ${text}" 1>&2
}

# Returns the default gateway.
get_default_gateway() {
  # Prints the next-hop address used to reach 1.0.0.0, or nothing when the
  # route has no gateway.
  local ip_route
  ip_route="$(ip route get 1.0.0.0)"
  # Only the first line describes the route.  Look for the "via" keyword
  # instead of assuming a fixed column: an on-link route has no "via" and a
  # fixed third column would then yield the device name instead of an address.
  echo "${ip_route}" | awk '
    NR == 1 {
      for (i = 1; i < NF; i++) {
        if ($i == "via") {
          print $(i + 1)
          break
        }
      }
      exit
    }'
}

rescan_usb_hubs() {
  # Unbinds and re-binds every root hub entry ("<N>-0:1.0") of the USB hub
  # driver so that the bus is enumerated again.
  # $1 - (optional) hub driver directory; defaults to the sysfs location.
  local port
  local portnum
  local usbhub="${1:-/sys/bus/usb/drivers/hub}"

  for port in "${usbhub}"/[0-9]*-0:1.0; do
    # When nothing matches, the pattern is left as-is by the shell.  Skip it
    # rather than writing the literal pattern into unbind/bind.
    [ -e "${port}" ] || continue

    critical_msg "Rescanning  ${port}"
    portnum="$(basename "${port}")"
    echo "${portnum}" > "${usbhub}"/unbind
    sleep 1
    echo "${portnum}" > "${usbhub}"/bind
    # USB needs a bit of time to scan the "hub"
    sleep 3
  done
}

search_devices() {
  # Prints, one per line, the names of the network interfaces that look like
  # wired ethernet.
  local sysfs_entry
  local ifname

  for sysfs_entry in /sys/class/net/*; do
    ifname="${sysfs_entry##*/}"

    # An interface whose "duplex" attribute can be read is accepted right
    # away: the lab interconnect is full-duplex and tells us so, and devices
    # w/o link won't get listed this way.
    if ! cat "${sysfs_entry}/duplex" > /dev/null 2>&1; then
      # "wwan*" devices are 3G/4G/LTE modems; "lo" is skipped as well.
      case "${ifname}" in
        wwan*|lo) continue ;;
      esac

      # Anything whose addr_assign_type is not "0" is treated as a virtual
      # device (e.g. lxcbr0 or arcbr0) and skipped.
      [ "$(cat "${sysfs_entry}/addr_assign_type" 2>&1)" = "0" ] || continue

      # Interfaces that iw can describe are "wlan" devices.
      if iw "${ifname}" info > /dev/null 2>&1; then
        continue
      fi
    fi

    echo "${ifname}"
  done
}

# Shows the list of Ethernet interfaces found on the system.
find_ethernet_interfaces() {
  # Prints the interfaces reported by search_devices.  When none are found the
  # USB hubs are rescanned once and the search is repeated.
  local found

  found="$(search_devices)"
  if [ -n "${found}" ]; then
    echo "${found}"
    return
  fi

  # No eth devices were found.  Some possible causes are:
  #   crbug.com/452686 RT8153 (USB3-GigE) dongle coming up as USB-storage
  #                    (Fixed: "mist" will reset back to ethernet device)
  #   crbug.com/733425 USB enumeration fails with
  #                    "device not accepting address 2, error -71"
  info_msg "No ethernet found: Rescanning USB hubs"
  rescan_usb_hubs

  # Second and last attempt; the result is printed even when it is empty.
  found="$(search_devices)"
  echo "${found}"
}


# Pings the given ipaddress through all ethernet devices
# $1 - IP address to ping.
do_ping() {
  # Succeeds as soon as the address answers on any one ethernet interface;
  # fails when no interface gets a reply (or there is no interface at all).
  local target=$1
  local nic

  for nic in $(find_ethernet_interfaces); do
    if ping -q -I "${nic}" -c 9 "${target}"; then
      return 0
    fi
  done

  return 1
}

# Return the remote IP address of an established SSH connection to this host
# (our autotest server), ignoring connections from the loopback address.
find_ssh_client() {
  # Prints the address of the first non-loopback peer that has an established
  # TCP connection to local port 22, or nothing when there is none.
  #
  # netstat columns: Proto Recv-Q Send-Q Local-Address Foreign-Address State
  netstat -lanp | awk '
    $1 ~ /^tcp/ && $4 ~ /:22$/ && $6 == "ESTABLISHED" {
      peer = $5
      # Drop the trailing ":port"; splitting on the first ":" would return
      # an empty string for IPv4-mapped tcp6 peers such as ::ffff:10.0.0.2.
      sub(/:[0-9]+$/, "", peer)
      sub(/^::ffff:/, "", peer)
      if (peer != "" && peer != "127.0.0.1" && peer != "::1") {
        print peer
        exit
      }
    }'
}

# Return the IP address of a neighbor reachable via ethernet
find_ethernet_neighbor() {
  # Walks the ethernet interfaces and prints the first IPv4 neighbor entry
  # whose line mentions REACHABLE, DELAY or STALE.  Fails when no interface
  # has such an entry.
  local nic
  local candidate

  for nic in $(find_ethernet_interfaces); do
    candidate="$(ip -4 neigh show dev "${nic}" |
                 awk '/REACHABLE|DELAY|STALE/ {print $1; exit}')"
    if [ -n "${candidate}" ] && echo "${candidate}"; then
      return 0
    fi
  done
  return 1
}

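# Ping, over ethernet, the first candidate that answers: the default gateway,
# then a connected SSH client (our autotest server), then a recently seen
# ethernet neighbor.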
ping_controlling_server() {
  # Each finder is only run when the previous candidate did not answer.
  # Order: default gateway, connected SSH client, and as a last attempt any
  # recently seen neighbor.  Succeeds on the first ping that gets through.
  local locate
  local candidate

  for locate in get_default_gateway find_ssh_client find_ethernet_neighbor; do
    candidate="$("${locate}")" || candidate=
    [ -n "${candidate}" ] || continue

    # Left unquoted on purpose: a finder may print several addresses and
    # do_ping only looks at its first argument.
    # shellcheck disable=SC2086
    if do_ping ${candidate}; then
      return 0
    fi
  done
  return 1
}

reload_ethernet_drivers() {
  # Runs reload_network_device for every ethernet interface.  Succeeds only if
  # at least one interface was processed.
  local nic
  local reloaded=

  for nic in $(find_ethernet_interfaces); do
    info_msg "Reload driver for interface ${nic}"
    reload_network_device "${nic}"
    reloaded=yes
  done

  [ -n "${reloaded}" ]
}

toggle_usb_ports() {
  # De-authorizes and re-authorizes the USB device behind every network
  # interface that has one.  Succeeds only if at least one device was toggled.
  local netdev
  local usb_port
  local toggled=1

  for netdev in /sys/class/net/*; do
    # The resolved path is expected to look like:
    #    /sys/devices/pci0000:00/0000:00:14.0/usb2/2-2/2-2:1.0/net/eth1
    # Stripping the trailing "/<interface>/net/<name>" leaves the device:
    #    /sys/devices/pci0000:00/0000:00:14.0/usb2/2-2
    usb_port="$(readlink -f "${netdev}")"
    usb_port="${usb_port%/*/net/*}"

    # Only USB devices have "authorized" field in /sys
    [ -w "${usb_port}/authorized" ] || continue

    toggled=0
    # Sort of like unplugging and plugging the dongle back in.
    echo 0 > "${usb_port}/authorized"
    sleep 2
    echo 1 > "${usb_port}/authorized"
    sleep 1
  done

  return "${toggled}"
}

restart_connection_manager() {
  # Stops and starts shill, unless the shill start lock is currently held.
  # Returns 0 when the restart is skipped because of the lock.
  #
  # Kept local so the holder of the lock does not leak into the global scope.
  local lock_holder

  # NB: -e will fail on a dangling symlink. That's deliberate. The
  # symlink should point at /proc/<locker's PID>. And if that path is
  # gone, the locker has exited, and the lock is stale.
  if [ -e "${SHILL_START_LOCK_PATH}" ]; then
    lock_holder=$(readlink "${SHILL_START_LOCK_PATH}")
    info_msg "Ignoring restart request; lock held by ${lock_holder}"
    return 0
  fi
  # A failing stop just means there was nothing to stop.
  initctl stop shill || info_msg "Shill was not running."
  initctl start shill
}

try_pause_lock() {
  # Opens PAUSE_FILE on fd 9 and tries to take an exclusive lock on it without
  # blocking.  On success fd 9 (and with it the lock) stays open in this
  # shell.  Returns 1 when the file cannot be opened or the lock is taken.
  #
  # Append, to avoid changing the mtime of an existing lock if we don't acquire
  # it.
  #
  # POSIX lets a non-interactive shell exit when a redirection on a special
  # builtin such as "exec" fails, which would skip the error handling below.
  # Probing with the regular builtin "true" first turns an unopenable file
  # into an ordinary failure.
  if ! { true 9>>"${PAUSE_FILE}" && exec 9>>"${PAUSE_FILE}"; }; then
    critical_msg "Failed to open ${PAUSE_FILE}"
    return 1
  fi
  if ! flock -xn 9; then
    critical_msg "Failed to acquire ${PAUSE_FILE}"
    return 1
  fi
  return 0
}

# Clear old locks and try to grab the lock. Does not block, and aborts if we
# fail.
force_pause_lock() {
  # Removes the pause file and immediately tries to lock a fresh one.  The
  # whole script exits with status 1 if that lock cannot be taken.
  critical_msg "Clobbering lock (${PAUSE_FILE})"
  rm -f "${PAUSE_FILE}"
  if ! try_pause_lock; then
    exit 1
  fi
}

# Return 0 if we need to pause (abort). Return non-zero and grab the "pause"
# lock if we can continue.
pause_check_ethernet() {
  # power_SuspendStress tests requires many minutes of network timeout
  # tolerance since the SSH connection to the autotest server will be
  # disrupted.   *** See http://crbug.com/334951 ***
  #
  # Any test that creates and flocks PAUSE_FILE can therefore "pause" the
  # ethernet check, for up to 30 minutes.
  local max_pause=$((30 * 60))
  local current
  local locked_since
  local elapsed

  # If the lock can be taken nobody asked for a pause.  The lock is then held
  # until exit.
  try_pause_lock && return 1

  current="$(date +%s)"
  locked_since="$(stat -c%Z "${PAUSE_FILE}")" || true

  if [ -n "${locked_since}" ]; then
    elapsed=$((current - locked_since))
    if [ "${elapsed}" -le "${max_pause}" ]; then
      info_msg "Ethernet Check Pause started ${elapsed} seconds ago."
      return 0
    fi
    critical_msg "Pause request exceeded 30 minutes. Checking lab network link."
  fi

  # Either the pause is too old or its start time is unknown: take the lock
  # over and carry on with the check.
  force_pause_lock
  return 1
}

main() {
  # Checks ethernet connectivity to the controlling server, tries each
  # recovery method in turn and reboots when none of them helps.
  local method
  local began
  local clock
  local deadline

  # Devices running power autotests may be connected to moblab via WiFi.
  # They have no wired ethernet connection and must not be rebooted for it.
  if [ -f "${WIFI_CRED}" ]; then
    info_msg "${WIFI_CRED} found. No need to check ethernet."
    return 0
  fi

  # A test asked us to stay out of the way for now.
  pause_check_ethernet && return 0

  for method in toggle_usb_ports reload_ethernet_drivers; do
    # Nothing to recover if our controlling autotest server answers over
    # ethernet.
    ping_controlling_server && return 0

    critical_msg "Attempting recovery method \"${method}\""

    # A recovery method only reports success when it actually did something,
    # i.e. when re-checking connectivity is worthwhile.  Otherwise go straight
    # to the next method.
    "${method}" || continue

    began="$(date +%s)"
    clock="${began}"
    deadline=$((began + 30))

    if ! initctl status shill | grep -q running ; then
      restart_connection_manager
    fi

    # Poll the controlling server until the deadline.
    # NB: Our Lab DHCP servers must respond in < 30 seconds.
    while [ "${clock}" -lt "${deadline}" ]; do
      if ping_controlling_server; then
        critical_msg "${method} successful after $((clock - began)) seconds."
        return 0
      fi
      sleep 1
      clock="$(date +%s)"
    done
  done

  critical_msg "All ethernet recovery methods have failed. Rebooting."
  sync

  # powerd gets 30 seconds to reboot via the standard path (and to log
  # messages that are helpful for debugging) before 'reboot' is called
  # directly.
  dbus-send --system --type=method_call --dest=org.chromium.PowerManager \
      /org/chromium/PowerManager org.chromium.PowerManager.RequestRestart \
      int32:2 string:'recover_duts check_ethernet hook failed' &
  sleep 30
  reboot

  return 1
}

# Entry point.  main is the last command of the script, so its return status
# becomes the exit status of the hook.
main
