Change check_ethernet.hook to try repairing at every failure.

This changes check_ethernet.hook to try restarting the network every
time the network is found to be down, rather than only as a last
resort before trying to reboot.

BUG=chromium:354944
TEST=run script manually with network disconnected

Change-Id: I1c9f08f8e4495a6f196cc8e8f31e68b778ac695e
Reviewed-on: https://chromium-review.googlesource.com/199678
Tested-by: Richard Barnette <jrbarnette@chromium.org>
Reviewed-by: Chris Sosa <sosa@chromium.org>
Commit-Queue: Richard Barnette <jrbarnette@chromium.org>

commit: bb080491bfecbd3a212f832e92af7d41f14e6abd [log] [tgz]
author: J. Richard Barnette <jrbarnette@chromium.org> Tue May 13 18:11:09 2014 -0700
committer: chrome-internal-fetch <chrome-internal-fetch@google.com> Wed May 14 22:53:28 2014 +0000
tree: c3278d0484f35ab2ed964f9d18ceae0a2897a98b
parent: ffc8ee1bed51ff11370ff74efc85d352f595ac4e [diff]
diff --git a/recover_duts/hooks/check_ethernet.hook b/recover_duts/hooks/check_ethernet.hook
index 117bd9e..9a6b3e0 100755
--- a/recover_duts/hooks/check_ethernet.hook
+++ b/recover_duts/hooks/check_ethernet.hook

@@ -20,6 +20,7 @@
 # TODO(tbroch) Relocate this to common hook library if/when there's more than
 # one hook.
 critical_msg() {
+  echo "$@"
   logger -t "$(basename $0)" -- "$@"
 }
 
@@ -70,20 +71,6 @@
 }
 
 # Restart all our ethernet devices and restart shill.
-recover_ethernet_devices() {
-  local eth
-  critical_msg "Gateway still unreachable; restarting ethernet interfaces"
-  for eth in $(find_ethernet_interfaces); do
-    echo "Bounce interface ${eth}"
-    ifconfig ${eth} down
-    ifconfig ${eth} up
-  done
-  initctl stop shill || echo "Shill was not running."
-  initctl start shill
-  sleep 30
-  ifconfig -a
-}
-
 # Return the remote IP address of the first established SSH connection
 find_ssh_client() {
   netstat -lanp | awk '/tcp.*:22.*ESTABLISHED.*/ {split($5,a,":"); print a[1]}'
@@ -103,33 +90,55 @@
   return 1
 }
 
+recover_ethernet_devices() {
+  local eth
+  critical_msg "Gateway unreachable; restarting ethernet interfaces"
+  for eth in $(find_ethernet_interfaces); do
+    echo "Bounce interface ${eth}"
+    ifconfig ${eth} down
+    ifconfig ${eth} up
+  done
+  initctl stop shill || echo "Shill was not running."
+  initctl start shill
+  sleep 30
+  ifconfig -a
+
+  if ping_controlling_server; then
+      critical_msg "Restart successful"
+      return 0
+  fi
+  return 1
+}
+
+TIMEOUT_MINUTES=15
+TIMEOUT=$(( TIMEOUT_MINUTES * 60 ))
+
 main() {
-  # Attempt to ping our controlling autotest server over ethernet. We guarantee
-  # a minimum of 12 minutes network timeout tolerance for tests that disrupt
-  # connectivity with the SSH connection from the autotest server. This timeout
-  # is 15 minutes to make sure it can never fail before that SSH session does.
+  # Attempt to ping our controlling autotest server over ethernet.
+  # We guarantee a minimum of 12 minutes network timeout tolerance
+  # for tests that disrupt connectivity with the SSH connection from
+  # the autotest server.  This timeout is 15 minutes to make sure it
+  # can never fail before that SSH session does.
+
+  local endtime=$(( $(date +%s) + TIMEOUT ))
   if ping_controlling_server; then
     return 0
   fi
-  critical_msg "Gateway unreachable; will retry for 15 minutes"
-  for i in {1..30}; do
+  if recover_ethernet_devices; then
+    return 0
+  fi
+  critical_msg "Restart failed; will retry recovery for ~$TIMEOUT_MINUTES minutes"
+  while [ $(date +%s) -lt $endtime ]; do
     sleep 30
     if ping_controlling_server; then
-      critical_msg "Gateway now answering; returning success"
+      critical_msg "Gateway now reachable; ending recovery loop"
+      return 0
+    fi
+    if recover_ethernet_devices; then
       return 0
     fi
   done
 
-  # We can't reach our controlling server through any ethernet devices.
-  recover_ethernet_devices
-
-  # Attempt to ping again. If successful, return 1 so that way log the fact
-  # that we need to take action to recover the dut.
-  if ping_controlling_server; then
-    return 1
-  fi
-
-  # Last chance - reboot if we can't get any connectivity.
   critical_msg "All efforts to recover ethernet have been exhausted. Rebooting."
   sync
   (sleep 5 && reboot) &
commit	bb080491bfecbd3a212f832e92af7d41f14e6abd	[log] [tgz]
author	J. Richard Barnette <jrbarnette@chromium.org>	Tue May 13 18:11:09 2014 -0700
committer	chrome-internal-fetch <chrome-internal-fetch@google.com>	Wed May 14 22:53:28 2014 +0000
tree	c3278d0484f35ab2ed964f9d18ceae0a2897a98b
parent	ffc8ee1bed51ff11370ff74efc85d352f595ac4e [diff]