lab helper script: adds triage helper Add a script to accelerate triage of possible issues with lab devices. BUG=none TEST=confirmed I could extract the generated tarball Change-Id: Ic16bcd5338f378fef04203e05086ad2f5e7e2c38 Reviewed-on: https://chromium-review.googlesource.com/1140791 Commit-Ready: Kevin Shelton <kmshelton@chromium.org> Tested-by: Kevin Shelton <kmshelton@chromium.org> Reviewed-by: Mary Ruthven <mruthven@chromium.org>

commit: 4397d97f956a83da75c0259d4c2c4ad444eaa2c7 [log] [tgz]
author: Kevin Shelton <kmshelton@chromium.org> Tue Jul 17 11:45:30 2018 -0700
committer: chrome-bot <chrome-bot@chromium.org> Wed Jul 18 01:40:02 2018 -0700
tree: e9b4a19f4c922a37ce609c5a76fb116a44a7d9e7
parent: 0e9ed4f26f5c035f2d0116460ad3131601a45c17 [diff]
diff --git a/provingground/firmware/fw_lab_triage_helper.sh b/provingground/firmware/fw_lab_triage_helper.sh
new file mode 100755
index 0000000..a41fbcc
--- /dev/null
+++ b/provingground/firmware/fw_lab_triage_helper.sh

@@ -0,0 +1,97 @@
+#!/bin/bash
+#
+# Copyright 2018 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# TODO(someone): Obviate in favor of monitoring and alerting.
+
+# Labstations to triage, as one space-separated string. The loops in this
+# script expand it unquoted on purpose so every hostname becomes its own word.
+# Marked readonly because nothing in the script is meant to reassign it.
+readonly GSC_LABSTATIONS="chromeos1-row2-rack1-labstation \
+  chromeos1-row2-rack10-labstation chromeos1-row1-rack5-labstation"
+# Space-separated ports passed to "dut-control -p" on each labstation.
+readonly SERVO_PORTS="9901 9902 9903 9904"
+
+# Spell out what the environment must provide before any work is attempted.
+printf '%s\n' \
+  "This script asssumes many things, like that you have atest in the \
+  environment that you run it, that you have cros in your DNS search path, and \
+  that you have added the testing_rsa key to your ssh agent.  This is just a \
+  conveience script to get lab daily analysis off the ground, feel free and \
+  encouraged to extend and enhance it, but longer term it should be mostly \
+  obviated by monitoring and alerting."
+
+# TODO(kmshelton): Add an option to gather data on faft-test pool devices or
+# faft-cr50 devices instead of hardcoding for faft-cr50 pool devices.
+
+# Ping every labstation up front, before any of the ssh-based checks run.
+printf '\n%s\n' \
+  "Checking that the labstations respond to ping first, because if a \
+  labstation is down: you are hosed."
+# Left unquoted on purpose: the list has to split into one argument per host.
+fping $GSC_LABSTATIONS
+
+echo -e "\n\nLogging labstation's version, uptime, the last 10 eventlog lines, \
+  and servod process state"
+for STATION in $GSC_LABSTATIONS;
+    do echo -e "\n\nsshing to $STATION\n"
+    ssh -o StrictHostKeyChecking=no root@$STATION "grep \
+      guado_labstation-release /etc/lsb-release; echo -e \"\"; uptime; \
+      echo -e \"\"; mosys eventlog list | tail -n 10; echo -e \"\";"
+done
+
+echo -e "\n\nChecking for lines with ERROR in the most recent servod log (the \
+  last 4 will be on the labstation, only the most recent one is checked here). \
+  Errors that appear to match the signature of the KI from b/110796670 are \
+  ignored (it is safe to ignore)."
+for STATION in $GSC_LABSTATIONS;
+    do echo -e "\nchecking $STATION"
+    ssh -o StrictHostKeyChecking=no root@$STATION "grep ERROR \
+      /var/log/servod_990*.log | grep -v 8001";
+done
+
+
+echo -e "\n\nLogging servo versions (there's 4 per labsation even if there \
+  aren't 4 DUTs). This should excercise the servo consoles.  If you get \
+  connection refused, servod has likely stopped.  If you get a timeout waiting \
+  for response, the servo has likely crashed (no console response)."
+for STATION in $GSC_LABSTATIONS;
+    do echo -e "\nchecking $STATION"
+    for PORT in $SERVO_PORTS;
+        do ssh -o StrictHostKeyChecking=no root@$STATION "dut-control -p $PORT \
+          servo_micro_version; dut-control -p $PORT servo_v4_version;"
+    done;
+done
+
+# TODO(kmshelton): Figure out how to check if the latest build was successful
+# for all models in the environment, since this is the cause of missing
+# nightlies sometimes.
+
+echo -e "\nGathering kernel and servod logs."
+LOGS_DIR="$(mktemp -d)"
+cd $LOGS_DIR
+echo -e "\nLogs will be stored in $LOGS_DIR"
+for STATION in $GSC_LABSTATIONS;
+    do mkdir $LOGS_DIR/$STATION
+    echo "checking $STATION"
+    scp -o StrictHostKeyChecking=no root@$STATION:/var/log/messages \
+      $LOGS_DIR/$STATION;
+    scp -o StrictHostKeyChecking=no root@$STATION:/var/log/servo* \
+      $LOGS_DIR/$STATION;
+done
+
+echo -e "\n\nChecking for the failure signature ("did not claim interface") of \
+  b/110045723:"
+for STATION in $GSC_LABSTATIONS;
+    do grep -r "did not claim interface" $LOGS_DIR/$STATION;
+done
+
+
+echo -e "\n\nTarring kernel and servo logs."
+TARBALL="$(mktemp)"
+tar -cvf $TARBALL .
+echo -e "\n\n There will be a tarball at $TARBALL.  Attach the tarball to any \
+  bugs filed."
+
+echo -e "\n\nReturning to original directory"
+cd -
+
+# Print the atest host list for the faft-cr50 pool, keeping fields 1, 2, 4, 5
+# and 6 of the '|'-separated --parse output.
+# The quotes around Ready are escaped so they are printed; unescaped, they
+# ended the string early and were dropped from the message.
+echo -e "\n\nLogging high level host overview. DUTs should have a status of \
+  \"Ready\", unless there is a KI.  If a DUT is stuck in a repair state, try to \
+  figure out why.  If you find yourself repeatedly checking something, extend \
+  this script with what you are checking.\n"
+
+atest host list --label=pool:faft-cr50 --parse | cut -d '|' -f 1,2,4,5,6
commit	4397d97f956a83da75c0259d4c2c4ad444eaa2c7	[log] [tgz]
author	Kevin Shelton <kmshelton@chromium.org>	Tue Jul 17 11:45:30 2018 -0700
committer	chrome-bot <chrome-bot@chromium.org>	Wed Jul 18 01:40:02 2018 -0700
tree	e9b4a19f4c922a37ce609c5a76fb116a44a7d9e7
parent	0e9ed4f26f5c035f2d0116460ad3131601a45c17 [diff]