fw_lab_triage_helper: apply shell style guide
Applied the following aspects of the style guide:
Don't use a file extension (not mandatory, but strongly preferred).
Start the file with a description of contents.
Put "; do" and "; then" on the same line as the while/for/if.
Use the parameter expansion assignment syntax for setting a default value.
Have separate declaration and assignment statements when the assignment
value is provided by a command substitution.
Use embedded newlines in long strings.
Only use caps for constant variable names.
Use a main function (since there is at least one other function).
Also misc cleanups.
BUG=None
TEST=ran script against the faft-cr50 pool and faft-test pool
Change-Id: I62a78a473d9d4d9c0dc5fde7463e0e3ef8f81045
Reviewed-on: https://chromium-review.googlesource.com/1570306
Commit-Ready: ChromeOS CL Exonerator Bot <chromiumos-cl-exonerator@appspot.gserviceaccount.com>
Tested-by: Kevin Shelton <kmshelton@chromium.org>
Reviewed-by: Mary Ruthven <mruthven@chromium.org>
diff --git a/provingground/firmware/fw_lab_triage_helper b/provingground/firmware/fw_lab_triage_helper
new file mode 100755
index 0000000..323f2dd
--- /dev/null
+++ b/provingground/firmware/fw_lab_triage_helper
@@ -0,0 +1,151 @@
+#!/bin/bash
+
+# Copyright 2019 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# Script to aid in maintaining AP/EC/GSC Firmware test environment health.
+
+# TODO(kmshelton): Obviate in favor of monitoring and alerting.
+
+# TODO(kmshelton): Support hosts that are in the fw lab, but not in a qual
+# environment (i.e. hosts for which this static list of servo ports is not a
+# safe assumption).
+SERVO_PORTS="9901 9902 9903 9904"
+
+find_fw_qual_labstations() {
+ # Labstations outside of the high touch lab (chromeos1) are not in the qual
+ # environment.
+ atest host list --label=pool:$1 --parse \
+ | grep chromeos1- \
+ | cut -f 1 -d \| \
+ | sed 's/host.*/labstation/g' \
+ | sed 's/Host=//g' \
+ | sort \
+ | uniq \
+ | xargs
+}
+
+print_long_message() {
+ message="$(sed 's/[[:space:]]\{2,\}/ /' <<< $1)"
+ message="$(tr -d '\n' <<< $message)"
+ terminal_columns=$(tput cols)
+ message="$(fold -w ${terminal_columns} -s <<< $message)"
+ printf '%s\n\n' "$message"
+}
+
+main() {
+ local pool
+ while getopts 'p:' OPTION; do
+ case $OPTION in
+ p) pool="$OPTARG"
+ ;;
+ ?) printf "Usage: %s [-p value]\n" ${0##*/} >&2
+ exit 2
+ ;;
+ esac
+ done
+ : ${pool:="faft-cr50"}
+
+ local labstations
+ labstations="$(find_fw_qual_labstations ${pool})"
+
+ WARNING_MESSAGE="This script asssumes many things, like that you have atest in
+ the environment in which it is run, that you have cros in your DNS search
+ path, and that you have configured you environment to use the testing_rsa
+ key on lab DUTs. This is just a convenience script for firmware qual test
+ environment health triage. Feel free and encouraged to extend and enhance
+ it, but in the long term, it should be obviated by monitoring and alerting."
+ print_long_message "${WARNING_MESSAGE}"
+
+ LABSTATION_SANITY_CHECK_MESSAGE="Checking that the labstations respond to ping
+ first, because if a labstation is down: you are hosed:"
+ print_long_message "${LABSTATION_SANITY_CHECK_MESSAGE}"
+ fping ${labstations}
+ printf "\n\n"
+
+ LABSTATION_CHECK_MESSAGE="Logging labstation's version, uptime, the last 10
+ eventlog lines, and the update_engine's PID:"
+ print_long_message "${LABSTATION_CHECK_MESSAGE}"
+ for station in ${labstations}; do
+ printf "\nChecking $station\n"
+ ssh -o StrictHostKeyChecking=no root@$station "
+ grep guado_labstation-release /etc/lsb-release;
+ printf \"\n\";
+ uptime;
+ printf \"\n\";
+ mosys eventlog list | tail -n 10;
+ printf \"\n\";
+ pgrep --list-full update_engine;
+ printf \"\n\";"
+ done
+
+ SERVOD_CHECK_MESSAGE="Checking for lines with ERROR in the most recent servod
+ log (the last 4 will be on the labstation, only the most recent one is
+ checked here)."
+ print_long_message "${SERVOD_CHECK_MESSAGE}"
+ for station in ${labstations}; do
+ printf "\nChecking $station\n"
+ ssh -o StrictHostKeyChecking=no root@$station \
+ "grep ERROR /var/log/servod_990*.log";
+ done
+
+ SERVO_CHECK_MESSAGE="Logging servo versions (there's 4 per labsation even if
+ there aren't 4 DUTs). This should excercise the servo consoles. If you get
+ connection refused, servod has likely stopped. If you get a timeout waiting
+ for response, the servo has likely wedged (no console response)."
+ printf "\n"
+ print_long_message "${SERVO_CHECK_MESSAGE}"
+
+ for station in ${labstations}; do
+ printf "\nChecking $station\n"
+ for port in ${SERVO_PORTS}; do
+ ssh -o StrictHostKeyChecking=no root@$station "dut-control -p $port \
+ servo_micro_version; dut-control -p $port servo_v4_version;"
+ done;
+ done
+
+ # TODO(kmshelton): Figure out how to check if the latest build was successful
+ # for all models in the environment, since this is the cause of missing
+ # nightlies sometimes.
+
+ printf "\nGathering kernel and servod logs."
+ logs_dir="$(mktemp -d)"
+ cd $logs_dir
+ printf "\nLogs will be stored in $logs_dir\n"
+ for station in ${labstations}; do
+ mkdir $logs_dir/$station
+ printf "Gathering from ${station}.\n"
+ scp -o StrictHostKeyChecking=no root@$station:/var/log/messages \
+ $logs_dir/$station > /dev/null;
+ scp -o StrictHostKeyChecking=no root@$station:/var/log/servo* \
+ $logs_dir/$station > /dev/null;
+ done
+
+ FAILURE_SIG_MESSAGE="Checking for the failure signature (\"did not claim
+ interface\") of b/110045723:"
+ print_long_message "${FAILURE_SIG_MESSAGE}"
+ for station in ${labstations}; do
+ grep -r "did not claim interface" $logs_dir/$station;
+ done
+
+ printf "Tarring kernel and servo logs.\n\n"
+ tarball="$(mktemp)"
+ tar -cvf $tarball . > /dev/null
+ TARBALL_MESSAGE="There will be a tarball at $tarball. Attach the tarball to
+ any bugs filed."
+ print_long_message "${TARBALL_MESSAGE}"
+ printf "Returning to original directory.\n\n"
+ cd - > /dev/null
+
+ OVERVIEW_MESSAGE="Logging a high level host overview. DUTs should have a
+ status of \"Ready\", unless there is a KI. If a DUT is stuck in a repair
+ state, try to figure out why. If you find yourself repeatedly checking
+ something, extend this script with what you are checking."
+ print_long_message "${OVERVIEW_MESSAGE}"
+ atest host list --label=pool:${pool} --parse \
+ | grep chromeos1- \
+ | cut -f 1,2,4,5,6 -d \|
+}
+
+main "$@"
diff --git a/provingground/firmware/fw_lab_triage_helper.sh b/provingground/firmware/fw_lab_triage_helper.sh
deleted file mode 100755
index a137cb0..0000000
--- a/provingground/firmware/fw_lab_triage_helper.sh
+++ /dev/null
@@ -1,122 +0,0 @@
-#!/bin/bash
-#
-# Copyright 2019 The Chromium OS Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-# TODO(someone): Obviate in favor of monitoring and alerting.
-
-SERVO_PORTS="9901 9902 9903 9904"
-
-# Labstations outside of the high touch lab (chromeos1) are excluded.
-find_labstations() {
- atest host list --label=pool:${POOL} --parse \
- | grep chromeos1- \
- | cut -f 1 -d \| \
- | sed 's/host.*/labstation/g' \
- | sed 's/Host=//g' \
- | sort \
- | uniq \
- | xargs
-}
-
-POOL=faft-cr50
-while getopts 'p:' OPTION
-do
- case $OPTION in
- p) POOL="$OPTARG"
- ;;
- ?) printf "Usage: %s [-p value]\n" ${0##*/} >&2
- exit 2
- ;;
- esac
-done
-
-echo "This script asssumes many things, like that you have atest in the \
- environment that you run it, that you have cros in your DNS search path, and \
- that you have added the testing_rsa key to your ssh agent. This is just a \
- conveience script to get lab daily analysis off the ground, feel free and \
- encouraged to extend and enhance it, but longer term it should be mostly \
- obviated by monitoring and alerting."
-
-echo -e "\nChecking that the labstations respond to ping first, because if a \
- labstation is down: you are hosed."
-LABSTATIONS="$(find_labstations)"
-echo $LABSTATIONS
-fping $LABSTATIONS
-
-echo -e "\n\nLogging labstation's version, uptime, the last 10 eventlog lines, \
- and the update_engine's PID:"
-for STATION in $LABSTATIONS;
- do echo -e "\n\nsshing to $STATION\n"
- ssh -o StrictHostKeyChecking=no root@$STATION "
- grep guado_labstation-release /etc/lsb-release;
- echo;
- uptime;
- echo;
- mosys eventlog list | tail -n 10;
- echo;
- pgrep --list-full update_engine;"
-done
-
-echo -e "\n\nChecking for lines with ERROR in the most recent servod log (the \
- last 4 will be on the labstation, only the most recent one is checked here)."
-for STATION in $GSC_LABSTATIONS;
- do echo -e "\nchecking $STATION"
- ssh -o StrictHostKeyChecking=no root@$STATION \
- "grep ERROR /var/log/servod_990*.log";
-done
-
-
-echo -e "\n\nLogging servo versions (there's 4 per labsation even if there \
- aren't 4 DUTs). This should excercise the servo consoles. If you get \
- connection refused, servod has likely stopped. If you get a timeout waiting \
- for response, the servo has likely crashed (no console response)."
-for STATION in $LABSTATIONS;
- do echo -e "\nchecking $STATION"
- for PORT in $SERVO_PORTS;
- do ssh -o StrictHostKeyChecking=no root@$STATION "dut-control -p $PORT \
- servo_micro_version; dut-control -p $PORT servo_v4_version;"
- done;
-done
-
-# TODO(kmshelton): Figure out how to check if the latest build was successful
-# for all models in the environment, since this is the cause of missing
-# nightlies sometimes.
-
-echo -e "\nGathering kernel and servod logs."
-LOGS_DIR="$(mktemp -d)"
-cd $LOGS_DIR
-echo -e "\nLogs will be stored in $LOGS_DIR"
-for STATION in $LABSTATIONS;
- do mkdir $LOGS_DIR/$STATION
- echo "checking $STATION"
- scp -o StrictHostKeyChecking=no root@$STATION:/var/log/messages \
- $LOGS_DIR/$STATION;
- scp -o StrictHostKeyChecking=no root@$STATION:/var/log/servo* \
- $LOGS_DIR/$STATION;
-done
-
-echo -e "\n\nChecking for the failure signature ("did not claim interface") of \
- b/110045723:"
-for STATION in $LABSTATIONS;
- do grep -r "did not claim interface" $LOGS_DIR/$STATION;
-done
-
-
-echo -e "\n\nTarring kernel and servo logs."
-TARBALL="$(mktemp)"
-tar -cvf $TARBALL .
-echo -e "\n\n There will be a tarball at $TARBALL. Attach the tarball to any \
- bugs filed."
-
-echo -e "\n\nReturning to original directory"
-cd -
-
-echo -e "\n\nLogging high level host overview. DUTs should have a status of \
- "Ready", unless there is a KI. If a DUT is stuck in a repair state, try to \
- figure out why. If you find yourself repeatedly checking something, extend \
- this script with what you are checking.\n"
-
-atest host list --label=pool:${POOL} --parse \
-| grep chromeos1- | cut -f 1,2,4,5,6 -d \|