fw_lab_triage_helper: add arbitrary port support

This change adds support for DUTs that don't follow an old servod port
convention.  Supporting fetching the DUT's port would have been too
cumbersome in bash, so this transitions the utility to golang.  Some
functionality that is no longer important was dropped (grabbing kernel
and labstation logs, checking them for lines with ERROR, and putting
them into a tarball).

BUG=none
TEST=ran the utility and inspected the output

Change-Id: I4be39cef0bad8e9afc12c72a0024f52bc9d1cf61
Reviewed-on: https://chromium-review.googlesource.com/1706393
Tested-by: Kevin Shelton <kmshelton@chromium.org>
Commit-Ready: Kevin Shelton <kmshelton@chromium.org>
Legacy-Commit-Queue: Commit Bot <commit-bot@chromium.org>
Reviewed-by: Greg Edelston <gredelston@google.com>
diff --git a/provingground/firmware/fw_lab_triage_helper b/provingground/firmware/fw_lab_triage_helper
deleted file mode 100755
index 323f2dd..0000000
--- a/provingground/firmware/fw_lab_triage_helper
+++ /dev/null
@@ -1,151 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 The Chromium OS Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-# Script to aid in maintaining AP/EC/GSC Firmware test enivornment health.
-
-# TODO(kmshelton): Obviate in favor of monitoring and alerting.
-
-# TODO(kmshelton): Support hosts that are in the fw lab, but not in a qual
-# enivornment (i.e. hosts for which this static list of servo ports is not a
-# safe assumption).
-SERVO_PORTS="9901 9902 9903 9904"
-
-find_fw_qual_labstations() {
-  # Labstations outside of the high touch lab (chromeos1) are not in the qual
-  # environment.
-  atest host list --label=pool:$1 --parse \
-    | grep chromeos1- \
-    | cut -f 1 -d \| \
-    | sed 's/host.*/labstation/g' \
-    | sed 's/Host=//g' \
-    | sort \
-    | uniq \
-    | xargs
-}
-
-print_long_message() {
-  message="$(sed 's/[[:space:]]\{2,\}/ /' <<< $1)"
-  message="$(tr -d  '\n' <<< $message)"
-  terminal_columns=$(tput cols)
-  message="$(fold -w ${terminal_columns} -s <<< $message)"
-  printf '%s\n\n' "$message"
-}
-
-main() {
-  local pool
-  while getopts 'p:' OPTION; do
-    case $OPTION in
-      p) pool="$OPTARG"
-        ;;
-      ?) printf "Usage: %s [-p value]\n" ${0##*/} >&2
-        exit 2
-        ;;
-      esac
-  done
-  : ${pool:="faft-cr50"}
-
-  local labstations
-  labstations="$(find_fw_qual_labstations ${pool})"
-
-  WARNING_MESSAGE="This script asssumes many things, like that you have atest in
-    the environment in which it is run, that you have cros in your DNS search
-    path, and that you have configured you environment to use the testing_rsa
-    key on lab DUTs.  This is just a convenience script for firmware qual test
-    environment health triage.  Feel free and encouraged to extend and enhance
-    it, but in the long term, it should be obviated by monitoring and alerting."
-  print_long_message "${WARNING_MESSAGE}"
-
-  LABSTATION_SANITY_CHECK_MESSAGE="Checking that the labstations respond to ping
-    first, because if a labstation is down: you are hosed:"
-  print_long_message "${LABSTATION_SANITY_CHECK_MESSAGE}"
-  fping ${labstations}
-  printf "\n\n"
-
-  LABSTATION_CHECK_MESSAGE="Logging labstation's version, uptime, the last 10
-    eventlog lines, and the update_engine's PID:"
-  print_long_message "${LABSTATION_CHECK_MESSAGE}"
-  for station in ${labstations}; do
-    printf "\nChecking $station\n"
-    ssh -o StrictHostKeyChecking=no root@$station "
-      grep guado_labstation-release /etc/lsb-release;
-      printf \"\n\";
-      uptime;
-      printf \"\n\";
-      mosys eventlog list | tail -n 10;
-      printf \"\n\";
-      pgrep --list-full update_engine;
-      printf \"\n\";"
-  done
-
-  SERVOD_CHECK_MESSAGE="Checking for lines with ERROR in the most recent servod
-    log (the last 4 will be on the labstation, only the most recent one is
-    checked here)."
-  print_long_message "${SERVOD_CHECK_MESSAGE}"
-  for station in ${labstations}; do
-    printf "\nChecking $station\n"
-    ssh -o StrictHostKeyChecking=no root@$station \
-      "grep ERROR /var/log/servod_990*.log";
-  done
-
-  SERVO_CHECK_MESSAGE="Logging servo versions (there's 4 per labsation even if
-    there aren't 4 DUTs).  This should excercise the servo consoles.  If you get
-    connection refused, servod has likely stopped.  If you get a timeout waiting
-    for response, the servo has likely wedged (no console response)."
-  printf "\n"
-  print_long_message "${SERVO_CHECK_MESSAGE}"
-
-  for station in ${labstations}; do
-    printf "\nChecking $station\n"
-    for port in ${SERVO_PORTS}; do
-      ssh -o StrictHostKeyChecking=no root@$station "dut-control -p $port \
-        servo_micro_version; dut-control -p $port servo_v4_version;"
-      done;
-  done
-
-  # TODO(kmshelton): Figure out how to check if the latest build was successful
-  # for all models in the environment, since this is the cause of missing
-  # nightlies sometimes.
-
-  printf "\nGathering kernel and servod logs."
-  logs_dir="$(mktemp -d)"
-  cd $logs_dir
-  printf "\nLogs will be stored in $logs_dir\n"
-  for station in ${labstations}; do
-    mkdir $logs_dir/$station
-    printf "Gathering from ${station}.\n"
-    scp -o StrictHostKeyChecking=no root@$station:/var/log/messages \
-      $logs_dir/$station > /dev/null;
-    scp -o StrictHostKeyChecking=no root@$station:/var/log/servo* \
-      $logs_dir/$station > /dev/null;
-  done
-
-  FAILURE_SIG_MESSAGE="Checking for the failure signature (\"did not claim
-    interface\") of b/110045723:"
-  print_long_message "${FAILURE_SIG_MESSAGE}"
-  for station in ${labstations}; do
-    grep -r "did not claim interface" $logs_dir/$station;
-  done
-
-  printf "Tarring kernel and servo logs.\n\n"
-  tarball="$(mktemp)"
-  tar -cvf $tarball . > /dev/null
-  TARBALL_MESSAGE="There will be a tarball at $tarball.  Attach the tarball to
-    any bugs filed."
-  print_long_message "${TARBALL_MESSAGE}"
-  printf "Returning to original directory.\n\n"
-  cd - > /dev/null
-
-  OVERVIEW_MESSAGE="Logging a high level host overview. DUTs should have a
-    status of \"Ready\", unless there is a KI.  If a DUT is stuck in a repair
-    state, try to figure out why.  If you find yourself repeatedly checking
-    something, extend this script with what you are checking."
-  print_long_message "${OVERVIEW_MESSAGE}"
-  atest host list --label=pool:${pool} --parse \
-    | grep chromeos1- \
-    | cut -f 1,2,4,5,6 -d \|
-}
-
-main "$@"
diff --git a/provingground/firmware/fw_lab_triage_helper.go b/provingground/firmware/fw_lab_triage_helper.go
new file mode 100644
index 0000000..677faec
--- /dev/null
+++ b/provingground/firmware/fw_lab_triage_helper.go
@@ -0,0 +1,133 @@
+// Copyright 2019 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package main
+
+import (
+	"fmt"
+	"log"
+	"os/exec"
+	"reflect"
+	"regexp"
+	"strings"
+)
+
+// labstationTelemetryCmds is the shell snippet run on each labstation.  It greps
+// /etc/lsb-release for the guado_labstation release line, then runs uptime, the
+// last 10 lines of `mosys eventlog list`, and `pgrep --list-full update_engine`,
+// with a printf of a newline after each step.
+const labstationTelemetryCmds = "grep guado_labstation-release /etc/lsb-release; printf \"\n\"; " +
+	"uptime; printf \"\n\"; " +
+	"mosys eventlog list | tail -n 10; printf \"\n\"; " +
+	"pgrep --list-full update_engine; printf \"\n\";"
+
+// warningMessage states the tool's environmental assumptions; main prints it first.
+const warningMessage = "This utility assumes many things, like that you have atest in " +
+	"the environment in which it is run, that you have cros in your DNS search path, " +
+	"and that you have configured your environment to use the testing_rsa key on lab " +
+	"DUTs.  This is just a convenience utility for firmware qual test environment " +
+	"health triage.  Feel free and encouraged to extend and enhance it, but in the " +
+	"long term, it should be mostly obviated by monitoring and alerting."
+
+type dut struct { // one DUT: Hostname is supplied by the caller, the rest newDut parses from `atest host stat`
+	Hostname, Port, Labstation, Board, Model, Status, LockStatus, LockReason string // newDut sets these by field name through reflect, which requires exported fields
+}
+
+// newDut looks up hostname with `atest host stat` and returns a dut holding the
+// fields it could parse; fields that do not match are left as empty strings.
+func newDut(hostname string) dut {
+	d := dut{Hostname: hostname}
+	regexMap := map[string]string{
+		"Port":       `servo_port : (?P<Port>.*)`,
+		"Labstation": `servo_host : (?P<Labstation>.*)`,
+		"Board":      `board:(?P<Board>.*)`,
+		"Model":      `model:(?P<Model>.*)`,
+		"Status":     `Status: (?P<Status>.*)`,
+		"LockStatus": `Locked: (?P<LockStatus>.*)`,
+		"LockReason": `Lock Reason: (?P<LockReason>.*)`,
+	}
+
+	out, err := exec.Command("atest", "host", "stat", hostname).Output()
+	if err != nil { // Not fatal: whatever stdout was captured is still parsed below.
+		log.Printf("<atest host stat %s> encountered: %s", hostname, err)
+	}
+
+	for field, re := range regexMap {
+		match := regexp.MustCompile(re).FindStringSubmatch(string(out))
+		if len(match) == 2 {
+			reflect.ValueOf(&d).Elem().FieldByName(field).SetString(match[1])
+		} else if field != "LockReason" { // LockReason can be empty if the DUT is not locked.
+			log.Printf("Skipping %s on %s.  This could be ok if a DUT is only partially through the deployment checklist.", field, hostname)
+		}
+	}
+	return d
+}
+
+// sendSSHCommand runs remoteCmd as root on host through the ssh binary, with
+// StrictHostKeyChecking disabled, and returns the command's stdout and any error.
+func sendSSHCommand(host, remoteCmd string) (outs string, err error) {
+	stdout, err := exec.Command("ssh", "-o", "StrictHostKeyChecking=no", "root@"+host, remoteCmd).Output()
+	return string(stdout), err
+}
+
+// main triages the faft-cr50 pool: DUT summary, labstation telemetry, then servo versions.
+func main() {
+	fmt.Println(warningMessage)
+	log.Print("Gathering DUT info via atest...")
+	// TODO(kmshelton): Support arbitrary pools.  Remember to sanitize for chromeos1 (high touch lab) when
+	// adding arbitrary pool support, as we would not want to do operations like ssh'ing to labstations in
+	// the low touch lab.  With the current hardcoded pool, it is safe to assume no low touch lab devices are operated upon.
+	out, err := exec.Command("atest", "host", "list", "--hostnames-only", "--label=pool:faft-cr50").Output()
+	if err != nil {
+		log.Fatalf("<atest host list> encountered: %s", err)
+	}
+	duts := []dut{}
+	for _, hostname := range strings.Fields(string(out)) {
+		duts = append(duts, newDut(hostname))
+	}
+
+	log.Print("Summarizing DUT info...")
+	// TODO(kmshelton): Use text/tabwriter to make this digestible.
+	for _, d := range duts {
+		fmt.Printf("%+v\n", d)
+	}
+
+	log.Print("Gathering and displaying key telemetry for labstations.")
+	// TODO(kmshelton): Do this without keeping two approximately-identical memos.
+	labstations := []string{}
+	labstationsSeen := make(map[string]bool)
+	for _, d := range duts {
+		if d.Labstation != "" && !labstationsSeen[d.Labstation] {
+			labstations = append(labstations, d.Labstation)
+			labstationsSeen[d.Labstation] = true
+		}
+	}
+	// TODO(kmshelton): Migrate to using x/crypto/ssh (here and elsewhere) and handle network errors.
+	failures := 0 // ssh failures are logged and skipped so the remaining hosts still get triaged
+	for _, labstation := range labstations {
+		fmt.Println("Operating on ", labstation)
+		out, err := sendSSHCommand(labstation, labstationTelemetryCmds)
+		if err != nil {
+			failures++
+			log.Printf("Gathering labstation telemetry encountered: %s when interfacing with %s.  Is the labstation pingable?  Do you have lab ssh credentials setup?", err, labstation)
+		}
+		fmt.Println("\n", out)
+	}
+
+	log.Print("Querying servos for their versions (note this depends on the servo consoles being in a functional state): ")
+	for _, d := range duts {
+		if d.Labstation == "" || d.Port == "" {
+			continue
+		}
+		out, err := sendSSHCommand(d.Labstation, "dut-control -p "+d.Port+" servo_micro_version; dut-control -p "+d.Port+" servo_v4_version;")
+		if err != nil {
+			failures++
+			log.Printf("Querying servos encountered: %s when interfacing with %s.  Is the labstation pingable?  Do you have lab ssh credentials setup?", err, d.Labstation)
+		}
+		fmt.Println(d.Hostname, ": ", out)
+	}
+	if failures > 0 {
+		log.Fatalf("%d ssh command(s) failed; see the log lines above.", failures)
+	}
+}