provingground/firmware/fw_lab_triage_helper.sh - mirrors/cros/chromiumos/platform/crostestutils - Git at Google

 #!/bin/bash
 #
 # Copyright 2019 The Chromium OS Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 # TODO(someone): Obviate in favor of monitoring and alerting.

 # Ports handed to `dut-control -p` for every labstation.
 SERVO_PORTS='9901 9902 9903 9904'

 #######################################
 # Lists the labstations that serve the hosts of the selected pool.
 # Labstations outside of the high touch lab (chromeos1) are excluded.
 # The labstation name is the host name with its "host..." suffix replaced
 # by "labstation".
 # Globals:   POOL (read)
 # Arguments: none
 # Outputs:   unique, sorted labstation names on one space-separated line
 #######################################
 find_labstations() {
   # POOL is quoted so a value with whitespace stays a single atest argument.
   atest host list --label="pool:${POOL}" --parse \
     | grep chromeos1- \
     | cut -f 1 -d '|' \
     | sed -e 's/host.*/labstation/g' -e 's/Host=//g' \
     | sort -u \
     | xargs
 }

 # Pool whose hosts get triaged; override with -p.
 POOL=faft-cr50
 while getopts 'p:' OPTION; do
   case "$OPTION" in
     p)
       POOL="$OPTARG"
       ;;
     *)
       # getopts reports unknown flags and missing arguments as '?'.
       # The script name is quoted so it is printed intact.
       printf 'Usage: %s [-p value]\n' "${0##*/}" >&2
       exit 2
       ;;
   esac
 done

 # Up-front caveats for whoever runs this by hand.
 echo "This script assumes many things, like that you have atest in the" \
   "environment that you run it, that you have cros in your DNS search path," \
   "and that you have added the testing_rsa key to your ssh agent.  This is" \
   "just a convenience script to get lab daily analysis off the ground, feel" \
   "free and encouraged to extend and enhance it, but longer term it should" \
   "be mostly obviated by monitoring and alerting."

 echo -e "\nChecking that the labstations respond to ping first, because if a" \
   "labstation is down: you are hosed."
 LABSTATIONS="$(find_labstations)"
 echo "$LABSTATIONS"
 if [[ -n "$LABSTATIONS" ]]; then
   # Word splitting is intentional: one fping target per labstation.
   # shellcheck disable=SC2086
   fping $LABSTATIONS
 else
   # With no targets fping falls back to reading stdin, which would stall
   # the script, so the ping check is skipped instead.
   echo "No labstations found for pool ${POOL}; skipping the ping check." >&2
 fi

 # Per-labstation snapshot: release line, uptime, the tail of the eventlog
 # and the update_engine process listing.
 printf '\n\n%s\n' \
   "Logging labstation's version, uptime, the last 10 eventlog lines,    and the update_engine's PID:"
 for STATION in $LABSTATIONS; do
   printf '\n\nsshing to %s\n\n' "$STATION"
   # Everything inside the quotes is executed on the labstation.
   ssh -o StrictHostKeyChecking=no root@$STATION "
       grep guado_labstation-release /etc/lsb-release;
       echo;
       uptime;
       echo;
       mosys eventlog list | tail -n 10;
       echo;
       pgrep --list-full update_engine;"
 done

 # Scan the servod logs on each labstation for ERROR lines.
 echo -e "\n\nChecking for lines with ERROR in the most recent servod log (the" \
   "last 4 will be on the labstation, only the most recent one is checked here)."
 # GSC_LABSTATIONS is not assigned anywhere in this script, so looping over it
 # alone silently skipped this check.  Fall back to LABSTATIONS while still
 # honouring GSC_LABSTATIONS if the caller's environment provides it.
 for STATION in ${GSC_LABSTATIONS:-$LABSTATIONS}; do
   echo -e "\nchecking $STATION"
   # The wildcard stays quoted so it is expanded on the labstation.
   ssh -o StrictHostKeyChecking=no "root@${STATION}" \
     "grep ERROR /var/log/servod_990*.log"
 done


 # Query both servo version controls on every servod port of every labstation.
 echo -e "\n\nLogging servo versions (there's 4 per labstation even if there" \
   "aren't 4 DUTs). This should exercise the servo consoles.  If you get" \
   "connection refused, servod has likely stopped.  If you get a timeout" \
   "waiting for response, the servo has likely crashed (no console response)."
 for STATION in $LABSTATIONS; do
   echo -e "\nchecking $STATION"
   for PORT in $SERVO_PORTS; do
     # ssh joins the two quoted pieces with a space into one remote command.
     ssh -o StrictHostKeyChecking=no "root@${STATION}" \
       "dut-control -p ${PORT} servo_micro_version;" \
       "dut-control -p ${PORT} servo_v4_version;"
   done
 done

 # TODO(kmshelton): Figure out how to check if the latest build was successful
 # for all models in the environment, since this is the cause of missing
 # nightlies sometimes.

 echo -e "\nGathering kernel and servod logs."
 # Stop if the scratch directory is unusable: with an empty LOGS_DIR a bare
 # `cd` would land in $HOME and the later `tar .` would archive the wrong tree.
 LOGS_DIR="$(mktemp -d)" || exit 1
 cd "$LOGS_DIR" || exit 1
 echo -e "\nLogs will be stored in $LOGS_DIR"
 for STATION in $LABSTATIONS; do
   mkdir "${LOGS_DIR}/${STATION}"
   echo "checking $STATION"
   scp -o StrictHostKeyChecking=no "root@${STATION}:/var/log/messages" \
     "${LOGS_DIR}/${STATION}"
   # Quoted so the wildcard is expanded by the labstation, not locally.
   scp -o StrictHostKeyChecking=no "root@${STATION}:/var/log/servo*" \
     "${LOGS_DIR}/${STATION}"
 done

 # The inner quotes are escaped so they are printed instead of terminating
 # the string early.
 echo -e "\n\nChecking for the failure signature (\"did not claim interface\")" \
   "of b/110045723:"
 for STATION in $LABSTATIONS; do
   grep -r "did not claim interface" "${LOGS_DIR}/${STATION}"
 done


 echo -e "\n\nTarring kernel and servo logs."
 TARBALL="$(mktemp)"
 # Archives the current directory.  The tarball is only advertised when
 # mktemp and tar both succeeded.
 if [[ -n "$TARBALL" ]] && tar -cvf "$TARBALL" .; then
   echo -e "\n\n There will be a tarball at $TARBALL.  Attach the tarball to" \
     "any bugs filed."
 else
   echo "Failed to create the log tarball." >&2
 fi

 echo -e "\n\nReturning to original directory"
 # Best effort: the remaining overview does not depend on the directory.
 cd - || echo "Could not return to the original directory." >&2

 # The quotes around Ready are escaped so they show up in the output.
 echo -e "\n\nLogging high level host overview. DUTs should have a status of" \
   "\"Ready\", unless there is a KI.  If a DUT is stuck in a repair state, try" \
   "to figure out why.  If you find yourself repeatedly checking something," \
   "extend this script with what you are checking.\n"

 # Print fields 1, 2, 4, 5 and 6 of each chromeos1 host line.
 atest host list --label="pool:${POOL}" --parse \
   | grep chromeos1- \
   | cut -f 1,2,4,5,6 -d '|'
	#!/bin/bash
	#
	# Copyright 2019 The Chromium OS Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	# TODO(someone): Obviate in favor of monitoring and alerting.

	# Ports handed to `dut-control -p` for every labstation.
	SERVO_PORTS='9901 9902 9903 9904'

	#######################################
	# Lists the labstations that serve the hosts of the selected pool.
	# Labstations outside of the high touch lab (chromeos1) are excluded.
	# The labstation name is the host name with its "host..." suffix replaced
	# by "labstation".
	# Globals:   POOL (read)
	# Arguments: none
	# Outputs:   unique, sorted labstation names on one space-separated line
	#######################################
	find_labstations() {
	  # The pipes must be real (unescaped) pipes; escaped ones are handed to
	  # atest as literal "|" arguments.  POOL is quoted so a value with
	  # whitespace stays a single argument.
	  atest host list --label="pool:${POOL}" --parse \
	    | grep chromeos1- \
	    | cut -f 1 -d '|' \
	    | sed -e 's/host.*/labstation/g' -e 's/Host=//g' \
	    | sort -u \
	    | xargs
	}

	# Pool whose hosts get triaged; override with -p.
	POOL=faft-cr50
	while getopts 'p:' OPTION; do
	  case "$OPTION" in
	    p)
	      POOL="$OPTARG"
	      ;;
	    *)
	      # getopts reports unknown flags and missing arguments as '?'.
	      # The script name is quoted so it is printed intact.
	      printf 'Usage: %s [-p value]\n' "${0##*/}" >&2
	      exit 2
	      ;;
	  esac
	done

	# Up-front caveats for whoever runs this by hand.
	echo "This script assumes many things, like that you have atest in the" \
	  "environment that you run it, that you have cros in your DNS search path," \
	  "and that you have added the testing_rsa key to your ssh agent. This is" \
	  "just a convenience script to get lab daily analysis off the ground, feel" \
	  "free and encouraged to extend and enhance it, but longer term it should" \
	  "be mostly obviated by monitoring and alerting."

	echo -e "\nChecking that the labstations respond to ping first, because if a" \
	  "labstation is down: you are hosed."
	LABSTATIONS="$(find_labstations)"
	echo "$LABSTATIONS"
	if [[ -n "$LABSTATIONS" ]]; then
	  # Word splitting is intentional: one fping target per labstation.
	  # shellcheck disable=SC2086
	  fping $LABSTATIONS
	else
	  # With no targets fping falls back to reading stdin, which would stall
	  # the script, so the ping check is skipped instead.
	  echo "No labstations found for pool ${POOL}; skipping the ping check." >&2
	fi

	# Per-labstation snapshot: release line, uptime, the tail of the eventlog
	# and the update_engine process listing.
	echo -e "\n\nLogging labstation's version, uptime, the last 10 eventlog lines," \
	  "and the update_engine's PID:"
	for STATION in $LABSTATIONS; do
	  echo -e "\n\nsshing to $STATION\n"
	  # Everything inside the quotes runs on the labstation.  The pipe must be
	  # unescaped there, otherwise mosys receives a literal "|" argument and
	  # no pipeline into tail is built.
	  ssh -o StrictHostKeyChecking=no "root@${STATION}" "
	    grep guado_labstation-release /etc/lsb-release;
	    echo;
	    uptime;
	    echo;
	    mosys eventlog list | tail -n 10;
	    echo;
	    pgrep --list-full update_engine;"
	done

	# Scan the servod logs on each labstation for ERROR lines.
	echo -e "\n\nChecking for lines with ERROR in the most recent servod log (the" \
	  "last 4 will be on the labstation, only the most recent one is checked here)."
	# GSC_LABSTATIONS is not assigned anywhere in this script, so looping over it
	# alone silently skipped this check.  Fall back to LABSTATIONS while still
	# honouring GSC_LABSTATIONS if the caller's environment provides it.
	for STATION in ${GSC_LABSTATIONS:-$LABSTATIONS}; do
	  echo -e "\nchecking $STATION"
	  # The wildcard stays quoted so it is expanded on the labstation.
	  ssh -o StrictHostKeyChecking=no "root@${STATION}" \
	    "grep ERROR /var/log/servod_990*.log"
	done


	# Query both servo version controls on every servod port of every labstation.
	echo -e "\n\nLogging servo versions (there's 4 per labstation even if there" \
	  "aren't 4 DUTs). This should exercise the servo consoles. If you get" \
	  "connection refused, servod has likely stopped. If you get a timeout" \
	  "waiting for response, the servo has likely crashed (no console response)."
	for STATION in $LABSTATIONS; do
	  echo -e "\nchecking $STATION"
	  for PORT in $SERVO_PORTS; do
	    # ssh joins the two quoted pieces with a space into one remote command.
	    ssh -o StrictHostKeyChecking=no "root@${STATION}" \
	      "dut-control -p ${PORT} servo_micro_version;" \
	      "dut-control -p ${PORT} servo_v4_version;"
	  done
	done

	# TODO(kmshelton): Figure out how to check if the latest build was successful
	# for all models in the environment, since this is the cause of missing
	# nightlies sometimes.

	echo -e "\nGathering kernel and servod logs."
	# Stop if the scratch directory is unusable: with an empty LOGS_DIR a bare
	# `cd` would land in $HOME and the later `tar .` would archive the wrong tree.
	LOGS_DIR="$(mktemp -d)" || exit 1
	cd "$LOGS_DIR" || exit 1
	echo -e "\nLogs will be stored in $LOGS_DIR"
	for STATION in $LABSTATIONS; do
	  mkdir "${LOGS_DIR}/${STATION}"
	  echo "checking $STATION"
	  scp -o StrictHostKeyChecking=no "root@${STATION}:/var/log/messages" \
	    "${LOGS_DIR}/${STATION}"
	  # Quoted so the wildcard is expanded by the labstation, not locally.
	  scp -o StrictHostKeyChecking=no "root@${STATION}:/var/log/servo*" \
	    "${LOGS_DIR}/${STATION}"
	done

	# The inner quotes are escaped so they are printed instead of terminating
	# the string early.
	echo -e "\n\nChecking for the failure signature (\"did not claim interface\")" \
	  "of b/110045723:"
	for STATION in $LABSTATIONS; do
	  grep -r "did not claim interface" "${LOGS_DIR}/${STATION}"
	done


	echo -e "\n\nTarring kernel and servo logs."
	TARBALL="$(mktemp)"
	# Archives the current directory.  The tarball is only advertised when
	# mktemp and tar both succeeded.
	if [[ -n "$TARBALL" ]] && tar -cvf "$TARBALL" .; then
	  echo -e "\n\n There will be a tarball at $TARBALL. Attach the tarball to" \
	    "any bugs filed."
	else
	  echo "Failed to create the log tarball." >&2
	fi

	echo -e "\n\nReturning to original directory"
	# Best effort: the remaining overview does not depend on the directory.
	cd - || echo "Could not return to the original directory." >&2

	# The quotes around Ready are escaped so they show up in the output.
	echo -e "\n\nLogging high level host overview. DUTs should have a status of" \
	  "\"Ready\", unless there is a KI. If a DUT is stuck in a repair state, try" \
	  "to figure out why. If you find yourself repeatedly checking something," \
	  "extend this script with what you are checking.\n"

	# Print fields 1, 2, 4, 5 and 6 of each chromeos1 host line.  The pipes are
	# real pipes here; escaped ones would be passed to atest as arguments and
	# leave a dangling pipe at the end of the file.
	atest host list --label="pool:${POOL}" --parse \
	  | grep chromeos1- \
	  | cut -f 1,2,4,5,6 -d '|'