#!/bin/bash
# Force a repair special task for any host that hasn't seen activity in
# the past day.
#
# Various scripts/cron jobs look for DUTs that aren't working. To be
# conservative, those scripts assume that a DUT that hasn't run any jobs
# within a reasonable time interval isn't working, since some of the
# ways a DUT may be unavailable manifest as inactivity.
#
# In some cases, we'd like to be more certain as to a DUT's status.
# This script goes through the entire AFE hosts table, and identifies
# unlocked hosts that would otherwise be flagged as "not working due to
# lack of activity", and forces a repair task.
#
# We use a repair task (as opposed to verify) for various reasons:
# + If a DUT is working, repair and verify perform the same checks,
# and generally run in the same time.
# + If a DUT is broken, a verify task will fail and invoke repair,
# which will take longer than just repair alone.
# + Repair tasks that pass update labels; without this, labels could
# become out-of-date simply because a DUT is idle.
#
# Locked hosts are skipped because they can't run jobs and because we
# want them to show up as suspicious anyway.
# Run from the parent of this script's directory so that the relative
# tool paths used below (cli/, site_utils/, contrib/) resolve.  Abort if
# the directory change fails: continuing from the wrong directory would
# make every command below fail or hit the wrong files.
cd "$(dirname -- "$0")/.." || {
  echo "$0: cannot change to the parent of the script directory" >&2
  exit 1
}

# Gather all the hosts under supervision of the lab techs.
# Basically, that's any host in any managed pool.
#
# awk program: for every line of `atest host list` output that mentions
# one of the managed pools, print the first field.
# NOTE(review): assumes the first field of that output is the hostname --
# confirm against the atest output format.
GET_HOSTS='
/pool:(suites|bvt|cq|continuous|cts|arc-presubmit|crosperf|performance)/ {
  print $1
}
'
# Read one host per line into the array.  mapfile avoids the word
# splitting and pathname expansion that ARRAY=( $(command) ) performs.
mapfile -t HOSTS < <(cli/atest host list --unlocked | awk "$GET_HOSTS")

# With no hosts, dut_status.py would be invoked without any hostnames;
# stop here instead of relying on whatever it does in that case.
if (( ${#HOSTS[@]} == 0 )); then
  echo "$0: no unlocked hosts found in managed pools; nothing to do" >&2
  exit 0
fi

# Go through the gathered hosts, and use dut_status to find the
# ones with unknown state (anything without a positive "OK" or
# "NO" diagnosis).
#
# awk program: drop every line containing "OK" or "NO", then print the
# first field of the remaining lines that start with "chromeos".
NEED_CHECK='
/OK/ || /NO/ { next }
/^chromeos/ { print $1 }
'
# NOTE(review): "-d 19" presumably selects the activity window that
# dut_status.py examines -- confirm the unit against its usage text.
mapfile -t CHECK < <(
  site_utils/dut_status.py -d 19 "${HOSTS[@]}" | awk "$NEED_CHECK"
)

# Only force repairs when at least one host actually needs checking, so
# that repair_hosts is never invoked with an empty argument list.
if (( ${#CHECK[@]} == 0 )); then
  echo "$0: no hosts with unknown status; nothing to repair" >&2
  exit 0
fi

contrib/repair_hosts "${CHECK[@]}"