blob: e8ba34cfd141c3fad62540dbe59f8f6a2ad6a36d [file] [log] [blame]
#!/usr/bin/env python
import collections
import multiprocessing.pool
import os
import re
import subprocess
import sys
import tempfile
import urllib2
import common
from autotest_lib.server import frontend
from autotest_lib.server import utils
SERVERS = ['cautotest', 'cautotest-cq']
RESULT_URL_FMT = 'http://%s/tko/retrieve_logs.cgi?job=/results'
LOG_PATH_FMT = 'hosts/%s/%d-repair'
AUTOSERV_DEBUG_LOG = 'debug/autoserv.DEBUG'
GS_URI = 'gs://chromeos-autotest-results'
REPAIR_FAILURE_PATTERN = re.compile('.*Failed to repair .*: (.*)')
REPAIR_ACTION_PATTERN = re.compile('.*Attempting (.*)')
NEW_LINE_PATTERN = re.compile('^\d\d/\d\d\s\d\d:\d\d:\d\d')
COMMAND_PATTERN = re.compile('[^"]*("[^"]*").*')
def ssh_command(host, cmd):
return ['ssh', '-o PasswordAuthentication=no', '-o ConnectTimeout=1',
'-o ConnectionAttempts=1', '-o StrictHostKeyChecking=no',
'-q', 'root@'+host, '--', cmd]
class Result(object):
def __init__(self, host, ping, ssh, servod, logs):
self.host = host
self.ping = ping
self.ssh = ssh
self.servod = servod
self.logs = logs
class FailureReason(object):
"""Object used to store the detail of an attempt to repair.
"""
def __init__(self, action, error, hostname):
"""Initialize the detail of an attempt to repair.
@param action: Action taken to repair the DUT.
@param error: Error message.
@param hostname: Hostname of the repair failed DUT.
"""
self.action = action
self.error = error
self.hostname = hostname
self.key = self.get_key()
def get_key(self):
"""Get a reason string that does not contain unnecessary information.
@param reason: A string that explains repair failure.
@param host: An attribute dictionary of host that failed to be repaired.
@return: A simplified string that does not contain unnecessary
information, e.g., hostname etc.
"""
# Merge error into a single line.
error = self.error.replace('\n', ' ')
# Remove repeated empty spaces.
error = re.sub('\s\s+' , ' ', error)
if '/usr/bin/ssh -a -x -o ControlPath=' in error:
# Extract the command, and rebuild reason string.
error = ('Failed to run command: %s' %
extract_match_text(error, COMMAND_PATTERN))
reason = 'Action: %s. Error: %s' % (self.action, error)
# Remove hostname from reason string.
reason = reason.replace(self.hostname, '')
# Trim reason string as some might be really long.
reason = reason[:136]
return reason
class Failure(object):
"""Object used to store the details of why a host is failed to be repaired.
"""
def __init__(self, server, host, repair_job_id):
"""Initialize details of failure
@param server: Name of Autotest instance.
@param host: CrosHost object of the DUT.
@param repair_job_id is the job ID of last repair job.
"""
self.host = host
self.repair_job_id = repair_job_id
log_path = LOG_PATH_FMT % (host['hostname'], repair_job_id)
self.log_url = '%s/%s' % (RESULT_URL_FMT % server, log_path)
self.reasons = self.get_repair_fail_reasons()
@classmethod
def get_debug_log(self, autoserv_log_url, autoserv_log_path):
"""Get a list of strings or a stream for autoserv.DEBUG log file.
@param autoserv_log_url: Url to the repair job's autoserv.DEBUG log.
@param autoserv_log_path: Path to autoserv.DEBUG log, e.g.,
hosts/hostname/job_id-repair/debug/autoserv.DEBUG.
@return: A list of string if debug log was moved to GS already, or a
stream of the autoserv.DEBUG file.
"""
url = urllib2.urlopen(autoserv_log_url).geturl()
if not 'accounts.google.com' in url:
return urllib2.urlopen(url)
else:
# The file was moved to Google storage already, download it to /tmp
debug_log_link = '%s/%s' % (GS_URI, autoserv_log_path)
cmd = 'gsutil cat %s' % debug_log_link
proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stdout, stderr = proc.communicate()
if proc.returncode == 0:
return stdout.split(os.linesep)
else:
print 'Failed to read %s: %s' % (debug_log_link, stderr)
@classmethod
def is_log_entry(self, line):
"""Check if the given line is a log entry started with a timestamp.
@param line: A single line in the log file.
@return: True if the line is a new log entry that starts with a
timestamp.
"""
return NEW_LINE_PATTERN.match(line) != None
@classmethod
def get_action(self, line):
"""Parse the action used to repair DUT.
@param line: A single line in the log file.
@return: Action used to repair DUT, None if the line does not have such
information.
"""
action = extract_match_text(line, REPAIR_ACTION_PATTERN)
# Trim `to `, some repair attempt (RPM) does not use that word.
if action and action.startswith('to '):
action = action[3:]
# Ignore the line `Attempting full repair`
if action == 'full repair':
action = None
return action
def get_repair_fail_reasons(self):
"""Get a list of reasons for repair failure.
Read through autoserv.DEBUG log to find each attempt to repair the DUT
and the reason it failed. The sequence in the log looks like:
Attempting full repair
Attempting to repair servo host chromeos2-row5-rack1-host4-servo.
Failed to repair servo: Host did not return from reboot
Attempting recover servo enabled device by powering it off and on
Failed to repair device: DUT did not boot after long_press.
Attempting recovery servo enabled device with servo_repair_reinstall
Failed to repair device: Download image to usb failed.
Attempting repair via RPM powercycle.
Failed to repair device: Powercycled host 6 times; device did not...
@param autoserv_log_url: Url to the repair job's autoserv.DEBUG log.
@param autoserv_log_path: Path to autoserv.DEBUG log, e.g.,
hosts/hostname/job_id-repair/debug/autoserv.DEBUG.
@param hostname: Hostname of the repair failed DUT.
@return: A list of reason strings.
"""
hostname = self.host['hostname']
log_path = LOG_PATH_FMT % (hostname, self.repair_job_id)
autoserv_log_path = '%s/%s' % (log_path, AUTOSERV_DEBUG_LOG)
autoserv_log_url = '%s/%s' % (self.log_url, AUTOSERV_DEBUG_LOG)
# Parse log debug/autoserv.DEBUG to find all reasons that repair failed.
reasons = []
debug_log = self.get_debug_log(autoserv_log_url, autoserv_log_path)
# Some message might take multiple lines.
error = None
action = None
for line in debug_log:
if self.is_log_entry(line):
if error:
reasons.append(FailureReason(action, error, hostname))
action = None
if not action:
action = self.get_action(line)
error = extract_match_text(line, REPAIR_FAILURE_PATTERN)
elif error:
error += line
if error:
reasons.append(FailureReason(action, error, hostname))
# Close the stream if debug_log is not a list of string read from
# command 'gsutil cat'.
if not isinstance(debug_log, list):
debug_log.close()
return reasons
def log(self):
"""Print the details about why DUT was failed to be repaired.
"""
print '\nExamining failure reason for %s ...' % self.host['hostname']
print ' Log url: %s' % self.log_url
print ' Repair failed reasons:'
print '\n'.join([' ' + reason.key for reason in self.reasons])
def extract_match_text(line, pattern):
"""Extract the text information that matches the pattern.
@param line: A single line in the log file.
@return: Text that matches the given pattern, None if the line does not
match the pattern.
"""
m = pattern.match(line)
if m:
return m.group(1)
else:
return None
def log_result(result):
print "Examining %s ..." % result.host
if result.ping:
print " PING = UP"
else:
print " PING = DOWN\n"
return
if result.ssh:
print " SSH = UP"
else:
print " SSH = DOWN\n"
return
print " SERVOD = %s" % ('UP' if result.servod else 'DOWN',)
print " LOGS = \n%s" % (result.logs,)
def check_servo(servo):
r = Result(servo, None, None, None, None)
r.ping = (utils.ping(servo, tries=5, deadline=5) == 0)
if not r.ping:
return r
try:
subprocess.check_output(ssh_command(servo, "true"))
except subprocess.CalledProcessError:
r.ssh = False
return r
else:
r.ssh = True
try:
output = subprocess.check_output(ssh_command(servo, "pgrep servod"))
except subprocess.CalledProcessError:
r.servod = False
else:
r.servod = (output != "")
try:
output = subprocess.check_output(
ssh_command(servo, "tail /var/log/servod.log"))
except subprocess.CalledProcessError:
r.logs = ""
else:
r.logs = output
return r
def get_last_repair_job(afe, host):
"""Get the job ID of last repair job.
@param afe: rpc interface.
@param host: Attribute dictionary of host that failed to be repaired.
@return: Job ID of last repair job.
"""
try:
tasks = afe.run('get_special_tasks',
host__hostname=host['hostname'],
task='Repair',
time_started__isnull=False,
is_complete=True,
is_aborted=False,
sort_by=['-time_started'],
query_limit=1)
return tasks[0]['id']
except Exception as e:
print ('Failed to get repair job id for host %s: %s' %
(host['hostname'], e))
return None
def check_host_failure(input):
"""Check why host failed to be repaired.
@param input: A dictionary of input arguments, i.e.,
{server, host, repair_job_id}, where:
server is the name of autotest instance
host is an attribute dictionary of host that failed to be
repaired.
repair_job_id is the job ID of last repair job.
"""
return Failure(server=input['server'], host=input['host'],
repair_job_id=input['repair_job_id'])
def redeploy_hdctools(host):
try:
subprocess.check_output(
ssh_command(host, "/home/chromeos-test/hdctools/beaglebone/deploy"),
stderr=subprocess.STDOUT)
except subprocess.CalledProcessError:
return False
else:
return True
def install_package(package):
def curry(host):
try:
subprocess.check_output(
ssh_command(host, "apt-get install %s" % package),
stderr=subprocess.STDOUT)
subprocess.check_output(
ssh_command(host, "start servod"),
stderr=subprocess.STDOUT)
except subprocess.CalledProcessError:
return False
else:
return True
curry.__name__ = "install_package(%s)" % package
return curry
def manual_intervention(reason):
def curry(_):
return False
curry.__name__ = 'MANUAL(%s)' % reason
return curry
Fix = collections.namedtuple('Fix', ['host', 'method', 'success'])
# I don't know if these failures are one-time or repeating, so I'm adding code
# here for now. If these are seen and fixed by this frequently, then this code
# should be moved into servo_host's repair()
def diagnose_failure(r):
method = None
if r.logs and 'ImportError: Entry point' in r.logs:
method = redeploy_hdctools
if r.logs and 'ImportError: No module named serial' in r.logs:
method = install_package('python-serial')
if not r.ping or not r.ssh:
method = manual_intervention('servo is unresponsive on network')
if r.logs and 'No usb device connected to servo' in r.logs:
method = manual_intervention("servo doesn't see USB drive")
if r.logs and 'discover_servo - No servos found' in r.logs:
method = manual_intervention("beaglebone doesn't see servo")
if method:
return Fix(r.host, method.__name__, method(r.host))
else:
return None
def get_board(host):
"""Get name of the board from a host object.
@param host: An attribute dictionary of host.
@return: Name of the board, return None if board label is not found.
"""
for label in host['labels']:
if label.startswith('board:'):
return label[6:]
return None
def main():
pool = multiprocessing.pool.ThreadPool()
all_results = []
all_repair_failures = []
for server in SERVERS:
afe = frontend.AFE(server=server)
hosts = afe.run('get_hosts', multiple_labels=['servo'],
status='Repair Failed')
print ('%s: %s hosts with servo label are in Repair Failed.' %
(server, len(hosts)))
servos = [h['hostname']+'-servo.cros' for h in hosts]
results = pool.imap_unordered(check_servo, servos)
for result in results:
log_result(result)
all_results.append(result)
inputs = [{'server': server, 'host': host,
'repair_job_id': get_last_repair_job(afe, host)}
for host in hosts]
repair_failures = pool.imap_unordered(check_host_failure, inputs)
for failure in repair_failures:
failure.log()
all_repair_failures.append(failure)
failure_collection = []
for failure in all_repair_failures:
failure_collection.extend([(reason, get_board(failure.host))
for reason in failure.reasons
if 'servo' in reason.action])
def compare_failure(f1, f2):
return (cmp(f1[0].key, f2[0].key) or cmp(f1[1], f2[1]) or
cmp(f1[0].hostname, f2[0].hostname))
failure_collection.sort(cmp=compare_failure)
print '====================================='
print 'DUTs failed to be repaired by servo:'
print '====================================='
current_key = None
for (reason, board) in failure_collection:
if reason.key != current_key:
print reason.key
current_key = reason.key
current_board = None
if board != current_board:
print ' %s' % board
current_board = board
print ' %s' % reason.hostname
# fix 'em if you can?
fixes = filter(None, pool.imap_unordered(diagnose_failure, all_results))
for fix in fixes:
print ("Fixing %(host)s via %(method)s resulted in %(success)s" %
dict(host=fix.host, method=fix.method,
success='SUCCESS' if fix.success else 'FAILURE'))
return 0
if __name__ == '__main__':
sys.exit(main())