blob: bf8fff3056fbc3a3af3bb5ed086ab8b3c44c6ab3 [file] [log] [blame]
# -*- coding: utf-8 -*-
# Copyright 2017 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Simple log-based classification of build failures."""
from __future__ import print_function
import datetime
import os
import re
from chromite.lib import cros_logging as logging
from chromite.lib import gs
def LogTimestampToDatetime(timestamp):
  """Convert a log timestamp to a datetime.

  Date is assumed to be default of 1900/1/1.

  Args:
    timestamp: string of the form 'HH:MM:SS'

  Returns:
    datetime object.

  Raises:
    ValueError: if timestamp does not match the 'HH:MM:SS' format.
  """
  return datetime.datetime.strptime(timestamp, '%H:%M:%S')
def WhiteoutImageVersion(test_name):
  """Whiteout the version embedded in a test name.

  Args:
    test_name: string including an embedded version number.

  Returns:
    string with version component replaced with '_#.#.#'
  """
  return re.sub(r'_\d+\.\d+\.\d+', '_#.#.#', test_name)
# Patterns used for test root causing.
DEVSERVER_IP_RE = re.compile(r'Staging autotest artifacts for .* on devserver '
NETWORK_RE = re.compile('Returned update_engine error code: ERROR_CODE=37')
SERVICE_FILE_RE = re.compile('The name org.chromium.UpdateEngine '
'was not provided by any .service files')
MYSQL_IP_RE = re.compile(r'Can\'t connect to MySQL server on '
def ClassifyDUTAutoservLog(log_content):
"""Attempts to classify any failures in the DUT's autoserv.DEBUG logs.
log_content: string iterator.
string of root cause, None otherwise.
devserver = 'unknown'
for line in log_content:
m =
if m:
# saving devserver value for other errors below.
devserver =
m =
if m:
return 'kOmahaErrorInHTTPResponse: %s' % (devserver)
m =
if m:
return 'dot-service'
return None
def ClassifyDUTTKOParseLog(log_content):
"""Attempts to classify any failures in the TKO parse .parse.log file.
log_content: string iterator.
string of root cause, None otherwise.
server = 'unknown'
for line in log_content:
m =
if m:
# saving devserver value for other errors below.
server =
m =
if m:
return 'TKOParseTimeout: %s' % (server)
return None
def ClassifyGSFile(gs_path, handler):
  """Attempts to classify a log file from GS storage.

  Args:
    gs_path: GS path.
    handler: Handler which takes log content and returns a classification.

  Returns:
    result from handler or None if the file cannot be retrieved.
  """
  ctx = gs.GSContext()
  try:
    log_content = ctx.Cat(gs_path)
  except gs.GSNoSuchKey:
    # Missing log file is an expected case; classification is just unknown.
    return None
  return handler(log_content.splitlines())
def GetTestRootCause(job_id, dut):
  """Attempts to get the root cause of a test failure.

  Args:
    job_id: Job that failed.
    dut: Device that was running the test.

  Returns:
    string with root cause, or None if the root cause is unknown.
  """
  test_path = ('gs://chromeos-autotest-results/%(job)s-chromeos-test' %
               {'job': job_id})
  # DUT is required to access logs. If not specified, try and
  # determine it based on contents of test directory.
  ctx = gs.GSContext()
  if dut is None:
    try:
      files = ctx.LS(test_path)
    except gs.GSNoSuchKey:
      return None
    if len(files) == 0:
      return None
    files.sort()
    base_path = files[0]
  else:
    base_path = os.path.join(test_path, dut)
  # Grab autoserv.DEBUG log.
  # NOTE(review): the exact relative path argument was truncated in this copy
  # of the file; 'debug/autoserv.DEBUG' is a reconstruction — confirm upstream.
  classification = ClassifyGSFile(os.path.join(base_path, 'debug',
                                               'autoserv.DEBUG'),
                                  ClassifyDUTAutoservLog)
  if classification:
    return classification
  # Grab .parse.log log.
  classification = ClassifyGSFile(os.path.join(base_path, '.parse.log'),
                                  ClassifyDUTTKOParseLog)
  if classification:
    return classification
  return None
# Patterns used for classifying test failures.
# 05:28:51: INFO: RunCommand: /b/cbuild/internal_master/chromite/third_party/
# swarming.client/ run --swarming
# 07:52:52: WARNING: Killing tasks: [<_BackgroundTask(_BackgroundTask-7:7:4,
# started)>]
TIMESTAMP_PATTERN = r'(\d\d:\d\d:\d\d)'
SWARMING_START_RE = re.compile(TIMESTAMP_PATTERN + ': INFO: RunCommand: ' +
'.*/swarming.client/ run --swarming')
': WARNING: Killing tasks')
FAILED_GS_REMOVAL_1_RE = re.compile(r'^Removing gs:/.*\.\.\.')
FAILED_GS_REMOVAL_2_RE = re.compile(r'^NotFoundException: .* does not exist\.')
# Simple-minded labeling of failures.
SIMPLE_FAILURE_PATTERNS = [(re.compile(pattern), label) for pattern, label in (
    ('MySQL server has gone away', 'MySQL server'),
    ('PayloadTestError: cannot find source payload for testing',
     'missing payload'),
    ('DebugSymbolsUploadException: Failed to upload all symbols',
     'DebugSymbols Upload'),
    ('Failed to install device image', 'image install'),
    ('UnexpectedSignerResultsError', 'signer'),
    ('reached AU retry limit', 'AU limit'),
    ('ERROR:.*writer.Write.*failed', 'delta diff write'),
    ('Timeout reason: This build has reached the timeout '
     'deadline set by the master', 'master timeout'),
    ('FAIL: No JSON object could be decoded', 'JSON decode'),
    ('ERROR: Failed to parse JSON', 'JSON parse'),
    ('FAIL: stage_artifacts timed out', 'artifact staging'),
    ('Command [^a-z]*tar.* returned non-zero exit status 2', 'tar error 2'),
    ('Unhandled run_suite exception: Call is timed out',
     'run_suite call timeout'),
    ('Not enough DUTs', 'DUTs'),
    ('IOError: .Errno 28. No space left on device', 'no space'),
    # TODO(review): this entry's label was truncated in this copy of the
    # file; 'devservers down' is a reconstruction — confirm upstream.
    ('Unhandled DevServerException: All devservers are currently down',
     'devservers down'),
    ('Devserver portfile does not exist', 'No Devserver portfile'),
    ('Your .*Oauth 2.0 User Account.* credentials are invalid', 'Bad GS Oauth'),
    ('CommandException: No URLs matched', 'Bad URL'),
    ('Reached GCETestStage test run timeout', 'GCETestStage Timeout'),
    (r'No output from .* for \d+ seconds', 'Timeout (No Output Returned)'),
)]
# Test name + outcome pattern.
# For instance:
# login_MultiUserPolicy [ PASSED ]
# Spaces in between words are allowed (for instance: Suite job)
TEST_OUTCOME_RE = re.compile(r'(\w[\w\. ]+\w) +\[ ([A-Z]+) \]')
# TEST_FAILURE_PATTERNS is matched against the entire line which includes
# the test name and the reason. In cases where additional debugging or
# information is available, a link is provided with details.
TEST_FAILURE_PATTERNS = [(re.compile(pattern), label) for pattern, label in (
# Tests.
('provision', ''),
# Reasons.
('ABORT: Failed to install device image.*CODE=37', 'AU37'),
('ABORT: Failed to install device image.*CODE=0', 'AU0'),
('FAIL: Update server .* not available', 'NoServer'),
('ABORT: Timed out, did not run', 'TimeOut'),
('ABORT: Host did not return from reboot', 'NoReboot'),
('FAIL: Unhandled DevServerException: All devservers.*down', 'NoDevServ'),
('ABORT: Failed to trigger an update.*AutoserRunError', 'BlankAUError'),
('FAIL: Failed to receive a download started notification', 'AUNoStart'),
('FAIL: Saw file system error', 'FSError'),
('ABORT: Autotest client terminated unexpectedly: DUT is pingable',
ON_DUT_RE = re.compile(r'on (chromeos\d+-row\d+-rack\d+-host\d+).*')
LOG_LINK_RE = re.compile(
r'(\w[\w\. ]+\w) +'
AUTOTEST_LOG_LINK_RE = re.compile(
r'@@@STEP_LINK@\[Test-Logs\]: ([^:]+):.*(FAIL|ABORT).*'
r'on (chromeos\d+-row\d+-rack\d+-host\d+).*'
def ClassifyTestFailure(log_content):
"""Classifies the logs of a test failure.
log_content: string iterator.
set of classifications, or None if no classification.
# Add failures to this set as they are found.
failure_set = set()
swarming_start_time = None
test_runs = {}
test_jobs = {}
for line in log_content:
m =
if m:
swarming_start_time = LogTimestampToDatetime(
m =
if m and swarming_start_time:
swarming_end_time = LogTimestampToDatetime(
elapsed_time = swarming_end_time - swarming_start_time
# Since these are just times without dates, if the time appears to go
# backwards, assume that it's rolled over to the next day.
if elapsed_time.total_seconds() < 0:
elapsed_time += datetime.timedelta(days=1)
if elapsed_time > datetime.timedelta(hours=1):
failure_set.add('swarming timeout')
# Another multi-line check.
m =
if m:
line =
m =
if m:
failure_set.add('GS removal')
for failure_pattern, failure_label in SIMPLE_FAILURE_PATTERNS:
m =
if m:
# Two-line check looking for tests that failed.
m =
if m:
# For each invocation of the test, record its status. Each test
# might be invoked multiple times, so need to record the ordering
# of each run to match it with the logs.
# TODO: This entire parsing of failed jobs is very weak and fragile.
# Get the failures from a structured data source or start logging
# a single line sufficient to trigger the root cause logic.
autotest_name =
if autotest_name not in test_runs:
test_runs[autotest_name] = []
test_entry = {'status':}
if in ('FAILED', 'ABORT'):
failure =
# some test names contain an image version, for instance:
# autoupdate_EndToEndTest_npo_delta_8803.0.0
# The version number prevents matching failures and is not
# useful at this stage so we hide it.
failure = WhiteoutImageVersion(failure)
if failure not in ['Suite job']:
line =
# Extract and store DUT name if it exists.
m =
if m:
test_entry['dut'] =
m =
# Append a label to the failure name
if m:
for error_re, error_label in TEST_FAILURE_PATTERNS:
m =
if m:
failure += '{%s}' % error_label
# Look for links to test logs.
m =
if m:
# Record the ordering of the test logs so they can be reconciled.
autotest_name =
autotest_job_id =
test_jobs.setdefault(autotest_name, []).append(autotest_job_id)
# Find the root cause for failed test runs.
root_caused_jobs = set()
for autotest_name in test_runs:
for run in xrange(len(test_runs[autotest_name])):
test_entry = test_runs[autotest_name][run]
status = test_entry.get('status', '')
autotest_dut = test_entry.get('dut', None)
if (status in ('FAILED', 'ABORT') and
autotest_name in test_jobs and run < len(test_jobs[autotest_name])):
autotest_job_id = test_jobs[autotest_name][run]
if autotest_job_id in root_caused_jobs:
# Avoid processing the same job twice, necessary since the logs
# are duplicated.
root_cause = GetTestRootCause(autotest_job_id, autotest_dut)'root caused job %s dut %s: %s', autotest_job_id,
autotest_dut, root_cause)
if root_cause:
# Replace existing failure entry with a more specific one
if autotest_name in failure_set:
failure_set.add('%s==%s' % (autotest_name, root_cause))
if failure_set:
return failure_set
return None
# Patterns for classifying timeouts.
TIMEOUT_RE = re.compile('Timeout occurred-')
def IdentifyTimeout(log_content):
"""Classifies the logs of a stage that can contain a timetout.
log_content: string iterator of the stage's logs.
set of classifications, or None if no classification.
for line in log_content:
return set(['timeout'])
# Patterns for classifying archive failures.
BUILD_IMAGE_RE = re.compile('BuildScriptFailure: ./build_image failed .code=1.')
def ClassifyArchiveFailure(log_content):
"""Classifies the logs of an archive failure.
log_content: string iterator of the stage's logs.
set of classifications, or None if no classification.
for line in log_content:
return set(['build_image'])
def GetHandler(stage):
  """Determines a handler to classify a particular type of stage.

  Args:
    stage: name of the stage that failed.

  Returns:
    function to classify logs.
  """
  handlers = {
      'HWTest [sanity]': ClassifyTestFailure,
      'HWTest [bvt-inline]': ClassifyTestFailure,
      'HWTest [bvt-arc]': ClassifyTestFailure,
      'AUTest [au]': ClassifyTestFailure,
      'HWTest [jetstream_cq]': ClassifyTestFailure,
      'Paygen': ClassifyTestFailure,
      'PaygenBuildCanary': ClassifyTestFailure,
      'PaygenTestCanary': ClassifyTestFailure,
      'PaygenBuildDev': ClassifyTestFailure,
      'PaygenTestDev': ClassifyTestFailure,
      'SetupBoard': IdentifyTimeout,
      'BuildPackages': IdentifyTimeout,
      'Archive': ClassifyArchiveFailure,
      'VMTest': ClassifyTestFailure,
      'GCETest': ClassifyTestFailure,
  }
  return handlers.get(stage, None)
def ClassifyFailure(stage, log_content):
  """Classifies the logs of a failed stage.

  Args:
    stage: name of the stage that failed.
    log_content: string iterator of the stage's logs.

  Returns:
    set of classifications, or None if no classification.
  """
  # Ignore steps that failed because substeps failed.
  if stage.startswith('cbuildbot ') or stage in ['steps', 'Failure reason']:
    return None
  # Getting rid of (attempt ...) appended onto labels.
  if stage.startswith(('VMTest', 'GCETest')):
    stage = stage.split(' ', 1)[0]
  handler = GetHandler(stage)
  if handler is None:
    logging.debug('Missing handler for stage %s', stage)
    return None
  return handler(log_content)
# Extracts block, success, task, notes.
STATUS_LOG_RE = re.compile(
r'(?P<task>\S+)\s+timestamp=\d+\s+localtime=\w+ \d+ \d\d:\d\d:\d\d\s*'
def ExtractLabStatusParts(line):
"""Given a status.log lines, extract the parts we care about.
This means the start/end of a block, success/failure, and what was being done.
line: One line of a status.log log file from the lab.
(block, result, task, notes), or None
result =
if not result:
return None
# Any of these values are allowed to be None.
return ('block'),'success'),'task'),'notes'))
def HandleRepairStatus(line_parts):
  """Turn line_parts into a set of notes about the repair job.

  Args:
    line_parts: Iterable of ExtractLabStatusParts() results.

  Returns:
    Set of single word strings which describe a given repair job.
  """
  # NOTE(review): the bodies of the conditionals below were truncated in this
  # copy of the file; the note labels are reconstructions — confirm upstream.
  repair_notes = set()
  noop = True  # Did this repair job perform any actual repair?
  for _, result, task, notes in line_parts:
    # If we get no failures, other than the servo connection, we did no repair
    # work.
    if result not in (None, 'GOOD') and task != 'verify.servod':
      noop = False
    # If we ran the repair.usb task, we tried to do a USB based repair.
    if task == 'repair.usb':
      repair_notes.add('USB')
    if (result == 'FAIL' and task == 'verify.ssh' and
        notes and notes.startswith('No answer to ping')):
      repair_notes.add('NoPing')
  if noop:
    repair_notes.add('NO-OP')
  return repair_notes
def ClassifyLabJobStatusResult(log_content):
  """Given a repair log, classify the type of repair.

  Args:
    log_content: Iterator yielding the status.log lines.

  Returns:
    String summary of data extracted from log.
  """
  # Parse into parts.
  # Throw away the ones we couldn't parse, since they don't matter.
  line_parts = [ExtractLabStatusParts(l) for l in log_content]
  line_parts = [l for l in line_parts if l]
  # If we have no lines, or the last one isn't the Job END.
  if not line_parts or line_parts[-1][0] != 'END':
    return 'JOB: Unable to parse job status.log.'
  # Extract the final result for the job.
  _block, result, task, _notes = line_parts[-1]
  repair_notes = set()
  if task == 'repair':
    repair_notes |= HandleRepairStatus(line_parts)
  if not repair_notes:
    repair_notes.add('No details extracted.')
  # Example output: repair: SUCCESS: USB
  return '%s: %s: %s' % (task, result, ', '.join(sorted(repair_notes)))