| # Copyright 2017 The Chromium OS Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Simple log-based classification of build failures.""" |
| |
| from __future__ import print_function |
| |
| import datetime |
| import re |
| |
| from chromite.lib import cros_logging as logging |
| from chromite.lib import gs |
| |
| |
def LogTimestampToDatetime(timestamp):
  """Parse an 'HH:MM:SS' log timestamp into a datetime.

  The date portion defaults to 1900/1/1, matching strptime's behavior
  for time-only formats.

  Args:
    timestamp: string of the form 'HH:MM:SS'

  Returns:
    datetime object.
  """
  hour, minute, second = [int(field) for field in timestamp.split(':')]
  return datetime.datetime(1900, 1, 1, hour, minute, second)
| |
| |
def WhiteoutImageVersion(test_name):
  """Hide the image version embedded in a test name.

  Args:
    test_name: string possibly including an embedded version number.

  Returns:
    string with any '_X.Y.Z' version component replaced by '_#.#.#'.
  """
  version_pattern = re.compile(r'_\d+\.\d+\.\d+')
  return version_pattern.sub('_#.#.#', test_name)
| |
| |
# Patterns used for test root causing (applied to autoserv.DEBUG logs in
# GetTestRootCause below).

# Captures the devserver IP address from an artifact-staging line, e.g.:
#   Staging autotest artifacts for ... on devserver http://1.2.3.4:8082
DEVSERVER_IP_RE = re.compile(r'Staging autotest artifacts for .* on devserver '
                             r'http://(\d+\.\d+\.\d+\.\d+):8082')
# update_engine error code 37 — treated here as a network/devserver-related
# failure (see GetTestRootCause, which labels it kOmahaErrorInHTTPResponse).
NETWORK_RE = re.compile('Returned update_engine error code: ERROR_CODE=37')
# D-Bus failure: the UpdateEngine service was not registered on the DUT.
SERVICE_FILE_RE = re.compile('The name org.chromium.UpdateEngine '
                             'was not provided by any .service files')
| |
def GetTestRootCause(job_id, dut):
  """Attempts to get the root cause of a test failure.

  Downloads the job's autoserv.DEBUG log from Google Storage and scans it
  for known failure signatures.

  Args:
    job_id: Job that failed.
    dut: DUT that the test was being performed by.

  Returns:
    string of root cause, None otherwise.
  """
  # Currently rely on DUT to root cause any test.
  if dut is None:
    return None

  # Grab autoserv.DEBUG log.
  gs_path = ('gs://chromeos-autotest-results'
             '/%(job)s-chromeos-test/%(dut)s/debug/autoserv.DEBUG' % {
                 'job': job_id,
                 'dut': dut,
             })
  ctx = gs.GSContext()
  debug_log_content = ctx.Cat(gs_path)

  # The devserver line normally precedes the error lines that reference it.
  # Initialize so that a network error seen before (or without) that line
  # does not raise UnboundLocalError.
  devserver = None
  for line in debug_log_content.splitlines():
    m = DEVSERVER_IP_RE.search(line)
    if m:
      # saving devserver value for other errors below.
      devserver = m.group(1)

    m = NETWORK_RE.search(line)
    if m:
      return 'kOmahaErrorInHTTPResponse: %s' % (devserver)

    m = SERVICE_FILE_RE.search(line)
    if m:
      return 'dot-service'

  return None
| |
| |
# Patterns used for classifying test failures.

# Example log lines matched by the two swarming patterns below:
# 05:28:51: INFO: RunCommand: /b/cbuild/internal_master/chromite/third_party/
# swarming.client/swarming.py run --swarming
# 07:52:52: WARNING: Killing tasks: [<_BackgroundTask(_BackgroundTask-7:7:4,
# started)>]
TIMESTAMP_PATTERN = r'(\d\d:\d\d:\d\d)'  # captures 'HH:MM:SS'
SWARMING_START_RE = re.compile(TIMESTAMP_PATTERN + ': INFO: RunCommand: ' +
                               '.*/swarming.client/swarming.py run --swarming')
SWARMING_TIMEOUT_RE = re.compile(TIMESTAMP_PATTERN +
                                 ': WARNING: Killing tasks')

# A failed gs removal appears as two consecutive lines: the 'Removing ...'
# announcement followed by a NotFoundException.
FAILED_GS_REMOVAL_1_RE = re.compile(r'^Removing gs:/.*\.\.\.')
FAILED_GS_REMOVAL_2_RE = re.compile(r'^NotFoundException: .* does not exist\.')
| |
# Simple-minded labeling of failures: each (compiled pattern, label) pair is
# checked against every log line, and any match adds the label to the
# classification set.
SIMPLE_FAILURE_PATTERNS = [(re.compile(pattern), label) for pattern, label in (
    ('MySQL server has gone away', 'MySQL server'),
    ('PayloadTestError: cannot find source payload for testing',
     'missing payload'),
    ('DebugSymbolsUploadException: Failed to upload all symbols',
     'DebugSymbols Upload'),
    ('Failed to install device image', 'image install'),
    ('UnexpectedSignerResultsError', 'signer'),
    ('reached AU retry limit', 'AU limit'),
    ('ERROR:.*writer.Write.*failed', 'delta diff write'),
    ('Timeout reason: This build has reached the timeout '
     'deadline set by the master', 'master timeout'),
    ('FAIL: No JSON object could be decoded', 'JSON decode'),
    ('ERROR: Failed to parse JSON', 'JSON parse'),
    ('FAIL: stage_artifacts timed out', 'artifact staging'),
    ('Command [^a-z]*tar.* returned non-zero exit status 2', 'tar error 2'),
    ('Unhandled run_suite exception: Call is timed out',
     'run_suite call timeout'),
    ('Not enough DUTs', 'DUTs'),
    ('IOError: .Errno 28. No space left on device', 'no space'),
    ('Unhandled DevServerException: All devservers are currently down',
     'DEVSERVERS'),
    ('Devserver portfile does not exist', 'No Devserver portfile'),
    ('Your .*Oauth 2.0 User Account.* credentials are invalid', 'Bad GS Oauth'),
    ('CommandException: No URLs matched', 'Bad URL'),
    ('Reached GCETestStage test run timeout', 'GCETestStage Timeout'),
    (r'No output from .* for \d+ seconds', 'Timeout (No Output Returned)'),
)]
| |
# Test name + outcome pattern.
# For instance:
#   login_MultiUserPolicy [ PASSED ]
# Spaces in between words are allowed (for instance: Suite job)
TEST_OUTCOME_RE = re.compile(r'(\w[\w\. ]+\w) +\[ ([A-Z]+) \]')

# Matches a line carrying a failure reason keyword.
FAILED_TEST_REASON_RE = re.compile('FAILED|ERROR|ABORT')
# More specific failure-reason patterns; the first match appends its label
# (as '{label}') to the failed test's name in ClassifyTestFailure.
TEST_FAILURE_PATTERNS = [(re.compile(pattern), label) for pattern, label in (
    ('ABORT: Failed to install device image.*CODE=37', 'AU37'),
    ('ABORT: Failed to install device image.*CODE=0', 'AU0'),
    ('FAIL: Update server .* not available', 'NoServer'),
    ('ABORT: Timed out, did not run', 'TimeOut'),
    ('ABORT: Host did not return from reboot', 'NoReboot'),
    ('FAIL: Unhandled DevServerException: All devservers.*down', 'NoDevServ'),
    ('ABORT: Failed to trigger an update.*AutoserRunError', 'BlankAUError'),
    ('FAIL: Failed to receive a download started notification', 'AUNoStart'),
    ('FAIL: Saw file system error', 'FSError'),
    ('ABORT: Autotest client terminated unexpectedly: DUT is pingable',
     'AUUnexpect'),
)]
# Captures the DUT name (e.g. chromeos2-row3-rack4-host5) from a log line.
ON_DUT_RE = re.compile(r'on (chromeos\d+-row\d+-rack\d+-host\d+).*')

# Matches a test-log link line, capturing the test name and autotest job id.
LOG_LINK_RE = re.compile(
    r'(\w[\w\. ]+\w) +'
    r'http://cautotest/tko/retrieve_logs.cgi\?job=/results/(\d+)-chromeos-test')

# NOTE(review): not referenced in this file; presumably consumed by callers
# elsewhere — verify before removing.
AUTOTEST_LOG_LINK_RE = re.compile(
    r'@@@STEP_LINK@\[Test-Logs\]: ([^:]+):.*(FAIL|ABORT).*'
    r'on (chromeos\d+-row\d+-rack\d+-host\d+).*'
    r'http://cautotest/tko/retrieve_logs.cgi\?job=/results/(\d+)-chromeos-test')
| |
| |
def ClassifyTestFailure(log_content):
  """Classifies the logs of a test failure.

  Scans the log line by line, collecting failure labels from single- and
  multi-line patterns, then attempts to root cause each failed test run
  by pairing it (positionally) with its autotest job log link.

  Args:
    log_content: string iterator.  NOTE(review): must be a true iterator
      (not a plain list) — the multi-line checks below advance it with
      .next() (Python 2 iterator protocol).

  Returns:
    set of classifications, or None if no classification.
  """
  # Add failures to this set as they are found.
  failure_set = set()
  swarming_start_time = None
  # Maps test name -> list of {'status': ..., 'dut': ...} dicts, one per run,
  # in the order the runs appear in the log.
  test_runs = {}
  # Maps test name -> list of autotest job ids, in the same run order.
  test_jobs = {}

  for line in log_content:
    m = SWARMING_START_RE.search(line)
    if m:
      swarming_start_time = LogTimestampToDatetime(m.group(1))

    # Two-line check: a task kill more than an hour after the swarming run
    # started is classified as a swarming timeout.
    m = SWARMING_TIMEOUT_RE.search(line)
    if m and swarming_start_time:
      swarming_end_time = LogTimestampToDatetime(m.group(1))
      elapsed_time = swarming_end_time - swarming_start_time
      # Since these are just times without dates, if the time appears to go
      # backwards, assume that it's rolled over to the next day.
      if elapsed_time.total_seconds() < 0:
        elapsed_time += datetime.timedelta(days=1)
      if elapsed_time > datetime.timedelta(hours=1):
        failure_set.add('swarming timeout')

    # Another multi-line check.
    # NOTE(review): .next() can raise StopIteration if the log ends here.
    m = FAILED_GS_REMOVAL_1_RE.search(line)
    if m:
      line = log_content.next()
      m = FAILED_GS_REMOVAL_2_RE.search(line)
      if m:
        failure_set.add('GS removal')

    for failure_pattern, failure_label in SIMPLE_FAILURE_PATTERNS:
      m = failure_pattern.search(line)
      if m:
        failure_set.add(failure_label)

    # Two-line check looking for tests that failed.
    m = TEST_OUTCOME_RE.search(line)
    if m:
      # For each invocation of the test, record its status. Each test
      # might be invoked multiple times, so need to record the ordering
      # of each run to match it with the logs.
      # TODO: This entire parsing of failed jobs is very weak and fragile.
      # Get the failures from a structured data source or start logging
      # a single line sufficient to trigger the root cause logic.
      autotest_name = m.group(1)
      if autotest_name not in test_runs:
        test_runs[autotest_name] = []
      test_entry = {'status': m.group(2)}
      test_runs[autotest_name].append(test_entry)

      if m.group(2) in ('FAILED', 'ABORT'):
        failure = m.group(1)
        # some test names contain an image version, for instance:
        # autoupdate_EndToEndTest_npo_delta_8803.0.0
        # The version number prevents matching failures and is not
        # useful at this stage so we hide it.
        failure = WhiteoutImageVersion(failure)
        if failure not in ['Suite job']:
          # Consume the next line, which carries the failure reason and
          # (optionally) the DUT name.
          line = log_content.next()

          # Extract and store DUT name if it exists.
          m = ON_DUT_RE.search(line)
          if m:
            test_entry['dut'] = m.group(1)

          m = FAILED_TEST_REASON_RE.search(line)
          # Append a label to the failure name
          if m:
            for error_re, error_label in TEST_FAILURE_PATTERNS:
              m = error_re.search(line)
              if m:
                failure += '{%s}' % error_label
                break
          failure_set.add(failure)

    # Look for links to test logs.
    m = LOG_LINK_RE.search(line)
    if m:
      # Record the ordering of the test logs so they can be reconciled.
      autotest_name = m.group(1)
      autotest_job_id = m.group(2)
      if autotest_name not in test_jobs:
        test_jobs[autotest_name] = []
      test_jobs[autotest_name].append(autotest_job_id)

  # Find the root cause for failed test runs by matching the Nth run of a
  # test with the Nth log link recorded for the same test name.
  root_caused_jobs = set()
  for autotest_name in test_runs:
    for run in xrange(len(test_runs[autotest_name])):
      test_entry = test_runs[autotest_name][run]
      status = test_entry.get('status', '')
      autotest_dut = test_entry.get('dut', None)
      if (status in ('FAILED', 'ABORT') and
          autotest_name in test_jobs and run < len(test_jobs[autotest_name])):
        autotest_job_id = test_jobs[autotest_name][run]
        if autotest_job_id in root_caused_jobs:
          # Avoid processing the same job twice, necessary since the logs
          # are duplicated.
          continue
        root_caused_jobs.add(autotest_job_id)
        root_cause = GetTestRootCause(autotest_job_id, autotest_dut)
        logging.info('root caused job %s dut %s: %s', autotest_job_id,
                     autotest_dut, root_cause)
        if root_cause:
          # Replace existing failure entry with a more specific one.
          # NOTE(review): failure_set entries were version-whiteout-ed and
          # may carry a '{label}' suffix, while autotest_name here is the
          # raw name — this discard can miss; confirm intended behavior.
          if autotest_name in failure_set:
            failure_set.discard(autotest_name)
          failure_set.add('%s==%s' % (autotest_name, root_cause))

  if failure_set:
    return failure_set
  return None
| |
| |
# Patterns for classifying timeouts.

TIMEOUT_RE = re.compile('Timeout occurred-')


def IdentifyTimeout(log_content):
  """Classifies the logs of a stage that can contain a timeout.

  Args:
    log_content: string iterator of the stage's logs.

  Returns:
    set of classifications, or None if no classification.
  """
  if any(TIMEOUT_RE.search(entry) for entry in log_content):
    return set(['timeout'])
  return None
| |
| |
# Patterns for classifying archive failures.

BUILD_IMAGE_RE = re.compile('BuildScriptFailure: ./build_image failed .code=1.')


def ClassifyArchiveFailure(log_content):
  """Classifies the logs of an archive failure.

  Args:
    log_content: string iterator of the stage's logs.

  Returns:
    set of classifications, or None if no classification.
  """
  for entry in log_content:
    if BUILD_IMAGE_RE.search(entry):
      return set(['build_image'])
  return None
| |
| |
def GetHandler(stage):
  """Determines a handler to classify a particular type of stage.

  Args:
    stage: name of the stage that failed.

  Returns:
    function to classify logs, or None for an unrecognized stage.
  """
  test_failure_stages = (
      'HWTest [sanity]',
      'HWTest [bvt-inline]',
      'HWTest [arc-bvt-cq]',
      'AUTest [au]',
      'HWTest [jetstream_cq]',
      'Paygen',
      'PaygenBuildCanary',
      'PaygenTestCanary',
      'PaygenBuildDev',
      'PaygenTestDev',
      'VMTest',
      'GCETest',
  )
  handlers = dict((name, ClassifyTestFailure) for name in test_failure_stages)
  handlers['SetupBoard'] = IdentifyTimeout
  handlers['BuildPackages'] = IdentifyTimeout
  handlers['Archive'] = ClassifyArchiveFailure
  return handlers.get(stage)
| |
| |
def ClassifyFailure(stage, log_content):
  """Classifies the logs of a failed stage.

  Args:
    stage: name of the stage that failed.
    log_content: string iterator of the stage's logs.

  Returns:
    set of classifications, or None if no classification.
  """
  # Steps that failed only because substeps failed carry no signal of
  # their own, so skip them.
  if stage.startswith('cbuildbot ') or stage in ('steps', 'Failure reason'):
    return None

  # Strip the '(attempt ...)' suffix appended onto retried test stages.
  if stage.startswith('VMTest') or stage.startswith('GCETest'):
    stage = stage.split(' ', 1)[0]

  handler = GetHandler(stage)
  if handler is None:
    logging.debug('Missing handler for stage %s', stage)
    return None
  return handler(log_content)