| #!/usr/bin/env python3 |
| # -*- coding: utf-8 -*- |
| # Copyright 2020 The ChromiumOS Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| """Fetches and submits the latest test-cases from Lexan's crash bucket.""" |
| |
import argparse
import contextlib
import datetime
import json
import logging
import os
import shutil
import subprocess
import sys
import tempfile
from typing import Generator, Iterable, List, Tuple
| |
| |
# Root GCS location that test-cases are listed from. The helpers below list
# `<gsurl_base>/<year>/<month:02d>/<day:02d>` to find individual test-cases.
gsurl_base = "gs://chrome-clang-crash-reports/v1"
| |
| |
def gsutil_ls(loc: str) -> List[str]:
    """Runs `gsutil.py ls` on `loc` and returns its output lines, stripped.

    Raises:
        subprocess.CalledProcessError: if `gsutil.py` exits with a non-zero
            status.
    """
    listing = subprocess.run(
        ["gsutil.py", "ls", loc],
        check=True,
        encoding="utf-8",
        stdout=subprocess.PIPE,
    ).stdout
    return list(map(str.strip, listing.splitlines()))
| |
| |
def gsurl_ls_last_numbers(url: str) -> List[int]:
    """Returns the last path component of each entry under `url`, as ints.

    Trailing slashes are dropped before the last component is taken, and the
    result is sorted in ascending order. `int()` raises ValueError if any
    component is not numeric.
    """
    numbers = [
        int(entry.rstrip("/").rsplit("/", 1)[-1]) for entry in gsutil_ls(url)
    ]
    numbers.sort()
    return numbers
| |
| |
def get_available_year_numbers() -> List[int]:
    """Returns the sorted year numbers listed directly under `gsurl_base`."""
    years = gsurl_ls_last_numbers(gsurl_base)
    return years
| |
| |
def get_available_month_numbers(year: int) -> List[int]:
    """Returns the sorted month numbers listed under `gsurl_base/<year>`."""
    year_url = "/".join((gsurl_base, str(year)))
    return gsurl_ls_last_numbers(year_url)
| |
| |
def get_available_day_numbers(year: int, month: int) -> List[int]:
    """Returns the sorted day numbers listed under the given year and month.

    The month is zero-padded to two digits when building the URL.
    """
    month_url = "/".join((gsurl_base, str(year), format(month, "02d")))
    return gsurl_ls_last_numbers(month_url)
| |
| |
def get_available_test_case_urls(year: int, month: int, day: int) -> List[str]:
    """Returns every entry `gsutil_ls` reports for the given day's directory.

    Month and day are zero-padded to two digits when building the URL.
    """
    day_url = "/".join(
        (gsurl_base, str(year), format(month, "02d"), format(day, "02d"))
    )
    return gsutil_ls(day_url)
| |
| |
def test_cases_on_or_after(
    date: datetime.date,
) -> Generator[Tuple[datetime.date, List[str]], None, None]:
    """Yields the test-cases uploaded on each day on or after `date`.

    Years, months and days are visited in ascending order, and anything
    earlier than `date` is skipped without listing its contents.

    Args:
        date: the earliest day to consider. A `datetime.datetime` is accepted
            too; only its date part is used.

    Yields:
        `(day, urls)` pairs, where `day` is a `datetime.date` and `urls` is
        the list of test-case URLs listed for that day.
    """
    # `datetime.date` and `datetime.datetime` objects can't be ordered against
    # each other (TypeError), so normalize to a plain date before comparing.
    if isinstance(date, datetime.datetime):
        date = date.date()

    for year in get_available_year_numbers():
        if year < date.year:
            continue

        for month in get_available_month_numbers(year):
            if year == date.year and month < date.month:
                continue

            for day in get_available_day_numbers(year, month):
                when = datetime.date(year, month, day)
                if when < date:
                    continue

                yield when, get_available_test_case_urls(year, month, day)
| |
| |
def to_ymd(date: datetime.date) -> str:
    """Formats `date` as a `YYYY-MM-DD` string."""
    return f"{date:%Y-%m-%d}"
| |
| |
def from_ymd(date_str: str) -> datetime.date:
    """Parses a `YYYY-MM-DD` string into a `datetime.date`.

    Raises:
        ValueError: if `date_str` does not match the `%Y-%m-%d` format.
    """
    parsed = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    return parsed.date()
| |
| |
def persist_state(
    seen_urls: Iterable[str], state_file: str, current_date: datetime.date
) -> None:
    """Writes the script's state to `state_file` as JSON.

    The state is first written to `state_file + ".tmp"` and then moved over
    `state_file`, so readers never observe a partially-written state file.

    Args:
        seen_urls: test-case URLs to record; stored sorted under
            "already_seen".
        state_file: path of the state file to (over)write.
        current_date: stored under "most_recent_date" in YYYY-MM-DD form.
    """
    tmp_state_file = state_file + ".tmp"
    with open(tmp_state_file, "w", encoding="utf-8") as f:
        json.dump(
            {
                "already_seen": sorted(seen_urls),
                "most_recent_date": to_ymd(current_date),
            },
            f,
        )
    # os.replace() overwrites an existing destination on every platform,
    # whereas os.rename() is only guaranteed to do so on POSIX.
    os.replace(tmp_state_file, state_file)
| |
| |
@contextlib.contextmanager
def temp_dir() -> Generator[str, None, None]:
    """Context manager that yields a fresh temporary directory's path.

    The directory name ends with "lexan-autosubmit". The directory and its
    contents are removed when the context exits, whether or not the body
    raised.
    """
    scratch = tempfile.mkdtemp(suffix="lexan-autosubmit")
    try:
        yield scratch
    finally:
        shutil.rmtree(scratch)
| |
| |
def fetch_gs_file_size(gs_url: str) -> int:
    """Returns the size, in bytes, that `gsutil.py du` reports for `gs_url`.

    Raises:
        subprocess.CalledProcessError: if `gsutil.py` exits with a non-zero
            status.
        AssertionError: if the output is not exactly one line.
    """
    proc = subprocess.run(
        ["gsutil.py", "du", gs_url],
        stdout=subprocess.PIPE,
        encoding="utf-8",
        check=True,
    )
    du_lines = proc.stdout.splitlines()
    assert len(du_lines) == 1, f"{du_lines}"
    # Each line is `size file_name`, so the size is the first field.
    size_field = du_lines[0].split()[0]
    return int(size_field)
| |
| |
def download_and_unpack_test_case(gs_url: str, tempdir: str) -> None:
    """Copies the archive at `gs_url` into `tempdir` and extracts it there.

    The archive is saved as `test_case<ext>`, where `<ext>` is the last
    extension of `gs_url`. It is extracted with `tar xaf` run from inside
    `tempdir`, and the archive itself is deleted afterwards.

    Raises:
        subprocess.CalledProcessError: if `gsutil.py` or `tar` fails.
    """
    _, extension = os.path.splitext(gs_url)
    archive_name = f"test_case{extension}"
    archive_path = os.path.join(tempdir, archive_name)
    subprocess.run(["gsutil.py", "cp", gs_url, archive_path], check=True)
    # tar runs with cwd=tempdir, so the bare archive name is enough here.
    subprocess.run(["tar", "xaf", archive_name], cwd=tempdir, check=True)
    os.unlink(archive_path)
| |
| |
def submit_test_case(gs_url: str, cr_tool: str) -> None:
    """Downloads the test-case at `gs_url` and submits it via `cr_tool`.

    A test-case is silently skipped (with a log message) when:
      - its archive is larger than the size limit,
      - it unpacks to a single `.tar` file (a linker crash), or
      - its `.sh` file mentions clang version 9.0.0.

    Otherwise the archive must unpack to exactly one `.sh` file and one other
    file (ignoring `.crash` files), which are handed to `cr_tool reduce`.

    Args:
        gs_url: URL of the test-case archive.
        cr_tool: path of the binary to invoke for the submission.

    Raises:
        subprocess.CalledProcessError: if any subprocess fails.
        AssertionError: if the unpacked archive has an unexpected layout.
    """
    # Both values below are in KB, so the limit works out to 100MB.
    size_limit_kb = 100 * 1024
    size_kb = fetch_gs_file_size(gs_url) // 1024
    if size_kb > size_limit_kb:
        logging.warning(
            "Ignoring %s; it's %dKB, and the limit is %dKB",
            gs_url,
            size_kb,
            size_limit_kb,
        )
        return

    logging.info("Downloading %s (%dKB)", gs_url, size_kb)
    with temp_dir() as tempdir:
        download_and_unpack_test_case(gs_url, tempdir)

        # Sometimes (e.g., in
        # gs://chrome-clang-crash-reports/v1/2020/03/27/
        # chromium.clang-ToTiOS-12754-GTXToolKit-2bfcde.tgz)
        # we'll get `.crash` files. Unclear why, but let's filter them out
        # anyway.
        repro_files = [
            os.path.join(tempdir, x)
            for x in os.listdir(tempdir)
            if not x.endswith(".crash")
        ]
        if len(repro_files) == 1 and repro_files[0].endswith(".tar"):
            logging.info(
                "Skipping submission of %s; it's a linker crash", gs_url
            )
            return

        assert len(repro_files) == 2, repro_files
        if repro_files[0].endswith(".sh"):
            sh_file, src_file = repro_files
            assert not src_file.endswith(".sh"), repro_files
        else:
            src_file, sh_file = repro_files
            assert sh_file.endswith(".sh"), repro_files

        # Peephole: lexan got a crash upload with a way old clang. Ignore it.
        with open(sh_file, encoding="utf-8") as f:
            if "Crash reproducer for clang version 9.0.0" in f.read():
                logging.warning(
                    "Skipping upload for %s; seems to be with an old clang",
                    gs_url,
                )
                return

        logging.info("Submitting %s", gs_url)
        # `sh_file` and `src_file` already include `tempdir` (see
        # `repro_files` above); joining them with `tempdir` a second time
        # would produce a bogus path if `tempdir` were ever relative.
        subprocess.run(
            [
                cr_tool,
                "reduce",
                "-stream=false",
                "-wait=false",
                "-note",
                gs_url,
                "-sh_file",
                sh_file,
                "-src_file",
                src_file,
            ],
            check=True,
        )
| |
| |
def submit_new_test_cases(
    last_seen_test_cases: Iterable[str],
    earliest_date_to_check: datetime.date,
    forcey: str,
    state_file_path: str,
) -> None:
    """Submits every not-yet-seen test-case and records progress as it goes.

    State is written to `state_file_path` after each individual submission,
    and once more after all test-cases have been processed.

    Args:
        last_seen_test_cases: URLs that were already submitted on earlier
            runs; these are skipped if encountered again.
        earliest_date_to_check: test-cases from before this date are ignored.
        forcey: path to the forcey binary.
        state_file_path: path to our state file.
    """
    # Two collections are tracked:
    #   - `known_urls`: everything seen on this run *and* on prior runs. This
    #     is what stops us from submitting a test-case twice, and it's what
    #     gets persisted while the run is still in progress.
    #   - `urls_this_run`: only what this run has listed. It's persisted in
    #     place of `known_urls` once the whole run succeeds.
    #
    # The smaller set is enough after a complete run because the window of
    # dates we look at only ever moves forward: a URL that was seen before but
    # wasn't listed this time has aged out of the window, and will never be
    # listed again. If anything fails part-way, though, we may not have
    # revisited everything we'd seen before, so the full set has to stay.
    known_urls = set(last_seen_test_cases)
    urls_this_run = []
    newest_date = earliest_date_to_check
    for day, day_urls in test_cases_on_or_after(earliest_date_to_check):
        if day > newest_date:
            newest_date = day

        for url in day_urls:
            urls_this_run.append(url)
            if url not in known_urls:
                known_urls.add(url)
                submit_test_case(url, forcey)

                # Writing the state file after every submission costs a bit,
                # but it's the simplest way to avoid resubmitting. The file
                # is small (<12KB, as it only holds a few days' worth of
                # test-cases after the first run), and each submission
                # already downloads, unzips and reuploads multiple MB, so the
                # extra write is unlikely to matter.
                persist_state(known_urls, state_file_path, newest_date)

    persist_state(urls_this_run, state_file_path, newest_date)
| |
| |
def main(argv: List[str]) -> None:
    """Parses `argv` and submits all new test-cases.

    If `--last_date` is given, every test-case from that date on is
    considered new. Otherwise the state file is read to find out which
    test-cases were already submitted and where to resume from.

    Args:
        argv: command-line arguments, without the program name.
    """
    logging.basicConfig(
        format=">> %(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: "
        "%(message)s",
        level=logging.INFO,
    )

    my_dir = os.path.dirname(os.path.abspath(__file__))

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--state_file", default=os.path.join(my_dir, "lexan-state.json")
    )
    parser.add_argument(
        "--last_date",
        help="The earliest date that we care about. All test cases from here "
        "on will be picked up. Format is YYYY-MM-DD.",
    )
    parser.add_argument(
        "--4c", dest="forcey", required=True, help="Path to a 4c client binary"
    )
    opts = parser.parse_args(argv)

    forcey = opts.forcey
    state_file = opts.state_file
    last_date_str = opts.last_date

    # Resolve the path before taking its dirname: for a bare filename such as
    # `--state_file state.json`, dirname() is "", which os.makedirs() rejects.
    state_dir = os.path.dirname(os.path.abspath(state_file))
    os.makedirs(state_dir, 0o755, exist_ok=True)

    if last_date_str is None:
        with open(state_file, encoding="utf-8") as f:
            data = json.load(f)
        most_recent_date = from_ymd(data["most_recent_date"])
        submit_new_test_cases(
            last_seen_test_cases=data["already_seen"],
            # Note that we always subtract one day from this to avoid a race:
            # uploads may appear slightly out-of-order (or builders may lag, or
            # ...), so the last test-case uploaded for 2020/01/01 might appear
            # _after_ the first test-case for 2020/01/02. Assuming that
            # builders won't lag behind for over a day, the easiest way to
            # handle this is to always check the previous and current days.
            earliest_date_to_check=most_recent_date
            - datetime.timedelta(days=1),
            forcey=forcey,
            state_file_path=state_file,
        )
    else:
        submit_new_test_cases(
            last_seen_test_cases=(),
            earliest_date_to_check=from_ymd(last_date_str),
            forcey=forcey,
            state_file_path=state_file,
        )
| |
| |
if __name__ == "__main__":
    # Forward main()'s return value to the shell as the exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)