llvm_tools: add a lexan crash autouploader

This CL adds an autouploader for crashes that land in Lexan's bucket.
This autouploader uploads them to 4c, much like how
`bisect_clang_crashes.py` does. The intent is to run this regularly on
chrotomation.

BUG=None
TEST=Ran it

Change-Id: I7cfbe463d89994f6ed3f750c9e8277e1fad0738e
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/toolchain-utils/+/2451306
Reviewed-by: Jian Cai <jiancai@google.com>
Tested-by: George Burgess <gbiv@chromium.org>
diff --git a/llvm_tools/README.md b/llvm_tools/README.md
index d7c2016..783ec22 100644
--- a/llvm_tools/README.md
+++ b/llvm_tools/README.md
@@ -555,7 +555,7 @@
 ### `bisect_clang_crashes.py`
 
 This script downloads clang crash diagnoses from
-gs://chromeos-toolchain-artifacts/clang-crash-diagnoses and send them to 4c for
+gs://chromeos-toolchain-artifacts/clang-crash-diagnoses and sends them to 4c for
 bisection.
 
 Usage example:
@@ -569,3 +569,26 @@
 output/state.json under the current path. The output directory will be created
 automatically if it does not exist yet. To get more information of the submitted
 jobs, please refer to go/4c-cli.
+
+### `upload_lexan_crashes_to_forcey.py`
+
+This script downloads clang crash diagnoses from Lexan's bucket and sends them
+to 4c for bisection.
+
+Usage example:
+
+```
+$ ./upload_lexan_crashes_to_forcey.py --4c 4c-cli \
+    --state_file ./output/state.json
+```
+
+The above command downloads the artifacts of clang crash diagnoses and sends
+them to the 4c server for bisection. The summary of submitted jobs will be
+saved in output/state.json under the current path. The output directory will be
+created automatically if it does not exist yet. To get more information about
+the submitted jobs, please refer to go/4c-cli.
+
+Note that it's recommended to 'seed' the state file with the most recent upload
+date. This can be done by running this tool *once* with the `--last_date` flag.
+This flag makes the script override whatever's in the state file (if anything)
+and submit all crashes uploaded on or after the given day.
diff --git a/llvm_tools/upload_lexan_crashes_to_forcey.py b/llvm_tools/upload_lexan_crashes_to_forcey.py
new file mode 100755
index 0000000..9cf0c08
--- /dev/null
+++ b/llvm_tools/upload_lexan_crashes_to_forcey.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright 2020 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Fetches and submits the latest test-cases from Lexan's crash bucket."""
+
+# pylint: disable=cros-logging-import
+
+import argparse
+import contextlib
+import datetime
+import json
+import logging
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+from typing import Generator, List, Iterable
+
+gsurl_base = 'gs://chrome-clang-crash-reports/v1'
+
+
def gsutil_ls(loc: str) -> List[str]:
  """Lists the contents of the given gs:// location via `gsutil ls`."""
  stdout = subprocess.run(
      ['gsutil', 'ls', loc],
      stdout=subprocess.PIPE,
      check=True,
      encoding='utf-8',
  ).stdout
  return [line.strip() for line in stdout.splitlines()]
+
+
def gsurl_ls_last_numbers(url: str) -> List[int]:
  """Returns the sorted trailing path components of `url`'s listing as ints."""
  entries = gsutil_ls(url)
  last_components = (entry.rstrip('/').split('/')[-1] for entry in entries)
  return sorted(int(component) for component in last_components)
+
+
def get_available_year_numbers() -> List[int]:
  """Returns the years (sorted ints) that have entries in Lexan's bucket."""
  return gsurl_ls_last_numbers(gsurl_base)
+
+
def get_available_month_numbers(year: int) -> List[int]:
  """Returns the months of `year` that have entries, as sorted ints."""
  return gsurl_ls_last_numbers(f'{gsurl_base}/{year}')
+
+
def get_available_day_numbers(year: int, month: int) -> List[int]:
  """Returns the days of `year`/`month` that have entries, as sorted ints."""
  return gsurl_ls_last_numbers(f'{gsurl_base}/{year}/{month:02d}')
+
+
def get_available_test_case_urls(year: int, month: int, day: int) -> List[str]:
  """Lists the gs:// URLs of all test-cases uploaded on the given day."""
  return gsutil_ls(f'{gsurl_base}/{year}/{month:02d}/{day:02d}')
+
+
def test_cases_on_or_after(date: datetime.date
                          ) -> 'Generator[Tuple[datetime.date, List[str]], None, None]':
  """Yields (day, test-case URLs) pairs for each day on or after `date`.

  Note: despite the function name, each yielded item is a 2-tuple of the day
  (a datetime.date) and the list of gs:// test-case URLs uploaded on that day,
  not a bare test-case string.
  """
  for year in get_available_year_numbers():
    if year < date.year:
      continue

    for month in get_available_month_numbers(year):
      # Only filter months within the starting year; every month of a later
      # year is in range.
      if year == date.year and month < date.month:
        continue

      for day in get_available_day_numbers(year, month):
        when = datetime.date(year, month, day)
        if when < date:
          continue

        yield when, get_available_test_case_urls(year, month, day)
+
+
def to_ymd(date: datetime.date) -> str:
  """Serializes `date` as a YYYY-MM-DD string (the inverse of `from_ymd`)."""
  return f'{date.year:04d}-{date.month:02d}-{date.day:02d}'
+
+
def from_ymd(date_str: str) -> datetime.date:
  """Parses a YYYY-MM-DD string into a date (the inverse of `to_ymd`)."""
  parsed = datetime.datetime.strptime(date_str, '%Y-%m-%d')
  return parsed.date()
+
+
def persist_state(seen_urls: Iterable[str], state_file: str,
                  current_date: datetime.date):
  """Atomically persists the autouploader's state as JSON.

  The state is written to a sibling `.tmp` file and then renamed into place,
  so an interrupted write never leaves a half-written state file behind.

  Args:
    seen_urls: test-case URLs that have already been submitted.
    state_file: path of the JSON state file to (over)write.
    current_date: the most recent upload date processed so far.
  """
  state = {
      'already_seen': sorted(seen_urls),
      'most_recent_date': to_ymd(current_date),
  }
  scratch_file = state_file + '.tmp'
  with open(scratch_file, 'w', encoding='utf-8') as f:
    json.dump(state, f)
  os.rename(scratch_file, state_file)
+
+
@contextlib.contextmanager
def temp_dir() -> Generator[str, None, None]:
  """Context manager yielding a scratch directory that's removed on exit."""
  scratch = tempfile.mkdtemp('lexan-autosubmit')
  try:
    yield scratch
  finally:
    shutil.rmtree(scratch)
+
+
def submit_test_case(gs_url: str, cr_tool: str) -> None:
  """Downloads the tarball at `gs_url` and submits its contents to 4c.

  The tarball is expected to unpack to exactly two files of interest: a repro
  shell script (`*.sh`) and the source file it compiles.

  Args:
    gs_url: gs:// URL of a crash-report tarball.
    cr_tool: path to the 4c client binary used to submit the reduction job.
  """
  logging.info('Submitting %s', gs_url)
  suffix = os.path.splitext(gs_url)[1]
  with temp_dir() as tempdir:
    target_name = 'test_case' + suffix
    target = os.path.join(tempdir, target_name)
    # Fetch and unpack the tarball, then delete it so only its contents
    # remain in `tempdir`.
    subprocess.run(['gsutil', 'cp', gs_url, target], check=True)
    subprocess.run(['tar', 'xaf', target_name], check=True, cwd=tempdir)
    os.unlink(target)

    # Sometimes (e.g., in
    # gs://chrome-clang-crash-reports/v1/2020/03/27/
    # chromium.clang-ToTiOS-12754-GTXToolKit-2bfcde.tgz)
    # we'll get `.crash` files. Unclear why, but let's filter them out anyway.
    repro_files = [x for x in os.listdir(tempdir) if not x.endswith('.crash')]
    assert len(repro_files) == 2, repro_files
    # Work out which of the two remaining files is the shell script and which
    # is the source file; their order in the listing isn't guaranteed.
    if repro_files[0].endswith('.sh'):
      sh_file, src_file = repro_files
      assert not src_file.endswith('.sh'), repro_files
    else:
      src_file, sh_file = repro_files
      assert sh_file.endswith('.sh'), repro_files

    subprocess.run(
        [
            cr_tool,
            'reduce',
            '-stream=false',
            '-wait=false',
            '-note',
            gs_url,
            '-sh_file',
            os.path.join(tempdir, sh_file),
            '-src_file',
            os.path.join(tempdir, src_file),
        ],
        check=True,
    )
+
+
def submit_new_test_cases(
    last_seen_test_cases: Iterable[str],
    earliest_date_to_check: datetime.date,
    forcey: str,
    state_file_path: str,
) -> None:
  """Submits new test-cases to forcey.

  This will persist state after each test-case is submitted.

  Args:
    last_seen_test_cases: test-cases which have been submitted already, and
      should be skipped if seen again.
    earliest_date_to_check: the earliest date we should consider test-cases
      from.
    forcey: path to the forcey binary.
    state_file_path: path to our state file.
  """
  # `all_test_cases_seen` is the union of all test-cases seen on this and prior
  # invocations. It guarantees, in all cases we care about, that we won't
  # submit the same test-case twice. `test_cases_seen_this_invocation` is
  # persisted as "all of the test-cases we've seen on this and prior
  # invocations" if we successfully submit _all_ test-cases.
  #
  # Since you can visualize the test-cases this script considers as a sliding
  # window that only moves forward, if we saw a test-case on a prior iteration
  # but no longer see it, we'll never see it again (since it fell out of our
  # sliding window by being too old). Hence, keeping it around is
  # pointless.
  #
  # We only persist this minimized set of test-cases if _everything_ succeeds,
  # since if something fails below, there's a chance that we haven't revisited
  # test-cases that we've already seen.
  all_test_cases_seen = set(last_seen_test_cases)
  test_cases_seen_this_invocation = []
  most_recent_date = earliest_date_to_check
  for date, candidates in test_cases_on_or_after(earliest_date_to_check):
    # Use max() rather than plain assignment: days may arrive out of order,
    # and the persisted watermark must only ever move forward.
    most_recent_date = max(most_recent_date, date)

    for url in candidates:
      test_cases_seen_this_invocation.append(url)
      if url in all_test_cases_seen:
        continue

      all_test_cases_seen.add(url)
      submit_test_case(url, forcey)

      # Persisting on each iteration of this loop isn't free, but it's the
      # easiest way to not resubmit test-cases, and it's good to keep in mind
      # that:
      # - the state file will be small (<12KB, since it only keeps a few days
      #   worth of test-cases after the first run)
      # - in addition to this, we're downloading+unzipping+reuploading multiple
      #   MB of test-case bytes.
      #
      # So comparatively, the overhead here probably isn't an issue.
      persist_state(all_test_cases_seen, state_file_path, most_recent_date)

  # Everything succeeded: persist only the test-cases still visible inside
  # this invocation's sliding window, dropping ones that aged out.
  persist_state(test_cases_seen_this_invocation, state_file_path,
                most_recent_date)
+
+
def main(argv: List[str]):
  """Entry point: parses flags and submits any new Lexan crashes to 4c.

  Args:
    argv: command-line arguments, excluding the program name.
  """
  logging.basicConfig(
      format='>> %(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: '
      '%(message)s',
      level=logging.INFO,
  )

  my_dir = os.path.dirname(os.path.abspath(__file__))

  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument(
      '--state_file', default=os.path.join(my_dir, 'lexan-state.json'))
  parser.add_argument(
      '--last_date',
      help='The earliest date that we care about. All test cases from here '
      'on will be picked up. Format is YYYY-MM-DD.')
  parser.add_argument(
      '--4c', dest='forcey', required=True, help='Path to a 4c client binary')
  opts = parser.parse_args(argv)

  forcey = opts.forcey
  state_file = opts.state_file
  last_date_str = opts.last_date

  # `--state_file` may be a bare file name, in which case dirname() is ''
  # and there's nothing to create. `exist_ok` keeps reruns from dying with
  # FileExistsError once the directory exists.
  state_dir = os.path.dirname(state_file)
  if state_dir:
    os.makedirs(state_dir, mode=0o755, exist_ok=True)

  if last_date_str is None:
    with open(state_file, encoding='utf-8') as f:
      data = json.load(f)
    most_recent_date = from_ymd(data['most_recent_date'])
    submit_new_test_cases(
        last_seen_test_cases=data['already_seen'],
        # Note that we always subtract one day from this to avoid a race:
        # uploads may appear slightly out-of-order (or builders may lag, or
        # ...), so the last test-case uploaded for 2020/01/01 might appear
        # _after_ the first test-case for 2020/01/02. Assuming that builders
        # won't lag behind for over a day, the easiest way to handle this is to
        # always check the previous and current days.
        earliest_date_to_check=most_recent_date - datetime.timedelta(days=1),
        forcey=forcey,
        state_file_path=state_file,
    )
  else:
    submit_new_test_cases(
        last_seen_test_cases=(),
        earliest_date_to_check=from_ymd(last_date_str),
        forcey=forcey,
        state_file_path=state_file,
    )
+
+
# Script entry point; main() returns None, which sys.exit maps to status 0.
if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))
diff --git a/llvm_tools/upload_lexan_crashes_to_forcey_test.py b/llvm_tools/upload_lexan_crashes_to_forcey_test.py
new file mode 100755
index 0000000..3c9c0d4
--- /dev/null
+++ b/llvm_tools/upload_lexan_crashes_to_forcey_test.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright 2020 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Tests for upload_lexan_crashes_to_forcey."""
+
+import datetime
+import unittest
+import unittest.mock
+
+import upload_lexan_crashes_to_forcey
+
+
class Test(unittest.TestCase):
  """Tests for upload_lexan_crashes_to_forcey."""

  def test_date_parsing_functions(self):
    # from_ymd is the inverse of to_ymd; spot-check one round-trippable value.
    self.assertEqual(
        datetime.date(2020, 2, 1),
        upload_lexan_crashes_to_forcey.from_ymd('2020-02-01'))

  # The mocked dates are deliberately non-monotonic (2020-01-01 appears again
  # after 2020-01-02) to verify that the persisted `most_recent_date`
  # watermark only ever moves forward.
  @unittest.mock.patch(
      'upload_lexan_crashes_to_forcey.test_cases_on_or_after',
      return_value=(
          (
              datetime.date(2020, 1, 1),
              ('gs://test-case-1', 'gs://test-case-1.1'),
          ),
          (datetime.date(2020, 1, 2), ('gs://test-case-2',)),
          (datetime.date(2020, 1, 1), ('gs://test-case-3',)),
          (datetime.date(2020, 1, 4), ('gs://test-case-4',)),
      ))
  @unittest.mock.patch('upload_lexan_crashes_to_forcey.submit_test_case')
  @unittest.mock.patch('upload_lexan_crashes_to_forcey.persist_state')
  def test_new_test_case_submission_functions(self, persist_state_mock,
                                              submit_test_case_mock,
                                              test_cases_on_or_after_mock):
    forcey_path = '/path/to/4c'
    real_state_file_path = '/path/to/state/file'
    earliest_date = datetime.date(2020, 1, 1)

    persist_state_calls = []

    # Since the set this gets is mutated, we need to copy it somehow.
    def persist_state_side_effect(test_cases_to_persist, state_file_path,
                                  most_recent_date):
      self.assertEqual(state_file_path, real_state_file_path)
      persist_state_calls.append(
          (sorted(test_cases_to_persist), most_recent_date))

    persist_state_mock.side_effect = persist_state_side_effect

    upload_lexan_crashes_to_forcey.submit_new_test_cases(
        last_seen_test_cases=(
            'gs://test-case-0',
            'gs://test-case-1',
        ),
        earliest_date_to_check=earliest_date,
        forcey=forcey_path,
        state_file_path=real_state_file_path,
    )

    test_cases_on_or_after_mock.assert_called_once_with(earliest_date)
    # test-case-1 was already seen, so it's skipped; only new URLs get
    # submitted, in the order the mock yields them.
    self.assertEqual(submit_test_case_mock.call_args_list, [
        unittest.mock.call('gs://test-case-1.1', forcey_path),
        unittest.mock.call('gs://test-case-2', forcey_path),
        unittest.mock.call('gs://test-case-3', forcey_path),
        unittest.mock.call('gs://test-case-4', forcey_path),
    ])

    # One persist per submission, plus a final one that drops test-case-0
    # (it never reappeared, so it fell out of the sliding window).
    self.assertEqual(persist_state_calls, [
        (
            ['gs://test-case-0', 'gs://test-case-1', 'gs://test-case-1.1'],
            datetime.date(2020, 1, 1),
        ),
        (
            [
                'gs://test-case-0',
                'gs://test-case-1',
                'gs://test-case-1.1',
                'gs://test-case-2',
            ],
            datetime.date(2020, 1, 2),
        ),
        (
            [
                'gs://test-case-0',
                'gs://test-case-1',
                'gs://test-case-1.1',
                'gs://test-case-2',
                'gs://test-case-3',
            ],
            # Still 2020-01-02: the watermark must not move backwards even
            # though this batch was yielded with date 2020-01-01.
            datetime.date(2020, 1, 2),
        ),
        (
            [
                'gs://test-case-0',
                'gs://test-case-1',
                'gs://test-case-1.1',
                'gs://test-case-2',
                'gs://test-case-3',
                'gs://test-case-4',
            ],
            datetime.date(2020, 1, 4),
        ),
        (
            [
                'gs://test-case-1',
                'gs://test-case-1.1',
                'gs://test-case-2',
                'gs://test-case-3',
                'gs://test-case-4',
            ],
            datetime.date(2020, 1, 4),
        ),
    ])
+
+
# Allow running this file directly as a test suite.
if __name__ == '__main__':
  unittest.main()