scripts/tricium_clang_tidy.py - mirrors/cros/chromiumos/chromite - Git at Google

 # Copyright 2020 The Chromium OS Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """Runs clang-tidy across the given files, dumping diagnostics to a JSON file.

 This script is intended specifically for use with Tricium (go/tricium).
 """

 # From an implementation perspective, it's good to note that this script
 # cooperates with the toolchain's compiler wrapper. In particular,
 # ${cros}/src/third_party/toolchain-utils/compiler_wrapper/clang_tidy_flag.go.
 #
 # When |WITH_TIDY=tricium| is set and the wrapper (which is already $CC/$CXX)
 # is invoked, $CC will invoke clang-tidy _as well_ as the regular compiler.
 # This clang-tidy invocation will result in a few files being dumped to
 # |LINT_BASE| (below):
 #   - "${LINT_BASE}/some-prefix.yaml" -- a YAML file that represents
 #     clang-tidy's diagnostics for the file the compiler was asked to build
 #   - "${LINT_BASE}/some-prefix.json" -- metadata about how the above YAML file
 #     was generated, including clang-tidy's exit code, stdout, etc. See
 #     |InvocationMetadata| below.
 #
 # As one might expect, the compiler wrapper writes the JSON file only after
 # clang-tidy is done executing.
 #
 # This directory might contain other files, as well; these are ignored by this
 # script.

 import bisect
 import json
 import multiprocessing
 import os
 from pathlib import Path
 import re
 import subprocess
 import sys
 import tempfile
 import traceback
 from typing import (Any, Dict, Iterable, List, NamedTuple, Optional, Set, Tuple,
                     Union)

 import yaml  # pylint: disable=import-error
 from chromite.lib import commandline
 from chromite.lib import cros_build_lib
 from chromite.lib import cros_logging as logging
 from chromite.lib import osutils
 from chromite.lib import portage_util
 from chromite.lib import workon_helper

 assert sys.version_info >= (3, 6), 'This module requires Python 3.6+'

 # The directory under which the compiler wrapper stores clang-tidy reports.
 LINT_BASE = Path('/tmp/linting_output/clang-tidy')


 class TidyReplacement(NamedTuple):
   """A single text substitution suggested by clang-tidy.

   No file path is stored here: a replacement is always attached to a
   TidyDiagnostic, and applies to that diagnostic's file.
   """
   # Text to put in place of the replaced range.
   new_text: str
   # Lines on which the replaced range starts and ends.
   start_line: int
   end_line: int
   # Positions of the range's start and end within those lines.
   start_char: int
   end_char: int


 class TidyExpandedFrom(NamedTuple):
   """One level of macro expansion that a diagnostic was reported inside of.

   clang-tidy reports where a macro was expanded from whenever a diagnostic
   lands inside of a macro expansion. A |TidyDiagnostic| carries one
   |TidyExpandedFrom| per level of that expansion.
   """
   file_path: Path
   line_number: int

   def to_dict(self) -> Dict[str, Any]:
     """Returns a JSON-serializable dict describing this expansion."""
     posix_path = self.file_path.as_posix()
     return dict(file_path=posix_path, line_number=self.line_number)


 class Error(Exception):
   """Root of the exception hierarchy used by tricium-clang-tidy."""


 class ClangTidyParseError(Error):
   """Signals that one or more clang-tidy parsing jobs failed.

   Attributes:
     failed_jobs: how many parse jobs failed.
     total_jobs: how many parse jobs were attempted in total.
   """

   def __init__(self, failed_jobs: int, total_jobs: int):
     self.failed_jobs = failed_jobs
     self.total_jobs = total_jobs
     summary = f'{failed_jobs}/{total_jobs} parse jobs failed'
     super().__init__(summary)


 class TidyDiagnostic(NamedTuple):
   """A diagnostic emitted by clang-tidy.

   Note that we shove these in a set for cheap deduplication, and we sort based
   on the natural element order here. Sorting is mostly just for
   deterministic/pretty output.
   """
   # |None| for diagnostics that clang-tidy reported without a file name.
   # |to_dict| and |normalize_paths_to| require a real path.
   file_path: Optional[Path]
   line_number: int
   diag_name: str
   message: str
   # Both of these are tuples of arbitrary length (hence the `...`), and are
   # tuples rather than lists so instances remain hashable.
   replacements: Tuple[TidyReplacement, ...]
   expansion_locs: Tuple[TidyExpandedFrom, ...]

   def normalize_paths_to(self, where: Union[str, Path]) -> 'TidyDiagnostic':
     """Creates a new TidyDiagnostic with all paths relative to |where|.

     Args:
       where: the directory that paths should be made relative to.

     Returns:
       A copy of this diagnostic in which |file_path| and the |file_path| of
       every expansion location are relative to |where|.
     """

     def relative(path: Path) -> Path:
       # Use relpath because Path.relative_to requires that `path` is rooted
       # at `where`.
       return Path(os.path.relpath(path, where))

     return self._replace(
         file_path=relative(self.file_path),
         expansion_locs=tuple(
             x._replace(file_path=relative(x.file_path))
             for x in self.expansion_locs))

   def to_dict(self) -> Dict[str, Any]:
     """Converts this |TidyDiagnostic| to a dict serializeable as JSON."""
     return {
         'file_path': self.file_path.as_posix(),
         'line_number': self.line_number,
         'diag_name': self.diag_name,
         'message': self.message,
         'replacements': [x._asdict() for x in self.replacements],
         'expansion_locs': [x.to_dict() for x in self.expansion_locs],
     }


 class ClangTidySchemaError(Error):
   """Signals that clang-tidy's YAML didn't have the structure we expect.

   Attributes:
     err_msg: a description of what was wrong with the YAML.
   """

   def __init__(self, err_msg: str):
     self.err_msg = err_msg
     super().__init__(err_msg)


 class LineOffsetMap:
   """Translates character offsets in a file into line numbers and columns."""

   def __init__(self, newline_locations: Iterable[int]):
     """Builds a map from the offsets of each newline character in a file."""
     # Every line starts one character past the newline that ended the previous
     # line. The two sentinels (0 for the start of the first line, sys.maxsize
     # for a start past the last line) spare the |bisect| lookups below from
     # special-casing the first and last lines of a file.
     starts = sorted([0, sys.maxsize, *(x + 1 for x in newline_locations)])

     assert starts[0] == 0, starts[0]
     assert starts[1] != 0, starts[1]
     assert starts[-2] < sys.maxsize, starts[-2]
     assert starts[-1] == sys.maxsize, starts[-1]

     self._line_starts = starts

   def get_line_number(self, char_number: int) -> int:
     """Returns the one-based line number that |char_number| falls on."""
     assert 0 <= char_number < sys.maxsize, char_number
     # The number of line starts at or before |char_number| is exactly the
     # (one-based) number of the line containing it.
     return bisect.bisect_right(self._line_starts, char_number)

   def get_line_offset(self, char_number: int) -> int:
     """Returns the zero-based column of |char_number| within its line."""
     assert 0 <= char_number < sys.maxsize, char_number
     containing_line = bisect.bisect_right(self._line_starts, char_number)
     return char_number - self._line_starts[containing_line - 1]

   @staticmethod
   def for_text(data: str) -> 'LineOffsetMap':
     """Builds a LineOffsetMap describing the lines of |data|."""
     return LineOffsetMap(i for i, ch in enumerate(data) if ch == '\n')


 def parse_tidy_fixes_file(tidy_invocation_dir: Path,
                           yaml_data: Any) -> Iterable[TidyDiagnostic]:
   """Parses the contents of a clang-tidy fixes YAML file.

   Args:
     tidy_invocation_dir: The directory clang-tidy was run in. Must be
       absolute; relative paths in |yaml_data| are resolved against it.
     yaml_data: The parsed YAML data from clang-tidy's fixits file. If this is
       |None|, no diagnostics are produced.

   Yields:
     One |TidyDiagnostic| per entry in |yaml_data['Diagnostics']|.

   Raises:
     ClangTidySchemaError: if an expected key is missing from |yaml_data|.
       Since this is a generator, this is raised during iteration.
   """
   assert tidy_invocation_dir.is_absolute(), tidy_invocation_dir

   if yaml_data is None:
     return

   # A cache of file_path => LineOffsetMap so we only need to load offsets once
   # per file per |parse_tidy_fixes_file| invocation.
   cached_line_offsets = {}

   def get_line_offsets(file_path: Optional[Path]) -> LineOffsetMap:
     """Gets a LineOffsetMap for the given |file_path|."""
     assert not file_path or file_path.is_absolute(), file_path

     if file_path in cached_line_offsets:
       return cached_line_offsets[file_path]

     # Sometimes tidy will give us empty file names; they don't map to any file,
     # and are generally issues it has with CFLAGS, etc. File offsets don't
     # matter in those, so use an empty map.
     #
     # NOTE(review): clang-tidy presumably reports offsets in bytes, while this
     # map is built from decoded text; the two may disagree for files that
     # contain non-ASCII characters -- confirm.
     if file_path:
       offsets = LineOffsetMap.for_text(file_path.read_text(encoding='utf-8'))
     else:
       offsets = LineOffsetMap(())
     cached_line_offsets[file_path] = offsets
     return offsets

   # Rarely (e.g., in the case of missing |#include|s), clang will emit relative
   # file paths for diagnostics. This fixes those.
   def makeabs(file_path: Optional[str]) -> Optional[Path]:
     """Resolves a |file_path| emitted by clang-tidy to an absolute path.

     Empty file names have no path to resolve, so |None| is returned for them.
     """
     if not file_path:
       return None
     path = Path(file_path)
     if not path.is_absolute():
       path = tidy_invocation_dir / path
     return path.resolve()

   try:
     for diag in yaml_data['Diagnostics']:
       message = diag['DiagnosticMessage']
       file_path = message['FilePath']

       absolute_file_path = makeabs(file_path)
       line_offsets = get_line_offsets(absolute_file_path)

       replacements = []
       for replacement in message.get('Replacements', ()):
         replacement_file_path = makeabs(replacement['FilePath'])

         # FIXME(gbiv): This happens in practice with things like
         # hicpp-member-init. Supporting it should be simple, but I'd like to
         # get the basics running first.
         if replacement_file_path != absolute_file_path:
           logging.warning(
               "Replacement %r wasn't in original file %r (diag: %r)",
               replacement_file_path, file_path, diag)
           continue

         start_offset = replacement['Offset']
         end_offset = start_offset + replacement['Length']
         replacements.append(
             TidyReplacement(
                 new_text=replacement['ReplacementText'],
                 start_line=line_offsets.get_line_number(start_offset),
                 end_line=line_offsets.get_line_number(end_offset),
                 start_char=line_offsets.get_line_offset(start_offset),
                 end_char=line_offsets.get_line_offset(end_offset),
             ))

       # Only notes describing macro expansions are kept; all others are
       # ignored.
       expansion_locs = []
       for note in diag.get('Notes', ()):
         if not note['Message'].startswith('expanded from macro '):
           continue

         absolute_note_path = makeabs(note['FilePath'])
         note_offsets = get_line_offsets(absolute_note_path)
         expansion_locs.append(
             TidyExpandedFrom(
                 file_path=absolute_note_path,
                 line_number=note_offsets.get_line_number(note['FileOffset']),
             ))

       yield TidyDiagnostic(
           diag_name=diag['DiagnosticName'],
           message=message['Message'],
           file_path=absolute_file_path,
           line_number=line_offsets.get_line_number(message['FileOffset']),
           replacements=tuple(replacements),
           expansion_locs=tuple(expansion_locs),
       )
   except KeyError as k:
     key_name = k.args[0]
     # Chain the original error so the failing lookup stays in the traceback.
     raise ClangTidySchemaError(
         f'Broken yaml: missing key {key_name!r}') from k


 # Represents metadata about a clang-tidy invocation.
 class InvocationMetadata(NamedTuple):
   """Describes how one run of clang-tidy was invoked, and how it went."""
   # The exit code of the invocation.
   exit_code: int
   # The executable that was run, followed by its arguments.
   invocation: List[str]
   # What was being linted.
   lint_target: str
   # Output captured from the invocation.
   stdstreams: str
   # The directory the invocation was run in.
   wd: str


 class ExceptionData:
   """A snapshot of an exception that is safe to hand to another process.

   Only the formatted traceback text is stored, rather than the exception
   object itself.
   """

   def __init__(self):
     """Captures the exception currently being handled.

     This is only meaningful when called from inside of an `except` block.
     """
     self._traceback_text = traceback.format_exc()

   def __str__(self):
     """Returns the formatted traceback captured at construction time."""
     return self._traceback_text


 def parse_tidy_invocation(
     json_file: Path,
 ) -> Union[ExceptionData, Tuple[InvocationMetadata, List[TidyDiagnostic]]]:
   """Parses a clang-tidy invocation result based on a JSON file.

   This is intended to be run in a separate process, which Exceptions and
   locking and such work notoriously poorly over, so it's never intended to
   |raise| (except under a KeyboardInterrupt or similar).

   Args:
     json_file: The JSON invocation metadata file to parse. The diagnostics
       themselves are read from the file with the same name, but a `.yaml`
       suffix.

   Returns:
     An |ExceptionData| instance on failure. On success, it returns a
     (InvocationMetadata, [TidyDiagnostic]).
   """
   try:
     assert json_file.suffix == '.json', json_file

     with json_file.open(encoding='utf-8') as f:
       raw_meta = json.load(f)

     meta = InvocationMetadata(
         exit_code=raw_meta['exit_code'],
         invocation=[raw_meta['executable']] + raw_meta['args'],
         lint_target=raw_meta['lint_target'],
         stdstreams=raw_meta['stdstreams'],
         wd=raw_meta['wd'],
     )

     raw_crash_output = raw_meta.get('crash_output')
     if raw_crash_output:
       crash_reproducer_path = raw_crash_output['crash_reproducer_path']
       output = raw_crash_output['stdstreams']
       raise RuntimeError(f"""\
 Clang-tidy apparently crashed; dumping lots of invocation info:
 ## Tidy JSON file target: {json_file}
 ## Invocation: {meta.invocation}
 ## Target: {meta.lint_target}
 ## Crash reproducer is at: {crash_reproducer_path}
 ## Output producing reproducer:
 {output}
 ## Output from the crashing invocation:
 {meta.stdstreams}
 """)

     yaml_file = json_file.with_suffix('.yaml')
     # If this happened, clang-tidy was probably killed. Dump output as part of
     # the exception so it's easier to reason about what happened.
     if not yaml_file.exists():
       raise RuntimeError("clang-tidy didn't produce an output file for "
                          f'{json_file}. Output:\n{meta.stdstreams}')

     with yaml_file.open('rb') as f:
       # |yaml.load| without an explicit |Loader| is deprecated, and is
       # rejected outright by newer PyYAML releases. The parsing code only
       # reads plain mappings, sequences, and scalars out of this data, so the
       # safe loader is sufficient.
       yaml_data = yaml.safe_load(f)
     return meta, list(parse_tidy_fixes_file(Path(meta.wd), yaml_data))
   except Exception:
     # Deliberately broad: every failure is reported to the caller as data,
     # per the docstring above.
     return ExceptionData()


 def generate_lints(board: str, ebuild_path: str) -> Path:
   """Builds a package with clang-tidy enabled, and gathers the results.

   Args:
     board: the board to collect lints for.
     ebuild_path: the path to the ebuild to collect lints for.

   Returns:
     The path to a freshly created tmpdir holding a copy of everything that
     ended up in |LINT_BASE|: the lint YAML files (if any), plus the JSON files
     containing InvocationMetadata. Our compiler wrapper is what generates
     these files.
   """
   logging.info('Running lints for %r on board %r', ebuild_path, board)

   # Start from an empty |LINT_BASE|, so only this build's reports are found.
   osutils.RmDir(LINT_BASE, ignore_missing=True, sudo=True)
   osutils.SafeMakedirs(LINT_BASE, 0o777, sudo=True)

   # FIXME(gbiv): |test| might be better here?
   ebuild_cmd = [f'ebuild-{board}', ebuild_path, 'clean', 'compile']
   result = cros_build_lib.run(
       ebuild_cmd,
       check=False,
       print_cmd=True,
       extra_env={'WITH_TIDY': 'tricium'},
       capture_output=True,
       encoding='utf-8',
       errors='replace',
   )

   # A failed build is only warned about; whatever reports were written are
   # still collected below.
   if not result.returncode:
     log_fn = logging.info
     status = 'succeeded'
   else:
     log_fn = logging.warning
     status = f'failed with code {result.returncode}; output:\n{result.stdout}'
   log_fn('Running |ebuild| on %s %s', ebuild_path, status)

   lint_tmpdir = tempfile.mkdtemp(prefix='tricium_tidy')
   osutils.CopyDirContents(LINT_BASE, lint_tmpdir)
   return Path(lint_tmpdir)


 def collect_lints(lint_tmpdir: Path,
                   yaml_pool: multiprocessing.Pool) -> Set[TidyDiagnostic]:
   """Gathers all diagnostics described by the lint artifacts in a directory.

   Args:
     lint_tmpdir: a directory containing `*.json` invocation metadata files.
     yaml_pool: the pool used to parse those files in parallel.

   Returns:
     The deduplicated set of diagnostics from all files.

   Raises:
     ClangTidyParseError: if parsing any of the files failed. This is only
       raised after every file has been processed.
   """
   json_files = list(lint_tmpdir.glob('*.json'))
   outcomes = yaml_pool.imap(parse_tidy_invocation, json_files)

   failure_count = 0
   diagnostics = set()
   for json_path, outcome in zip(json_files, outcomes):
     if isinstance(outcome, ExceptionData):
       failure_count += 1
       logging.error('Parsing %r failed with an exception\n%s', json_path,
                     outcome)
       continue

     meta, complaints = outcome
     # A nonzero exit code is only worth a warning; any diagnostics that were
     # parsed are kept regardless.
     if meta.exit_code:
       logging.warning(
           'Invoking clang-tidy on %r with flags %r exited with code %d; '
           'output:\n%s',
           meta.lint_target,
           meta.invocation,
           meta.exit_code,
           meta.stdstreams,
       )

     diagnostics.update(complaints)

   if failure_count:
     raise ClangTidyParseError(failure_count, len(json_files))

   return diagnostics


 def setup_tidy(board: str, ebuild_list: List[portage_util.EBuild]):
   """Prepares the given board for running clang-tidy on the given ebuilds.

   Args:
     board: the board to set up.
     ebuild_list: the ebuilds that will be linted.
   """
   packages = [ebuild.package for ebuild in ebuild_list]
   logging.info('Setting up to lint %r', packages)

   sysroot = cros_build_lib.GetSysroot(board)
   workon = workon_helper.WorkonHelper(sysroot)
   workon.StopWorkingOnPackages(packages=[], use_all=True)
   workon.StartWorkingOnPackages(packages)

   # Since each `emerge` may eat up to `ncpu` cores, limit the maximum
   # concurrency we can get here to (arbitrarily) 8 jobs. Having
   # `configure`s and such run in parallel is nice.
   max_jobs = min(8, multiprocessing.cpu_count())

   # We're going to be hacking with |ebuild| later on, so having all
   # dependencies in place is necessary so one |ebuild| won't stomp on another.
   emerge_cmd = [f'emerge-{board}', '--onlydeps', f'-j{max_jobs}', *packages]
   result = cros_build_lib.run(emerge_cmd, print_cmd=True, check=False)
   if not result.returncode:
     return

   logging.error('Setup failed with exit code %d; some lints may fail.',
                 result.returncode)


 def run_tidy(board: str, ebuild_list: List[portage_util.EBuild],
              keep_dirs: bool,
              parse_errors_are_nonfatal: bool) -> Set[TidyDiagnostic]:
   """Runs clang-tidy on the given ebuilds for the given board.

   Args:
     board: the board to build for.
     ebuild_list: the ebuilds to build and lint, one at a time.
     keep_dirs: if True, the directory of lint artifacts for each ebuild is
       left in place (and logged) rather than removed.
     parse_errors_are_nonfatal: if True, failures to parse clang-tidy's output
       are logged rather than raised.

   Returns:
     The set of |TidyDiagnostic|s produced by doing so.

   Raises:
     ClangTidyParseError: if clang-tidy's output couldn't be parsed, and
       |parse_errors_are_nonfatal| is False.
   """
   # Since we rely on build actions _actually_ running, we can't live with a
   # cache.
   osutils.RmDir(
       Path(cros_build_lib.GetSysroot(board)) / 'var' / 'cache' / 'portage',
       ignore_missing=True,
       sudo=True,
   )

   results = set()
   # If clang-tidy dumps a lot of diags, it can take 1-10secs of CPU while
   # holding the GIL to |yaml.load| on my otherwise-idle dev box. |yaml_pool|
   # lets us do this in parallel.
   #
   # This uses |multiprocessing.Pool| rather than |multiprocessing.pool.Pool|,
   # since a bare |import multiprocessing| doesn't guarantee that the
   # |multiprocessing.pool| submodule has been loaded.
   with multiprocessing.Pool() as yaml_pool:
     for ebuild in ebuild_list:
       lint_tmpdir = generate_lints(board, ebuild.ebuild_path)
       try:
         results |= collect_lints(lint_tmpdir, yaml_pool)
       except ClangTidyParseError:
         if not parse_errors_are_nonfatal:
           raise
         logging.exception('Working on %r', ebuild)
       finally:
         if keep_dirs:
           logging.info('Lints for %r are in %r', ebuild.ebuild_path,
                        lint_tmpdir)
         else:
           osutils.RmDir(lint_tmpdir, ignore_missing=True, sudo=True)
   return results


 def resolve_package_ebuilds(board: str,
                             package_names: Iterable[str]) -> List[str]:
   """Figures out ebuild paths for the given package names.

   Args:
     board: the board whose `equery` is used to look packages up.
     package_names: package names to resolve. Anything ending in `.ebuild` is
       assumed to already be an ebuild path, and is passed through untouched.

   Returns:
     A list of ebuild paths, in the same order as |package_names|.
   """
   # A bare |import multiprocessing| doesn't guarantee that the
   # |multiprocessing.pool| submodule has been loaded, so import it explicitly.
   # pylint: disable=import-outside-toplevel
   from multiprocessing.pool import ThreadPool

   def resolve_package(package_name_or_ebuild):
     """Resolves a single package name to an ebuild path."""
     if package_name_or_ebuild.endswith('.ebuild'):
       return package_name_or_ebuild
     return cros_build_lib.run([f'equery-{board}', 'w', package_name_or_ebuild],
                               check=True,
                               stdout=subprocess.PIPE,
                               encoding='utf-8').stdout.strip()

   # Resolving ebuilds takes time. If we get more than one (like when I'm
   # testing on 50 of them), parallelism speeds things up quite a bit.
   with ThreadPool() as pool:
     return pool.map(resolve_package, package_names)


 def filter_tidy_lints(only_files: Optional[Set[Path]],
                       git_repo_base: Optional[Path],
                       diags: Iterable[TidyDiagnostic]) -> List[TidyDiagnostic]:
   """Narrows down the given TidyDiagnostics, and normalizes their paths.

   Args:
     only_files: a set of file paths, or None; if this is not None (and not
       empty), only |TidyDiagnostic|s in these files will be kept.
     git_repo_base: if not None, only diagnostics for files somewhere under
       this directory are kept, and all paths of the returned diagnostics are
       made relative to it.
     diags: diagnostics to transform/filter.

   Returns:
     A sorted list of |TidyDiagnostic|s.
   """
   kept = []
   seen = 0

   for seen, diag in enumerate(diags, start=1):
     if not diag.file_path:
       # Things like |-DFOO=1 -DFOO=2| can trigger diagnostics ("oh no you're
       # redefining |FOO| with a different value") in 'virtual' files; these
       # receive no name in clang.
       logging.info('Dropping diagnostic %r, since it has no associated file',
                    diag)
       continue

     diag_path = Path(diag.file_path)
     if only_files and diag_path not in only_files:
       continue

     if git_repo_base:
       if git_repo_base not in diag_path.parents:
         continue
       diag = diag.normalize_paths_to(git_repo_base)

     kept.append(diag)

   logging.info('Dropped %d/%d diags', seen - len(kept), seen)
   return sorted(kept)


 def get_parser() -> commandline.ArgumentParser:
   """Builds the command-line argument parser for this script."""
   parser = commandline.ArgumentParser(description=__doc__)
   add_arg = parser.add_argument

   # Required arguments.
   add_arg(
       '--output', required=True, type='path', help='File to write results to.')
   add_arg('--board', required=True, help='Board to run under.')
   add_arg(
       '--package',
       action='append',
       required=True,
       help='Package(s) to build and lint. Required.')

   # Optional filtering of the diagnostics that get emitted.
   add_arg(
       '--git-repo-base',
       type='path',
       help="Base directory of the git repo we're looking at. If specified, "
       'only diagnostics in files in this directory will be emitted. All '
       'diagnostic file paths will be made relative to this directory.')
   add_arg(
       'file',
       nargs='*',
       type='path',
       help='File(s) to output lints for. If none are specified, this tool '
       'outputs all lints that clang-tidy emits after applying filtering '
       'from |--git-repo-base|, if applicable.')

   # Debugging/robustness knobs.
   add_arg(
       '--keep-lint-dirs',
       action='store_true',
       help='Keep directories with tidy lints around; meant primarily for '
       'debugging.')
   add_arg(
       '--nonfatal-parse-errors',
       action='store_true',
       help="Keep going even if clang-tidy's output is impossible to parse.")
   return parser


 def main(argv: List[str]) -> None:
   """Lints the requested packages, and writes the results out as JSON.

   Args:
     argv: command-line arguments; see |get_parser| for what's accepted.
   """
   cros_build_lib.AssertInsideChroot()
   parser = get_parser()
   opts = parser.parse_args(argv)
   opts.Freeze()

   only_files = set(Path(f).resolve() for f in opts.file)

   git_repo_base = opts.git_repo_base
   if git_repo_base:
     git_repo_base = Path(git_repo_base)
     has_git_dir = (git_repo_base / '.git').exists()
     if not has_git_dir:
       # This script doesn't strictly care if there's a .git dir there; more of
       # a smoke check.
       parser.error(f'Given git repo base ({git_repo_base}) has no .git dir')

   ebuild_paths = resolve_package_ebuilds(opts.board, opts.package)
   package_ebuilds = [portage_util.EBuild(path) for path in ebuild_paths]

   setup_tidy(opts.board, package_ebuilds)
   all_diags = run_tidy(opts.board, package_ebuilds, opts.keep_lint_dirs,
                        opts.nonfatal_parse_errors)
   lints = filter_tidy_lints(only_files, git_repo_base, diags=all_diags)

   report = {'tidy_diagnostics': [lint.to_dict() for lint in lints]}
   osutils.WriteFile(opts.output, json.dumps(report), atomic=True)
	# Copyright 2020 The Chromium OS Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""Runs clang-tidy across the given files, dumping diagnostics to a JSON file.

	This script is intended specifically for use with Tricium (go/tricium).
	"""

	# From an implementation perspective, it's good to note that this script
	# cooperates with the toolchain's compiler wrapper. In particular,
	# ${cros}/src/third_party/toolchain-utils/compiler_wrapper/clang_tidy_flag.go.
	#
	# When |WITH_TIDY=tricium| is set and the wrapper (which is already $CC/$CXX)
	# is invoked, $CC will invoke clang-tidy _as well_ as the regular compiler.
	# This clang-tidy invocation will result in a few files being dumped to
	# |LINT_BASE| (below):
	#   - "${LINT_BASE}/some-prefix.yaml" -- a YAML file that represents
	#     clang-tidy's diagnostics for the file the compiler was asked to build
	#   - "${LINT_BASE}/some-prefix.json" -- metadata about how the above YAML file
	#     was generated, including clang-tidy's exit code, stdout, etc. See
	#     |InvocationMetadata| below.
	#
	# As one might expect, the compiler wrapper writes the JSON file only after
	# clang-tidy is done executing.
	#
	# This directory might contain other files, as well; these are ignored by this
	# script.

	import bisect
	import json
	import multiprocessing
	import os
	from pathlib import Path
	import re
	import subprocess
	import sys
	import tempfile
	import traceback
	from typing import (Any, Dict, Iterable, List, NamedTuple, Optional, Set, Tuple,
	Union)

	import yaml # pylint: disable=import-error
	from chromite.lib import commandline
	from chromite.lib import cros_build_lib
	from chromite.lib import cros_logging as logging
	from chromite.lib import osutils
	from chromite.lib import portage_util
	from chromite.lib import workon_helper

	assert sys.version_info >= (3, 6), 'This module requires Python 3.6+'

	# The directory under which the compiler wrapper stores clang-tidy reports.
	LINT_BASE = Path('/tmp/linting_output/clang-tidy')


	class TidyReplacement(NamedTuple):
	  """A single text substitution suggested by clang-tidy.

	  No file path is stored here: a replacement is always attached to a
	  TidyDiagnostic, and applies to that diagnostic's file.
	  """
	  # Text to put in place of the replaced range.
	  new_text: str
	  # Lines on which the replaced range starts and ends.
	  start_line: int
	  end_line: int
	  # Positions of the range's start and end within those lines.
	  start_char: int
	  end_char: int


	class TidyExpandedFrom(NamedTuple):
	  """One level of macro expansion that a diagnostic was reported inside of.

	  clang-tidy reports where a macro was expanded from whenever a diagnostic
	  lands inside of a macro expansion. A |TidyDiagnostic| carries one
	  |TidyExpandedFrom| per level of that expansion.
	  """
	  file_path: Path
	  line_number: int

	  def to_dict(self) -> Dict[str, Any]:
	    """Returns a JSON-serializable dict describing this expansion."""
	    return {
	        'file_path': self.file_path.as_posix(),
	        'line_number': self.line_number,
	    }


	class Error(Exception):
	  """Root of the exception hierarchy used by tricium-clang-tidy."""


	class ClangTidyParseError(Error):
	  """Signals that one or more clang-tidy parsing jobs failed.

	  Attributes:
	    failed_jobs: how many parse jobs failed.
	    total_jobs: how many parse jobs were attempted in total.
	  """

	  def __init__(self, failed_jobs: int, total_jobs: int):
	    super().__init__(f'{failed_jobs}/{total_jobs} parse jobs failed')
	    self.failed_jobs = failed_jobs
	    self.total_jobs = total_jobs


	class TidyDiagnostic(NamedTuple):
	  """A diagnostic emitted by clang-tidy.

	  Note that we shove these in a set for cheap deduplication, and we sort based
	  on the natural element order here. Sorting is mostly just for
	  deterministic/pretty output.
	  """
	  # |None| for diagnostics that clang-tidy reported without a file name.
	  # |to_dict| and |normalize_paths_to| require a real path.
	  file_path: Optional[Path]
	  line_number: int
	  diag_name: str
	  message: str
	  # Both of these are tuples of arbitrary length (hence the `...`), and are
	  # tuples rather than lists so instances remain hashable.
	  replacements: Tuple[TidyReplacement, ...]
	  expansion_locs: Tuple[TidyExpandedFrom, ...]

	  def normalize_paths_to(self, where: Union[str, Path]) -> 'TidyDiagnostic':
	    """Creates a new TidyDiagnostic with all paths relative to |where|.

	    Args:
	      where: the directory that paths should be made relative to.

	    Returns:
	      A copy of this diagnostic in which |file_path| and the |file_path| of
	      every expansion location are relative to |where|.
	    """

	    def relative(path: Path) -> Path:
	      # Use relpath because Path.relative_to requires that `path` is rooted
	      # at `where`.
	      return Path(os.path.relpath(path, where))

	    return self._replace(
	        file_path=relative(self.file_path),
	        expansion_locs=tuple(
	            x._replace(file_path=relative(x.file_path))
	            for x in self.expansion_locs))

	  def to_dict(self) -> Dict[str, Any]:
	    """Converts this |TidyDiagnostic| to a dict serializeable as JSON."""
	    return {
	        'file_path': self.file_path.as_posix(),
	        'line_number': self.line_number,
	        'diag_name': self.diag_name,
	        'message': self.message,
	        'replacements': [x._asdict() for x in self.replacements],
	        'expansion_locs': [x.to_dict() for x in self.expansion_locs],
	    }


	class ClangTidySchemaError(Error):
	  """Signals that clang-tidy's YAML didn't have the structure we expect.

	  Attributes:
	    err_msg: a description of what was wrong with the YAML.
	  """

	  def __init__(self, err_msg: str):
	    super().__init__(err_msg)
	    self.err_msg = err_msg


	class LineOffsetMap:
	  """Translates character offsets in a file into line numbers and columns."""

	  def __init__(self, newline_locations: Iterable[int]):
	    """Builds a map from the offsets of each newline character in a file."""
	    line_starts = [x + 1 for x in newline_locations]
	    # The |bisect| logic in |get_line_number|/|get_line_offset| gets a bit
	    # complicated around the first and last lines of a file. Adding boundaries
	    # here removes some complexity from those implementations.
	    line_starts.append(0)
	    line_starts.append(sys.maxsize)
	    line_starts.sort()

	    assert line_starts[0] == 0, line_starts[0]
	    assert line_starts[1] != 0, line_starts[1]
	    assert line_starts[-2] < sys.maxsize, line_starts[-2]
	    assert line_starts[-1] == sys.maxsize, line_starts[-1]

	    self._line_starts = line_starts

	  def get_line_number(self, char_number: int) -> int:
	    """Returns the one-based line number that |char_number| falls on."""
	    assert 0 <= char_number < sys.maxsize, char_number
	    # The number of line starts at or before |char_number| is exactly the
	    # (one-based) number of the line containing it.
	    return bisect.bisect_right(self._line_starts, char_number)

	  def get_line_offset(self, char_number: int) -> int:
	    """Returns the zero-based column of |char_number| within its line."""
	    assert 0 <= char_number < sys.maxsize, char_number
	    line_start_index = bisect.bisect_right(self._line_starts, char_number) - 1
	    return char_number - self._line_starts[line_start_index]

	  @staticmethod
	  def for_text(data: str) -> 'LineOffsetMap':
	    """Builds a LineOffsetMap describing the lines of |data|."""
	    return LineOffsetMap(m.start() for m in re.finditer(r'\n', data))


	def parse_tidy_fixes_file(tidy_invocation_dir: Path,
	                          yaml_data: Any) -> Iterable[TidyDiagnostic]:
	  """Parses the contents of a clang-tidy fixes YAML file.

	  Args:
	    tidy_invocation_dir: The directory clang-tidy was run in. Must be
	      absolute; relative paths in |yaml_data| are resolved against it.
	    yaml_data: The parsed YAML data from clang-tidy's fixits file. If this is
	      |None|, no diagnostics are produced.

	  Yields:
	    One |TidyDiagnostic| per entry in |yaml_data['Diagnostics']|.

	  Raises:
	    ClangTidySchemaError: if an expected key is missing from |yaml_data|.
	      Since this is a generator, this is raised during iteration.
	  """
	  assert tidy_invocation_dir.is_absolute(), tidy_invocation_dir

	  if yaml_data is None:
	    return

	  # A cache of file_path => LineOffsetMap so we only need to load offsets once
	  # per file per |parse_tidy_fixes_file| invocation.
	  cached_line_offsets = {}

	  def get_line_offsets(file_path: Optional[Path]) -> LineOffsetMap:
	    """Gets a LineOffsetMap for the given |file_path|."""
	    assert not file_path or file_path.is_absolute(), file_path

	    if file_path in cached_line_offsets:
	      return cached_line_offsets[file_path]

	    # Sometimes tidy will give us empty file names; they don't map to any file,
	    # and are generally issues it has with CFLAGS, etc. File offsets don't
	    # matter in those, so use an empty map.
	    #
	    # NOTE(review): clang-tidy presumably reports offsets in bytes, while this
	    # map is built from decoded text; the two may disagree for files that
	    # contain non-ASCII characters -- confirm.
	    if file_path:
	      offsets = LineOffsetMap.for_text(file_path.read_text(encoding='utf-8'))
	    else:
	      offsets = LineOffsetMap(())
	    cached_line_offsets[file_path] = offsets
	    return offsets

	  # Rarely (e.g., in the case of missing |#include|s), clang will emit relative
	  # file paths for diagnostics. This fixes those.
	  def makeabs(file_path: Optional[str]) -> Optional[Path]:
	    """Resolves a |file_path| emitted by clang-tidy to an absolute path.

	    Empty file names have no path to resolve, so |None| is returned for them.
	    """
	    if not file_path:
	      return None
	    path = Path(file_path)
	    if not path.is_absolute():
	      path = tidy_invocation_dir / path
	    return path.resolve()

	  try:
	    for diag in yaml_data['Diagnostics']:
	      message = diag['DiagnosticMessage']
	      file_path = message['FilePath']

	      absolute_file_path = makeabs(file_path)
	      line_offsets = get_line_offsets(absolute_file_path)

	      replacements = []
	      for replacement in message.get('Replacements', ()):
	        replacement_file_path = makeabs(replacement['FilePath'])

	        # FIXME(gbiv): This happens in practice with things like
	        # hicpp-member-init. Supporting it should be simple, but I'd like to
	        # get the basics running first.
	        if replacement_file_path != absolute_file_path:
	          logging.warning(
	              "Replacement %r wasn't in original file %r (diag: %r)",
	              replacement_file_path, file_path, diag)
	          continue

	        start_offset = replacement['Offset']
	        end_offset = start_offset + replacement['Length']
	        replacements.append(
	            TidyReplacement(
	                new_text=replacement['ReplacementText'],
	                start_line=line_offsets.get_line_number(start_offset),
	                end_line=line_offsets.get_line_number(end_offset),
	                start_char=line_offsets.get_line_offset(start_offset),
	                end_char=line_offsets.get_line_offset(end_offset),
	            ))

	      # Only notes describing macro expansions are kept; all others are
	      # ignored.
	      expansion_locs = []
	      for note in diag.get('Notes', ()):
	        if not note['Message'].startswith('expanded from macro '):
	          continue

	        absolute_note_path = makeabs(note['FilePath'])
	        note_offsets = get_line_offsets(absolute_note_path)
	        expansion_locs.append(
	            TidyExpandedFrom(
	                file_path=absolute_note_path,
	                line_number=note_offsets.get_line_number(note['FileOffset']),
	            ))

	      yield TidyDiagnostic(
	          diag_name=diag['DiagnosticName'],
	          message=message['Message'],
	          file_path=absolute_file_path,
	          line_number=line_offsets.get_line_number(message['FileOffset']),
	          replacements=tuple(replacements),
	          expansion_locs=tuple(expansion_locs),
	      )
	  except KeyError as k:
	    key_name = k.args[0]
	    # Chain the original error so the failing lookup stays in the traceback.
	    raise ClangTidySchemaError(
	        f'Broken yaml: missing key {key_name!r}') from k


	# Represents metadata about a clang-tidy invocation.
	class InvocationMetadata(NamedTuple):
	  """Record of one clang-tidy run, as loaded from its metadata JSON file."""
	  # Exit code reported for the clang-tidy process.
	  exit_code: int
	  # The command that was run: the executable followed by its arguments.
	  invocation: List[str]
	  # What clang-tidy was asked to lint.
	  lint_target: str
	  # Captured output of the run (presumably stdout/stderr; verify against the
	  # compiler wrapper).
	  stdstreams: str
	  # Working directory of the invocation.
	  wd: str


	class ExceptionData:
	  """Text snapshot of an in-flight exception that can cross processes.

	  Only the formatted traceback is kept, so instances carry plain text
	  instead of a live exception object.
	  """

	  def __init__(self) -> None:
	    """Captures the active traceback; call this only from `except` blocks."""
	    self._formatted_traceback = traceback.format_exc()

	  def __str__(self) -> str:
	    return self._formatted_traceback


	def parse_tidy_invocation(
	    json_file: Path,
	) -> Union[ExceptionData, Tuple[InvocationMetadata, List[TidyDiagnostic]]]:
	  """Parses a clang-tidy invocation result based on a JSON file.

	  This is intended to be run in a separate process, which Exceptions and
	  locking and such work notoriously poorly over, so it's never intended to
	  |raise| (except under a KeyboardInterrupt or similar).

	  Args:
	    json_file: The JSON invocation metadata file to parse. Diagnostics are
	      read from the file with the same name and a '.yaml' suffix.

	  Returns:
	    An |ExceptionData| instance on failure. On success, it returns a
	    (InvocationMetadata, [TidyDiagnostic]).
	  """
	  try:
	    assert json_file.suffix == '.json', json_file

	    with json_file.open(encoding='utf-8') as f:
	      raw_meta = json.load(f)

	    meta = InvocationMetadata(
	        exit_code=raw_meta['exit_code'],
	        invocation=[raw_meta['executable']] + raw_meta['args'],
	        lint_target=raw_meta['lint_target'],
	        stdstreams=raw_meta['stdstreams'],
	        wd=raw_meta['wd'],
	    )

	    raw_crash_output = raw_meta.get('crash_output')
	    if raw_crash_output:
	      crash_reproducer_path = raw_crash_output['crash_reproducer_path']
	      output = raw_crash_output['stdstreams']
	      raise RuntimeError(f"""\
	Clang-tidy apparently crashed; dumping lots of invocation info:
	## Tidy JSON file target: {json_file}
	## Invocation: {meta.invocation}
	## Target: {meta.lint_target}
	## Crash reproducer is at: {crash_reproducer_path}
	## Output producing reproducer:
	{output}
	## Output from the crashing invocation:
	{meta.stdstreams}
	""")

	    yaml_file = json_file.with_suffix('.yaml')
	    # If this happened, clang-tidy was probably killed. Dump output as part of
	    # the exception so it's easier to reason about what happened.
	    if not yaml_file.exists():
	      raise RuntimeError("clang-tidy didn't produce an output file for "
	                         f'{json_file}. Output:\n{meta.stdstreams}')

	    with yaml_file.open('rb') as f:
	      # Use |safe_load|: a bare |yaml.load(f)| without a Loader is deprecated
	      # (and rejected outright by newer PyYAML), and it can construct
	      # arbitrary objects. Only plain YAML data is needed here.
	      yaml_data = yaml.safe_load(f)
	    return meta, list(parse_tidy_fixes_file(Path(meta.wd), yaml_data))
	  except Exception:
	    # Deliberately broad: failures are reported to the parent process as
	    # data rather than raised across the process boundary.
	    return ExceptionData()


	def generate_lints(board: str, ebuild_path: str) -> Path:
	  """Collects the lints for a given package on a given board.

	  |LINT_BASE| is wiped and recreated before the build, and its contents are
	  copied into a fresh tmpdir afterwards. A failing |ebuild| run is logged as
	  a warning but does not raise.

	  Args:
	    board: the board to collect lints for.
	    ebuild_path: the path to the ebuild to collect lints for.

	  Returns:
	    The path to a tmpdir that all of the lint YAML files (if any) will be in.
	    This will also be populated by JSON files containing InvocationMetadata.
	    The generation of this is handled by our compiler wrapper.
	  """
	  logging.info('Running lints for %r on board %r', ebuild_path, board)

	  osutils.RmDir(LINT_BASE, ignore_missing=True, sudo=True)
	  osutils.SafeMakedirs(LINT_BASE, 0o777, sudo=True)

	  # FIXME(gbiv): |test| might be better here?
	  result = cros_build_lib.run(
	      [f'ebuild-{board}', ebuild_path, 'clean', 'compile'],
	      check=False,
	      print_cmd=True,
	      extra_env={'WITH_TIDY': 'tricium'},
	      capture_output=True,
	      encoding='utf-8',
	      errors='replace',
	  )

	  if result.returncode:
	    status = f'failed with code {result.returncode}; output:\n{result.stdout}'
	    log_fn = logging.warning
	  else:
	    status = 'succeeded'
	    log_fn = logging.info

	  log_fn('Running |ebuild| on %s %s', ebuild_path, status)
	  lint_tmpdir = tempfile.mkdtemp(prefix='tricium_tidy')
	  try:
	    osutils.CopyDirContents(LINT_BASE, lint_tmpdir)
	  except Exception:
	    # The caller never learns the tmpdir's path if we raise, so it can't
	    # clean up after us; don't leave the directory behind.
	    osutils.RmDir(lint_tmpdir, ignore_missing=True, sudo=True)
	    raise
	  return Path(lint_tmpdir)


	def collect_lints(lint_tmpdir: Path,
	                  yaml_pool: multiprocessing.Pool) -> Set[TidyDiagnostic]:
	  """Gathers every diagnostic described by the artifacts in |lint_tmpdir|.

	  Each '*.json' file directly inside |lint_tmpdir| is handed to
	  |parse_tidy_invocation| through |yaml_pool|.

	  Args:
	    lint_tmpdir: directory holding the JSON/YAML linting artifacts.
	    yaml_pool: pool used to parse the artifacts in parallel.

	  Returns:
	    The union of the diagnostics from all successfully parsed invocations.

	  Raises:
	    ClangTidyParseError: if parsing failed for at least one JSON file.
	  """
	  json_files = list(lint_tmpdir.glob('*.json'))
	  failure_count = 0
	  diagnostics = set()

	  # |imap| yields results in input order, so zipping with the inputs pairs
	  # each result with the file it came from.
	  outcomes = yaml_pool.imap(parse_tidy_invocation, json_files)
	  for json_path, outcome in zip(json_files, outcomes):
	    if not isinstance(outcome, ExceptionData):
	      meta, found = outcome
	      if meta.exit_code:
	        logging.warning(
	            'Invoking clang-tidy on %r with flags %r exited with code %d; '
	            'output:\n%s',
	            meta.lint_target,
	            meta.invocation,
	            meta.exit_code,
	            meta.stdstreams,
	        )
	      diagnostics.update(found)
	    else:
	      failure_count += 1
	      logging.error('Parsing %r failed with an exception\n%s', json_path,
	                    outcome)

	  if failure_count:
	    raise ClangTidyParseError(failure_count, len(json_files))

	  return diagnostics


	def setup_tidy(board: str, ebuild_list: List[portage_util.EBuild]):
	  """Prepares |board| so that the given ebuilds can be linted.

	  All currently worked-on packages are stopped, the packages of
	  |ebuild_list| are started, and their dependencies are emerged. A failing
	  emerge is only logged; it does not raise.

	  Args:
	    board: the board to set up.
	    ebuild_list: the ebuilds that will be linted.
	  """
	  packages = [ebuild.package for ebuild in ebuild_list]
	  logging.info('Setting up to lint %r', packages)

	  sysroot = cros_build_lib.GetSysroot(board)
	  workon = workon_helper.WorkonHelper(sysroot)
	  workon.StopWorkingOnPackages(packages=[], use_all=True)
	  workon.StartWorkingOnPackages(packages)

	  # Since each `emerge` may eat up to `ncpu` cores, limit the maximum
	  # concurrency we can get here to (arbitrarily) 8 jobs. Having
	  # `configure`s and such run in parallel is nice.
	  max_jobs = min(8, multiprocessing.cpu_count())

	  # We're going to be hacking with |ebuild| later on, so having all
	  # dependencies in place is necessary so one |ebuild| won't stomp on another.
	  emerge_cmd = [f'emerge-{board}', '--onlydeps', f'-j{max_jobs}', *packages]
	  result = cros_build_lib.run(emerge_cmd, print_cmd=True, check=False)
	  if not result.returncode:
	    return
	  logging.error('Setup failed with exit code %d; some lints may fail.',
	                result.returncode)


	def run_tidy(board: str, ebuild_list: List[portage_util.EBuild],
	             keep_dirs: bool,
	             parse_errors_are_nonfatal: bool) -> Set[TidyDiagnostic]:
	  """Runs clang-tidy on the given ebuilds for the given board.

	  Args:
	    board: the board to build under.
	    ebuild_list: the ebuilds to build and lint, one after another.
	    keep_dirs: if True, the per-ebuild lint tmpdirs are kept (and their
	      locations logged) instead of being removed.
	    parse_errors_are_nonfatal: if True, a |ClangTidyParseError| for an ebuild
	      is logged and that ebuild's lints are skipped; otherwise it propagates.

	  Returns:
	    The set of |TidyDiagnostic|s produced by doing so.

	  Raises:
	    ClangTidyParseError: if parsing fails and |parse_errors_are_nonfatal| is
	      False.
	  """
	  # Since we rely on build actions _actually_ running, we can't live with a
	  # cache.
	  osutils.RmDir(
	      Path(cros_build_lib.GetSysroot(board)) / 'var' / 'cache' / 'portage',
	      ignore_missing=True,
	      sudo=True,
	  )

	  results = set()
	  # If clang-tidy dumps a lot of diags, it can take 1-10secs of CPU while
	  # holding the GIL to parse the YAML on my otherwise-idle dev box.
	  # |yaml_pool| lets us do this in parallel.
	  #
	  # |multiprocessing.Pool| is used instead of |multiprocessing.pool.Pool|
	  # because a bare |import multiprocessing| doesn't guarantee that the
	  # |pool| submodule is loaded.
	  with multiprocessing.Pool() as yaml_pool:
	    for ebuild in ebuild_list:
	      lint_tmpdir = generate_lints(board, ebuild.ebuild_path)
	      try:
	        results |= collect_lints(lint_tmpdir, yaml_pool)
	      except ClangTidyParseError:
	        if not parse_errors_are_nonfatal:
	          raise
	        logging.exception('Working on %r', ebuild)
	      finally:
	        if keep_dirs:
	          logging.info('Lints for %r are in %r', ebuild.ebuild_path,
	                       lint_tmpdir)
	        else:
	          osutils.RmDir(lint_tmpdir, ignore_missing=True, sudo=True)
	  return results


	def resolve_package_ebuilds(board: str,
	                            package_names: Iterable[str]) -> List[str]:
	  """Figures out ebuild paths for the given package names.

	  Args:
	    board: the board whose |equery-{board}| is used for lookups.
	    package_names: package names to resolve. Entries that already end in
	      '.ebuild' are passed through unchanged.

	  Returns:
	    One ebuild path per entry of |package_names|, in the same order.
	  """
	  # A bare |import multiprocessing| doesn't guarantee that the |pool|
	  # submodule is loaded, so |multiprocessing.pool.ThreadPool| may not be
	  # reachable as an attribute. Import the class explicitly.
	  # pylint: disable=import-outside-toplevel
	  from multiprocessing.pool import ThreadPool

	  def resolve_package(package_name_or_ebuild: str) -> str:
	    """Resolves a single package name to an ebuild path."""
	    if package_name_or_ebuild.endswith('.ebuild'):
	      return package_name_or_ebuild
	    return cros_build_lib.run([f'equery-{board}', 'w', package_name_or_ebuild],
	                              check=True,
	                              stdout=subprocess.PIPE,
	                              encoding='utf-8').stdout.strip()

	  # Resolving ebuilds takes time. If we get more than one (like when I'm
	  # testing on 50 of them), parallelism speeds things up quite a bit.
	  with ThreadPool() as pool:
	    return pool.map(resolve_package, package_names)


	def filter_tidy_lints(only_files: Optional[Set[Path]],
	                      git_repo_base: Optional[Path],
	                      diags: Iterable[TidyDiagnostic]) -> List[TidyDiagnostic]:
	  """Filters |diags| by file and repo, rewriting paths where requested.

	  Diagnostics without a file path are always dropped.

	  Args:
	    only_files: a set of file paths, or None. If it is non-empty, only
	      diagnostics whose file is in the set survive.
	    git_repo_base: a directory, or None. If given, only diagnostics for files
	      somewhere below it survive, and each survivor is replaced by the result
	      of its |normalize_paths_to(git_repo_base)|.
	    diags: diagnostics to transform/filter.

	  Returns:
	    A sorted list of the surviving |TidyDiagnostic|s.
	  """
	  kept = []
	  seen = 0

	  for seen, diag in enumerate(diags, start=1):
	    if not diag.file_path:
	      # Things like |-DFOO=1 -DFOO=2| can trigger diagnostics ("oh no you're
	      # redefining |FOO| with a different value") in 'virtual' files; these
	      # receive no name in clang.
	      logging.info('Dropping diagnostic %r, since it has no associated file',
	                   diag)
	      continue

	    diag_path = Path(diag.file_path)
	    if only_files and diag_path not in only_files:
	      continue

	    if not git_repo_base:
	      kept.append(diag)
	    elif git_repo_base in diag_path.parents:
	      kept.append(diag.normalize_paths_to(git_repo_base))

	  logging.info('Dropped %d/%d diags', seen - len(kept), seen)
	  return sorted(kept)


	def get_parser() -> commandline.ArgumentParser:
	  """Creates an argument parser for this script.

	  Returns:
	    A parser with the module docstring as its description, accepting
	    --output, --git-repo-base, --board, --package (repeatable),
	    --keep-lint-dirs, --nonfatal-parse-errors, and positional |file|s.
	  """
	  parser = commandline.ArgumentParser(description=__doc__)
	  parser.add_argument(
	      '--output', required=True, type='path', help='File to write results to.')
	  parser.add_argument(
	      '--git-repo-base',
	      type='path',
	      help="Base directory of the git repo we're looking at. If specified, "
	      'only diagnostics in files in this directory will be emitted. All '
	      'diagnostic file paths will be made relative to this directory.')
	  parser.add_argument('--board', required=True, help='Board to run under.')
	  parser.add_argument(
	      '--package',
	      action='append',
	      required=True,
	      help='Package(s) to build and lint. Required.')
	  parser.add_argument(
	      '--keep-lint-dirs',
	      action='store_true',
	      help='Keep directories with tidy lints around; meant primarily for '
	      'debugging.')
	  parser.add_argument(
	      '--nonfatal-parse-errors',
	      action='store_true',
	      help="Keep going even if clang-tidy's output is impossible to parse.")
	  # Note: no backslashes before the pipes below. '\|' is not a valid escape
	  # sequence, and would leave literal backslashes in the --help output.
	  parser.add_argument(
	      'file',
	      nargs='*',
	      type='path',
	      help='File(s) to output lints for. If none are specified, this tool '
	      'outputs all lints that clang-tidy emits after applying filtering '
	      'from |--git-repo-base|, if applicable.')
	  return parser


	def main(argv: List[str]) -> None:
	  """Lints the requested packages and writes the diagnostics as JSON.

	  Args:
	    argv: command-line arguments, as accepted by |get_parser|.
	  """
	  cros_build_lib.AssertInsideChroot()
	  parser = get_parser()
	  opts = parser.parse_args(argv)
	  opts.Freeze()

	  only_files = {Path(f).resolve() for f in opts.file}

	  repo_base = opts.git_repo_base
	  if repo_base:
	    repo_base = Path(repo_base)
	    # This script doesn't strictly care if there's a .git dir there; more of
	    # a smoke check.
	    if not repo_base.joinpath('.git').exists():
	      parser.error(f'Given git repo base ({repo_base}) has no .git dir')

	  ebuild_paths = resolve_package_ebuilds(opts.board, opts.package)
	  package_ebuilds = [portage_util.EBuild(path) for path in ebuild_paths]

	  setup_tidy(opts.board, package_ebuilds)
	  raw_diags = run_tidy(opts.board, package_ebuilds, opts.keep_lint_dirs,
	                       opts.nonfatal_parse_errors)
	  lints = filter_tidy_lints(only_files, repo_base, diags=raw_diags)

	  payload = json.dumps({'tidy_diagnostics': [lint.to_dict() for lint in lints]})
	  osutils.WriteFile(opts.output, payload, atomic=True)