#!/usr/bin/env vpython3
# Copyright 2022 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""CopyBot script.
This script copies commits from one repo (the "upstream") to another
(the "downstream").
Usage: copybot.py [options...] upstream_repo:branch downstream_repo:branch
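
Example (illustrative; the URLs are placeholders):
    copybot.py https://a.example/repo:main https://b.example/repo:main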
"""
# [VPYTHON:BEGIN]
# python_version: "3.8"
# wheel: <
# name: "infra/python/wheels/requests-py3"
# version: "version:2.31.0"
# >
# wheel: <
# name: "infra/python/wheels/certifi-py2_py3"
# version: "version:2020.11.8"
# >
# wheel: <
# name: "infra/python/wheels/idna-py2_py3"
# version: "version:2.8"
# >
# wheel: <
# name: "infra/python/wheels/charset_normalizer-py3"
# version: "version:2.0.4"
# >
# wheel: <
# name: "infra/python/wheels/urllib3-py2_py3"
# version: "version:1.26.6"
# >
# [VPYTHON:END]
from __future__ import annotations
import argparse
import dataclasses
import enum
import json
import logging
import os
import pathlib
import re
import shlex
import subprocess
import tempfile
import time
from typing import Dict, List, Optional, Union
import urllib
import requests # pylint: disable=import-error
logger = logging.getLogger(__name__)
# Matches a full 40-character commit hash.
_COMMIT_HASH_PATTERN = re.compile(r"\b[0-9a-f]{40}\b")
class MergeConflictBehavior(enum.Enum):
"""How to behave on merge conflicts.
FAIL: Stop immediately. Don't upload anything.
SKIP: Skip the commit that failed to merge. Summarize the failed
commits at the end of the execution, and exit failure status.
    STOP: Stop immediately. Upload the changes created prior to the conflict.
ALLOW_CONFLICT: Commit the conflicted CL with conflicts. Summarize
the conflicted CLs at the end of execution, and exit failure
status. Conflicted CLs WILL be uploaded to the downstream.
"Commit: false" will be added to the commit message to prevent
GoB from committing conflicted changes before they are edited.
"""
FAIL = enum.auto()
SKIP = enum.auto()
STOP = enum.auto()
ALLOW_CONFLICT = enum.auto()
class MergeConflictError(Exception):
"""A commit cannot be cherry-picked due to a conflict."""
class EmptyCommitError(Exception):
"""A commit cannot be cherry-picked as it results in an empty commit."""
class CopybotFatalError(Exception):
"""Copybot fatal error."""
enum_name = "FAILURE_UNKNOWN"
def __init__(self, *args, commits=(), **kwargs):
self.commits = commits
super().__init__(*args, **kwargs)
class UpstreamFetchError(CopybotFatalError):
"""Copybot died as the upstream failed to fetch."""
enum_name = "FAILURE_UPSTREAM_FETCH_ERROR"
class DownstreamFetchError(CopybotFatalError):
"""Copybot died as the downstream failed to fetch."""
enum_name = "FAILURE_DOWNSTREAM_FETCH_ERROR"
class PushError(CopybotFatalError):
"""Copybot died as it failed to push to the downstream GoB host."""
enum_name = "FAILURE_DOWNSTREAM_PUSH_ERROR"
class MergeConflictsError(CopybotFatalError):
"""Copybot ran, but encountered merge conflicts."""
enum_name = "FAILURE_MERGE_CONFLICTS"
@dataclasses.dataclass
class GerritClInfo:
    """Stores information for a Gerrit CL.

    Attributes:
        change_id: Change ID on Gerrit for this change.
        hashtags: Hashtags associated with this change.
        current_ref: Current ref for this change, used to form a refspec.
    """

    change_id: str
    hashtags: List[str]
    current_ref: str
class GitRepo:
"""Class wrapping common Git repository actions."""
def __init__(self, git_dir):
self.git_dir = git_dir
def _run_git(self, *args, **kwargs):
"""Wrapper to run git with the provided arguments."""
argv = ["git", "-C", self.git_dir, "--no-pager", *args]
logger.info("Run `%s`", " ".join(shlex.quote(str(arg)) for arg in argv))
kwargs.setdefault("encoding", "utf-8")
kwargs.setdefault("errors", "replace")
try:
return subprocess.run(
argv,
check=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
**kwargs,
)
except subprocess.CalledProcessError as e:
logger.error("Git command failed!")
logger.error(" STDOUT:")
for line in e.stdout.splitlines():
logger.error(" %s", line)
logger.error(" STDERR:")
for line in e.stderr.splitlines():
logger.error(" %s", line)
raise
@classmethod
def init(cls, git_dir: Union[str, "os.PathLike[str]"]):
"""Do a `git init` to create a new repository."""
git_dir = pathlib.Path(git_dir)
repo = cls(git_dir)
if not (git_dir / ".git").exists():
repo._run_git("init") # pylint: disable=protected-access
return repo
def rev_parse(self, rev="HEAD"):
"""Do a `git rev-parse`."""
result = self._run_git("rev-parse", rev)
return result.stdout.rstrip()
def fetch(self, remote, ref=None):
"""Do a `git fetch`.
Returns:
The full commit hash corresponding to FETCH_HEAD.
"""
extra_args = []
if ref:
extra_args.append(ref)
self._run_git("fetch", remote, *extra_args)
return self.rev_parse("FETCH_HEAD")
def checkout(self, ref):
"""Do a `git checkout`."""
return self._run_git("checkout", ref)
def log(
self,
revision_range="HEAD",
fmt=None,
num=0,
subtree=None,
exclude_file_patterns=(),
):
"""Do a `git log`."""
extra_args = []
if fmt:
extra_args.append(f"--format={fmt}")
if num:
extra_args.append(f"-n{num}")
extra_args.append("--")
extra_args.extend(
[f":!{path}" for path in (exclude_file_patterns or [])]
)
if subtree:
extra_args.append(subtree)
return self._run_git("log", revision_range, *extra_args)
def log_hashes(
self,
revision_range="HEAD",
num=0,
subtree=None,
exclude_file_patterns=(),
):
"""Get the commit log as a list of commit hashes."""
result = self.log(
revision_range=revision_range,
fmt="%H",
num=num,
subtree=subtree,
exclude_file_patterns=exclude_file_patterns,
)
return result.stdout.splitlines()
def get_commit_message(self, rev="HEAD"):
"""Get a commit message of a commit."""
result = self.log(revision_range=rev, num=1, fmt="%B")
return result.stdout
def commit_file_list(self, rev="HEAD"):
"""Get the files modified by a commit."""
result = self._run_git("show", "--pretty=", "--name-only", rev)
return result.stdout.splitlines()
def show(self, rev="HEAD", files=()):
"""Do a `git show`."""
result = self._run_git("show", rev, "--", *files)
return result.stdout
def apply(self, patch, path=None, include_paths=None, exclude_paths=None):
"""Apply a patch to the staging area."""
extra_args = []
if path:
extra_args.append(f"--directory={path}")
if include_paths:
extra_args.extend(f"--include={path}" for path in include_paths)
if exclude_paths:
extra_args.extend(f"--exclude={path}" for path in exclude_paths)
extra_args.append(patch)
return self._run_git("apply", *extra_args)
def commit(self, message, amend=False, sign_off=False, stage=False):
"""Create a commit.
Returns:
The commit hash.
"""
extra_args = []
if stage:
extra_args.append("--all")
if amend:
extra_args.append("--amend")
if sign_off:
extra_args.append("--signoff")
self._run_git("commit", *extra_args, "-m", message)
return self.rev_parse()
def reword(self, new_message, sign_off=False):
"""Reword the commit at HEAD.
Returns:
The new commit hash.
"""
return self.commit(new_message, amend=True, sign_off=sign_off)
def format_patch(self, rev, output_path, num=1, relative_path=None):
"""Generate patch from revision with an optional relative path.
Returns:
Path of the output patch.
"""
extra_args = []
if relative_path:
extra_args.append(f"--relative={relative_path}")
extra_args.extend([f"-{num}", rev, f"--output-directory={output_path}"])
result = self._run_git("format-patch", *extra_args)
return result.stdout.rstrip()
def add(self, path, stage=False, force=False):
"""Add unstaged files."""
extra_args = []
if stage:
extra_args.append("--all")
if force:
extra_args.append("--force")
if path:
extra_args.append(path)
return self._run_git("add", *extra_args)
    def get_subtree_lowest_working_dir(
        self, path: Union[pathlib.Path, "os.PathLike[str]", None]
    ) -> Union[pathlib.Path, "os.PathLike[str]", None]:
        """Get the lowest existing working directory for a subtree.

        Returns the path unchanged if it is an existing directory in this
        repo, its parent directory if it is not, or None if no path was
        given.
        """
        subtree = path or None
        if path:
            patch_dir = pathlib.Path(self.git_dir) / path
            if not patch_dir.is_dir():
                subtree = pathlib.Path(path).parents[0]
        return subtree
def cherry_pick(
self,
rev,
patch_dir=None,
upstream_subtree=None,
downstream_subtree=None,
include_paths=None,
exclude_paths=None,
allow_conflict=False,
):
"""Do a `git cherry-pick`.
This will first try without any merge options, and if that fails,
try again with -Xpatience, which is slower, but may be more likely
to resolve a merge conflict.
Raises:
EmptyCommitError: The resultant commit was empty and should be
skipped.
MergeConflictError: There was a merge conflict that could not
be resolved automatically with -Xpatience.
"""
def _try_cherry_pick(extra_flags):
try:
self._run_git("cherry-pick", "-x", rev, *extra_flags)
except subprocess.CalledProcessError as e:
try:
self._run_git("rev-parse", "--verify", "CHERRY_PICK_HEAD")
logger.warning("Could not cherry-pick")
except subprocess.CalledProcessError as err:
if "is a merge but no -m option was given" in e.stderr:
logger.warning("Merge commit detected")
raise MergeConflictError() from err
if allow_conflict:
self.add(downstream_subtree, stage=True, force=True)
self.commit(
self.get_commit_message(rev),
amend=False,
sign_off=False,
stage=True,
)
else:
self._run_git("cherry-pick", "--abort")
if "The previous cherry-pick is now empty" in e.stderr:
raise EmptyCommitError() from e
raise MergeConflictError() from e
cherry_pick_flag_list = ([], ["-Xpatience"], ["-m", "2"])
if downstream_subtree or upstream_subtree or include_paths:
cherry_pick_flag_list = ()
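            # Plain cherry-pick cannot remap subtrees or filter paths, so
            # fall through to the format-patch/apply path below.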
for flags in cherry_pick_flag_list:
try:
_try_cherry_pick(flags)
except MergeConflictError:
continue
else:
return
patch = self.format_patch(
rev,
patch_dir,
1,
self.get_subtree_lowest_working_dir(upstream_subtree),
)
try:
self.apply(
patch=patch,
path=self.get_subtree_lowest_working_dir(downstream_subtree),
include_paths=include_paths,
exclude_paths=exclude_paths,
)
except subprocess.CalledProcessError as e:
if not allow_conflict:
raise MergeConflictError() from e
self.add(downstream_subtree, stage=True, force=True)
self.commit(
self.get_commit_message(rev),
amend=False,
sign_off=False,
stage=True,
)
return
def push(self, url, refspec, options=()):
"""Do a `git push`."""
args = []
for option in options:
args.extend(["-o", option])
args.append(url)
args.append(refspec)
self._run_git("push", *args)
def get_cl_count(self, original_rev: str, current_rev: str) -> int:
"""Get the number of CLs between the specified revisions."""
if not original_rev or not current_rev:
return 0
args = ["--count"]
args.append(f"{original_rev}..{current_rev}")
result = self._run_git("rev-list", *args)
return int(result.stdout.rstrip())
class Pseudoheaders:
"""Dictionary-like object for the pseudoheaders from a commit message.
    The pseudoheaders are the header-like lines often found at the
    bottom of a commit message. Header names are case-insensitive.
Pseudoheaders are parsed the same way that the "git footers"
command parses them.
"""
# Matches lines that look like a "header" (the conventional footer
# lines in a commit message).
_PSEUDOHEADER_PATTERN = re.compile(r"^(?:[A-Za-z0-9]+-)*[A-Za-z0-9]+:\s+")
def __init__(self, header_list=()):
self._header_list = list(header_list)
@classmethod
def from_commit_message(cls, commit_message, offset=1):
"""Parse pseudoheaders from a commit message.
Args:
commit_message: commit message from git log.
offset: which line to start processing the message from.
Lines less than the offset will not be altered.
Returns:
Two values, a Pseudoheaders dictionary, and the commit
message without any pseudoheaders.
"""
message_lines = commit_message.splitlines()
rewritten_message = []
header_list = []
for i, line in enumerate(message_lines):
if i < offset or not cls._PSEUDOHEADER_PATTERN.match(line):
rewritten_message.append(line)
else:
name, _, value = line.partition(":")
header_list.append((name, value.strip()))
return cls(header_list), "".join(
f"{line}\n" for line in rewritten_message
)
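
    # Illustrative example (assuming a conventional commit message):
    #   headers, body = Pseudoheaders.from_commit_message(
    #       "Fix a bug\n\nBug: b:12345\nChange-Id: Iabc\n"
    #   )
    #   headers.get("bug")  # -> "b:12345" (lookup is case-insensitive)
    #   body                # -> "Fix a bug\n\n"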
def prefix(self, prefix="Original-", keep=()):
"""Prefix all header keys with a string.
Args:
prefix: The prefix to use.
keep: Headers which should not be modified.
Returns:
A new Pseudoheaders dictionary.
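
        Example (illustrative):
            >>> ph = Pseudoheaders([("Change-Id", "Iabc"), ("Bug", "b:1")])
            >>> ph.prefix(keep=["Bug"]).as_dict()
            {'Original-Change-Id': 'Iabc', 'Bug': 'b:1'}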
"""
new_header_list = []
# Constructing a new pseudoheaders dictionary ensures we
# consider the keep list to be case insensitive.
keep_dict = self.__class__([(key, True) for key in keep])
for key, value in self._header_list:
if keep_dict.get(key):
new_header_list.append((key, value))
else:
new_header_list.append((f"{prefix}{key}", value))
return self.__class__(new_header_list)
def __getitem__(self, item):
"""Get a header value by name."""
for key, value in self._header_list:
if key.lower() == item.lower():
return value
raise KeyError(item)
def get(self, item, default=None):
"""Get a header value by name, or return a default value."""
try:
return self[item]
except KeyError:
return default
def as_dict(self) -> Dict[str, str]:
"""Get the dict of stored values."""
return dict(self._header_list)
def __setitem__(self, key, value):
"""Add a header."""
self._header_list.append((key, value))
def add_to_commit_message(self, commit_message):
"""Add our pseudoheaders to a commit message.
Returns:
The new commit message.
"""
message_lines = commit_message.splitlines()
if not message_lines:
message_lines = ["NO COMMIT MESSAGE"]
# Ensure exactly one blank line separating body and pseudoheaders.
while not message_lines[-1].strip():
message_lines.pop()
message_lines.append("")
for key, value in self._header_list:
message_lines.append(f"{key}: {value}")
return "".join(f"{line}\n" for line in message_lines)
def __str__(self):
return "\n".join(f"{key}:{value}" for key, value in self._header_list)
    def update(self, other: Union[dict, Pseudoheaders]) -> None:
        """Merge headers from another Pseudoheaders or dict into this one."""
        if isinstance(other, type(self)):
            for key, value in other.as_dict().items():
                self[key] = value
        elif isinstance(other, dict):
            for key, value in other.items():
                self[key] = value
        else:
            raise TypeError(f"Cannot update from type {type(other)}")
class Gerrit:
"""Wrapper for actions on a Gerrit host."""
def __init__(self, hostname):
self.hostname = hostname
def search(self, query):
"""Do a query on Gerrit."""
url = f"https://{self.hostname}/changes/"
params = [
("q", query),
("o", "CURRENT_REVISION"),
("o", "CURRENT_COMMIT"),
("o", "COMMIT_FOOTERS"),
]
while True:
r = requests.get(url, params=params)
if r.ok:
break
if r.status_code == requests.codes.too_many:
time.sleep(1)
continue
r.raise_for_status()
assert False
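        # Gerrit prepends ")]}'" to JSON responses to defend against
        # cross-site script inclusion; strip it before parsing.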
if r.text[:5] != ")]}'\n":
logger.error("Bad response from Gerrit: %r", r.text)
raise ValueError("Unexpected JSON payload from gerrit")
result = json.loads(r.text[5:])
return result
def find_pending_changes(
self, project, branch, hashtags=(), subtree=None, exclude_paths=()
):
"""Find pending changes previously opened by CopyBot on Gerrit.
Returns:
A dictionary mapping upstream commit hashes to their
current Change-Id on Gerrit and their associated hashtags.
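
        Example (illustrative): for project "example/repo" on branch
        "main" with hashtag "copybot", the Gerrit search query becomes
        "status:open project:example/repo branch:main hashtag:copybot".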
"""
query = [
"status:open",
f"project:{project}",
f"branch:{branch}",
]
query.extend(f"hashtag:{hashtag}" for hashtag in hashtags)
if subtree:
query.append(f"directory:{subtree}")
for path in exclude_paths:
query.append(f"-directory:{path}")
query_result = self.search(" ".join(query))
change_ids = {}
for cl in query_result:
change_id = cl["change_id"]
current_revision_hash = cl["current_revision"]
current_revision_data = cl["revisions"][current_revision_hash]
commit_message = current_revision_data["commit"]["message"]
origin_rev_id = get_origin_rev_id(commit_message)
rev_id = []
if origin_rev_id:
rev_id.append(origin_rev_id)
else:
for commit_hash in _COMMIT_HASH_PATTERN.finditer(
commit_message
):
rev_id.append(commit_hash.group(0))
for rev in rev_id:
change_ids[rev] = GerritClInfo(
change_id,
cl["hashtags"].copy(),
current_revision_data["ref"],
)
return change_ids
def generate_change_id():
"""Generate a Unique Change-Id."""
return f"I{os.urandom(20).hex()}"
def get_origin_rev_id(commit_message):
"""Get the origin revision hash from a commit message.
Returns:
The revision hash if one was found, or None otherwise.
"""
pseudoheaders, _ = Pseudoheaders.from_commit_message(commit_message)
origin_revid = pseudoheaders.get("GitOrigin-RevId")
if not origin_revid:
origin_revid = pseudoheaders.get("Original-Commit-Id")
return origin_revid
def get_change_id(commit_message):
"""Get the Change-Id from a commit message.
Returns:
The Change-Id if one was found, or None otherwise.
"""
pseudoheaders, _ = Pseudoheaders.from_commit_message(commit_message)
return pseudoheaders.get("Change-Id")
def find_last_merged_rev(
repo,
upstream_rev,
downstream_rev,
upstream_subtree=None,
downstream_subtree=None,
exclude_file_patterns=(),
include_change_id=False,
upstream_history_length: int = 0,
downstream_history_length: int = 0,
):
"""Find the last merged revision in a Git repo.
Args:
repo: The GitRepo.
upstream_rev: The commit hash of the upstream HEAD.
downstream_rev: The commit hash of the downstream HEAD.
upstream_subtree: The subtree of interest of the upstream repo.
downstream_subtree: The subtree of interest of the downstream repo.
exclude_file_patterns: List of paths to be excluded.
        include_change_id: Bool specifying whether or not to consider
            Change-Ids.
upstream_history_length: Number of CLs to consider as a part of the
upstream history.
downstream_history_length: Number of CLs to consider as a part of the
downstream history.
Returns:
Two values,
1. A commit hash of the last merged revision by CopyBot, or the
first common commit hash in both logs.
2. The number of CLs which are eligible to be downstreamed.
Raises:
ValueError: No common history could be found.
"""
upstream_hashes = repo.log_hashes(
revision_range=upstream_rev,
subtree=upstream_subtree,
exclude_file_patterns=exclude_file_patterns,
num=upstream_history_length,
)
downstream_hashes = repo.log_hashes(
revision_range=downstream_rev,
subtree=downstream_subtree,
exclude_file_patterns=exclude_file_patterns,
num=downstream_history_length,
)
upstream_change_ids = {}
if include_change_id:
for rev in upstream_hashes:
change_id = get_change_id(repo.get_commit_message(rev))
if change_id:
upstream_change_ids[change_id] = rev
for rev in downstream_hashes:
commit_message = repo.get_commit_message(rev)
origin_revid = get_origin_rev_id(commit_message)
change_id = get_change_id(commit_message)
if (
rev in upstream_hashes
or origin_revid
or (change_id and include_change_id)
):
            if (origin_revid or rev) in upstream_hashes:
counter = upstream_hashes.index(origin_revid or rev)
elif include_change_id and change_id in upstream_change_ids:
origin_revid = upstream_change_ids[change_id]
counter = upstream_hashes.index(origin_revid or rev)
else:
continue
return origin_revid or rev, counter
raise ValueError(
"Downstream has no GitOrigin-RevId commits, and upstream and "
"downstream share no common history."
)
def get_downstreamed_list(
repo,
downstream_rev,
downstream_subtree=None,
exclude_file_patterns=(),
limit=0,
upstream_change_ids=(),
downstream_history_length=0,
):
"""Find the last merged revision in a Git repo.
Args:
repo: The GitRepo.
downstream_rev: The commit hash of the downstream HEAD.
downstream_subtree: The subtree of interest of the downstream repo.
exclude_file_patterns: List of paths to be excluded.
limit: The maxinum number of CLs in the history to check.
upstream_change_ids: Dictionary of upstream Change-Id's and their
associated upstream commit hash.
downstream_history_length: Number of CLs to consider as a part of the
downstream history.
Returns:
The set of upstream commit hashes that have already been downstreamed.
"""
downstream_hashes = repo.log_hashes(
revision_range=downstream_rev,
subtree=downstream_subtree,
exclude_file_patterns=exclude_file_patterns,
num=downstream_history_length,
)
downstreamed_revs = downstream_hashes[:]
for counter, rev in enumerate(downstream_hashes):
if counter > limit and limit != 0:
break
commit_message = repo.get_commit_message(rev)
origin_revid = get_origin_rev_id(commit_message)
change_id = get_change_id(commit_message)
if origin_revid:
downstreamed_revs.append(origin_revid)
if change_id and change_id in upstream_change_ids:
downstreamed_revs.append(upstream_change_ids[change_id])
return downstreamed_revs
def find_commits_to_copy(
repo,
upstream_rev,
upstream_subtree,
downstream_rev,
downstream_subtree,
include_paths,
upstream_limit=0,
downstream_limit=0,
exclude_file_patterns=(),
pending_changes=(),
skip_copybot_job_names=(),
include_change_id=False,
upstream_history_length: int = 0,
downstream_history_length: int = 0,
):
"""Find the commits to copy to downstream.
Args:
repo: The GitRepo.
upstream_rev: The commit hash of the upstream HEAD.
upstream_subtree: The subtree of interest of the upstream repo.
downstream_rev: The commit hash of the downstream HEAD.
downstream_subtree: The subtree of interest of the downstream repo.
        include_paths: The paths to include from the upstream, relative to
            the downstream subtree (only valid with a downstream subtree).
        upstream_limit: The maximum number of CLs in the upstream history to
            check.
        downstream_limit: The maximum number of CLs in the downstream history
            to check.
        exclude_file_patterns: File paths that should not be copied.
        pending_changes: Changes pending in the downstream repo.
        skip_copybot_job_names: Names of copybot jobs to not copy CLs from.
        include_change_id: Bool specifying whether or not to consider
            Change-Ids.
        upstream_history_length: Number of CLs to consider as a part of the
            upstream history.
        downstream_history_length: Number of CLs to consider as a part of the
            downstream history.
Returns:
A list of the commit hashes to copy.
Raises:
ValueError: If the provided last merged commit hash does not
exist in upstream commit history.
"""
commits_to_copy = []
upstream_change_ids = {}
upstream_hashes = repo.log_hashes(
revision_range=upstream_rev,
subtree=upstream_subtree,
exclude_file_patterns=exclude_file_patterns,
num=upstream_history_length,
)
if include_change_id:
for counter, rev in enumerate(upstream_hashes):
if counter > upstream_limit and upstream_limit != 0:
break
change_id = get_change_id(repo.get_commit_message(rev))
if change_id:
upstream_change_ids[change_id] = rev
downstreamed_revs = get_downstreamed_list(
repo=repo,
downstream_rev=downstream_rev,
downstream_subtree=downstream_subtree,
exclude_file_patterns=exclude_file_patterns,
limit=downstream_limit,
upstream_change_ids=upstream_change_ids,
downstream_history_length=downstream_history_length,
)
counter = 0
for rev in upstream_hashes:
# Early exit if limit reached to avoid inadvertent continuation
if counter > upstream_limit and upstream_limit != 0:
break
# Check if this is a filtered commit.
commit_message = repo.get_commit_message(rev)
pseudoheaders, commit_message = Pseudoheaders.from_commit_message(
commit_message
)
job_name = pseudoheaders.get("Copybot-Job-Name")
if job_name in skip_copybot_job_names:
logger.info(
"Skip %s due to Copybot-Job-Name: %s",
rev,
job_name,
)
continue
# Increment counter after filtering Copybot-Job-Name CLs to treat
# them as if they don't belong to the target repo.
counter += 1
if rev in pending_changes:
if "copybot-skip" in pending_changes[rev].hashtags:
logger.info("Skip %s due to copybot-skip hashtag", rev)
continue
# If change is in pending list, allow relands
if rev in downstreamed_revs:
if rev in pending_changes:
logger.info(
"Found pending change that has already merged: %s", rev
)
else:
continue
if downstream_subtree and include_paths:
commit_files = repo.commit_file_list(rev)
filtered_commit_files = []
            for path in commit_files:
                try:
                    filtered_path = pathlib.Path(path).relative_to(
                        upstream_subtree
                    )
                except ValueError:
                    # The file lies outside the upstream subtree.
                    continue
                if str(filtered_path) in include_paths:
                    filtered_commit_files.append(path)
                    break
if not filtered_commit_files:
logger.info(
"Skip commit %s due to empty file list after filtering "
"(before filtering was %r)",
rev,
commit_files,
)
continue
commits_to_copy.append(rev)
return commits_to_copy
def rewrite_commit_message(
repo,
upstream_rev,
change_id,
prepend_subject="",
sign_off=False,
keep_pseudoheaders=(),
additional_pseudoheaders=(),
):
"""Reword the commit at HEAD with appropriate metadata.
Args:
repo: The GitRepo to operate on.
upstream_rev: The upstream commit hash corresponding to this commit.
change_id: The Change-Id to add to the commit.
prepend_subject: A string to prepend the subject line with.
sign_off: True if Signed-off-by should be added to the commit message.
keep_pseudoheaders: Pseudoheaders which should not be prefixed.
        additional_pseudoheaders: Pseudoheaders to be added to the commit
            message.
"""
commit_message = repo.get_commit_message()
if prepend_subject:
commit_message = prepend_subject + commit_message
pseudoheaders, commit_message = Pseudoheaders.from_commit_message(
commit_message
)
pseudoheaders = pseudoheaders.prefix(keep=keep_pseudoheaders)
pseudoheaders["GitOrigin-RevId"] = upstream_rev
for additional_header in additional_pseudoheaders:
parsed, _ = Pseudoheaders.from_commit_message(
additional_header, offset=0
)
pseudoheaders.update(parsed)
if "Change-Id" not in keep_pseudoheaders:
pseudoheaders["Change-Id"] = change_id
commit_message = pseudoheaders.add_to_commit_message(commit_message)
repo.reword(commit_message, sign_off=sign_off)
def get_push_refspec(args, downstream_branch):
"""Generate a push refspec for Gerrit.
Args:
args: The parsed command line arguments.
downstream_branch: The branch to push to.
Returns:
A push refspec as a string.
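
    Example (illustrative): with --re=foo@example.org, the default topic
    "copybot", and branch "main", this returns
    "HEAD:refs/for/main%r=foo@example.org,t=copybot".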
"""
push_options = []
def _add_push_option(key, value):
for option in value.split(","):
push_options.append(f"{key}={option}")
for label in args.labels:
_add_push_option("l", label)
for cc in args.ccs:
_add_push_option("cc", cc)
for reviewer in args.reviewers:
_add_push_option("r", reviewer)
for hashtag in [args.topic, *args.hashtags]:
_add_push_option("t", hashtag)
return f"HEAD:refs/for/{downstream_branch}%{','.join(push_options)}"
def is_server_gob(url):
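    """Match GoB (Gerrit-on-Borg) googlesource.com URLs.

    Returns a regex match whose group 1 is the host prefix ("chromium" or
    "chrome-internal") and group 2 is the project path, or None if the URL
    is not a known GoB host.
    """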
return re.fullmatch(
r"https://(chromium|chrome-internal)"
r"(?:-review)?\.googlesource\.com/(.*)",
url,
)
def parse_repo_info(repo_string):
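    """Parse a "url:branch:subtree" repository spec.

    Returns:
        A tuple (is_local, url_or_path, branch, subtree). branch defaults
        to "main"; subtree may be None.

    Examples (illustrative):
        "https://host.example/repo:main:src"
            -> (False, "https://host.example/repo", "main", "src")
        "/path/to/checkout"
            -> (True, pathlib.Path("/path/to/checkout"), "main", None)
    """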
is_local = True
max_fields = 3
repo_info = []
base_string = repo_string
url_result = urllib.parse.urlparse(repo_string)
if url_result.netloc and url_result.scheme:
is_local = False
for _ in range(max_fields + 1):
base_string, sep, current_field = base_string.rpartition(":")
if not sep:
if is_local:
repo_info.insert(0, pathlib.Path(current_field))
else:
repo_info[0] = url_result.scheme + ":" + repo_info[0]
break
else:
repo_info.insert(0, current_field)
for _ in range(len(repo_info), max_fields):
repo_info.append(None)
repo_url = repo_info[0]
repo_branch = repo_info[1]
repo_subtree = repo_info[2]
if not repo_branch:
repo_branch = "main"
return is_local, repo_url, repo_branch, repo_subtree
def run_copybot(args, git_dir, patch_dir):
"""Run copybot.
Args:
args: The parsed command line arguments.
git_dir: A temporary or local directory to use for Git operations.
patch_dir: A temporary directory to use for storing patch files.
"""
(
_,
upstream_url,
upstream_branch,
upstream_subtree,
) = parse_repo_info(args.upstream)
(
local_downstream,
downstream_url,
downstream_branch,
downstream_subtree,
) = parse_repo_info(args.downstream)
if local_downstream:
git_dir = downstream_url
keep_pseudoheaders = list(args.keep_pseudoheaders)
related_repo = False
if upstream_url == downstream_url or (
is_server_gob(str(downstream_url)) and is_server_gob(str(upstream_url))
):
related_repo = True
if "Change-Id" not in keep_pseudoheaders:
keep_pseudoheaders.append("Change-Id")
merge_conflict_behavior = MergeConflictBehavior[
args.merge_conflict_behavior
]
pending_changes = {}
gerrit: Optional[Gerrit] = None
if (m := is_server_gob(str(downstream_url))) is not None:
downstream_gob_host = m.group(1)
downstream_project = m.group(2)
gerrit = Gerrit(f"{downstream_gob_host}-review.googlesource.com")
pending_changes = gerrit.find_pending_changes(
project=downstream_project,
branch=downstream_branch,
hashtags=[args.topic],
subtree=downstream_subtree,
exclude_paths=args.exclude_file_patterns,
)
logger.info(
"Found %s pending changes already on Gerrit", len(pending_changes)
)
repo = GitRepo.init(git_dir)
try:
upstream_rev = repo.fetch(upstream_url, upstream_branch)
except subprocess.CalledProcessError as e:
raise UpstreamFetchError(
f"Failed to fetch branch {upstream_branch} from {upstream_url}"
) from e
try:
downstream_rev = repo.fetch(downstream_url, downstream_branch)
except subprocess.CalledProcessError as e:
raise DownstreamFetchError(
f"Failed to fetch branch {downstream_branch} from {downstream_url}"
) from e
upstream_history_length = repo.get_cl_count(
args.upstream_history_starts_with, upstream_rev
)
downstream_history_length = repo.get_cl_count(
args.downstream_history_starts_with, downstream_rev
)
# Verify that the two repositories share a history
num_cls_to_downstream = 0
last_related_rev = ""
try:
last_related_rev, num_cls_to_downstream = find_last_merged_rev(
repo,
upstream_rev,
downstream_rev,
upstream_subtree,
downstream_subtree,
args.exclude_file_patterns,
related_repo,
upstream_history_length=upstream_history_length,
downstream_history_length=downstream_history_length,
)
except ValueError as e:
if not pending_changes:
raise e
else:
logger.info("Last related revision from pending changes!")
else:
logger.info("Last related revision: %s", last_related_rev)
logger.info("Found: %s new changes to downstream", num_cls_to_downstream)
pending_modifications = False
for _, pending_cl in pending_changes.items():
if (
"copybot-reword" in pending_cl.hashtags
or "copybot-preserve" not in pending_cl.hashtags
):
pending_modifications = True
break
if not num_cls_to_downstream and not pending_modifications:
# No CLs to downstream, and no modifications to pending CLs
logger.info("Nothing to do!")
return
num_cls_to_downstream += len(pending_changes)
if (
num_cls_to_downstream > args.upstream_history_limit
and args.upstream_history_limit != 0
):
logger.warning(
"There are %s CLs between HEAD and %s but the history limit is"
" set to %s. Raising the history limit to accommodate this.",
num_cls_to_downstream,
last_related_rev,
args.upstream_history_limit,
)
args.upstream_history_limit = num_cls_to_downstream
commits_to_copy = find_commits_to_copy(
repo,
upstream_rev,
upstream_subtree,
downstream_rev,
downstream_subtree,
include_paths=args.include_downstream,
upstream_limit=args.upstream_history_limit,
downstream_limit=args.downstream_history_limit,
exclude_file_patterns=args.exclude_file_patterns,
pending_changes=pending_changes,
skip_copybot_job_names=args.skip_job_name,
upstream_history_length=upstream_history_length,
downstream_history_length=downstream_history_length,
)
if not commits_to_copy:
logger.info("Nothing to do!")
return
conflicted_revs = []
empty_revs = []
skipped_revs = []
if args.limit > 0 and len(commits_to_copy) > args.limit:
logger.warning(
"Limiting commits to copy from %s to %s",
len(commits_to_copy),
args.limit,
)
commits_to_copy = commits_to_copy[-args.limit :]
repo.checkout(downstream_rev)
for i, rev in enumerate(reversed(commits_to_copy)):
logger.info("(%s/%s) Cherry-pick %s", i + 1, len(commits_to_copy), rev)
pending_change = False
reword_pending_change = False
if rev in pending_changes:
if "copybot-preserve" in pending_changes[rev].hashtags:
logger.info(
"Preserving pending change due to copybot-preserve hashtag."
)
pending_change = True
if "copybot-reword" in pending_changes[rev].hashtags:
reword_pending_change = True
logger.info(
"Rewording commit message due to copybot-reword hashtag."
)
try:
if pending_change:
repo.fetch(downstream_url, pending_changes[rev].current_ref)
repo.cherry_pick("FETCH_HEAD")
else:
repo.cherry_pick(
rev,
patch_dir=patch_dir,
upstream_subtree=upstream_subtree,
downstream_subtree=downstream_subtree,
include_paths=args.include_downstream,
exclude_paths=args.exclude_file_patterns,
)
except EmptyCommitError:
logger.warning("Skip cherry-pick due to empty commit")
empty_revs.append(rev)
continue
except MergeConflictError as e:
logger.error("Merge conflict cherry-picking %s!", rev)
if merge_conflict_behavior is MergeConflictBehavior.SKIP:
logger.warning("Skipping %s", rev)
skipped_revs.append(rev)
continue
elif merge_conflict_behavior is MergeConflictBehavior.STOP:
logger.warning("Stopping at revision %s", rev)
skipped_revs.extend(list(reversed(commits_to_copy))[i:])
break
elif (
merge_conflict_behavior is MergeConflictBehavior.ALLOW_CONFLICT
):
logger.warning("Committing %s with conflicts", rev)
if pending_change:
repo.fetch(downstream_url, pending_changes[rev].current_ref)
repo.cherry_pick(rev="FETCH_HEAD", allow_conflict=True)
else:
repo.cherry_pick(
rev,
patch_dir=patch_dir,
upstream_subtree=upstream_subtree,
downstream_subtree=downstream_subtree,
include_paths=args.include_downstream,
exclude_paths=args.exclude_file_patterns,
allow_conflict=True,
)
if not pending_change or reword_pending_change:
change_id = None
pending_overwrite = pending_changes.get(rev)
if pending_overwrite is not None:
change_id = pending_changes.get(rev).change_id
rewrite_commit_message(
repo,
upstream_rev=rev,
change_id=change_id or generate_change_id(),
prepend_subject=args.prepend_subject,
sign_off=args.add_signed_off_by,
keep_pseudoheaders=keep_pseudoheaders,
additional_pseudoheaders=[
*args.add_pseudoheaders,
"Commit: false",
],
)
conflicted_revs.append(rev)
continue
raise MergeConflictsError(commits=[rev]) from e
if not pending_change or reword_pending_change:
change_id = None
pending_overwrite = pending_changes.get(rev)
if pending_overwrite is not None:
change_id = pending_changes.get(rev).change_id
rewrite_commit_message(
repo,
upstream_rev=rev,
change_id=change_id or generate_change_id(),
prepend_subject=args.prepend_subject,
sign_off=args.add_signed_off_by,
keep_pseudoheaders=keep_pseudoheaders,
additional_pseudoheaders=args.add_pseudoheaders,
)
if repo.rev_parse() == downstream_rev:
logger.info("Nothing to push!")
else:
push_refspec = get_push_refspec(args, downstream_branch)
if not args.dry_run and not local_downstream:
try:
repo.push(
downstream_url, push_refspec, options=args.push_options
)
except subprocess.CalledProcessError as e:
raise PushError(f"Failed to push to {downstream_url}") from e
else:
logger.info("Skip push due to dry/local run")
emptylist = [
repo.log(rev, fmt="%H %s", num=1).stdout.strip() for rev in empty_revs
]
if emptylist:
logger.warning(
"The following commits were not applied because they were empty:"
)
for rev in emptylist:
logger.warning("- %s", rev)
revlist = [
repo.log(rev, fmt="%H %s", num=1).stdout.strip() for rev in skipped_revs
]
if revlist:
logger.error(
"The following commits were not applied due to merge conflict:"
)
for rev in revlist:
logger.error("- %s", rev)
raise MergeConflictsError(commits=skipped_revs)
conflictedlist = [
repo.log(rev, fmt="%H %s", num=1).stdout.strip()
for rev in conflicted_revs
]
if conflictedlist:
logger.error("The following commits were uploaded with conflicts:")
for rev in conflictedlist:
logger.error("- %s", rev)
raise MergeConflictsError(commits=conflicted_revs)
def write_json_error(path: pathlib.Path, err: Exception):
"""Write out the JSON-serialized protobuf from an exception.
Args:
path: The Path to write to.
err: The exception to serialize.
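
    Example (illustrative): a MergeConflictsError with one skipped commit
    serializes as {"failure_reason": "FAILURE_MERGE_CONFLICTS",
    "merge_conflicts": [{"hash": "<commit hash>"}]}.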
"""
err_json = {}
if err:
if isinstance(err, CopybotFatalError):
err_json["failure_reason"] = err.enum_name
if err.commits:
err_json["merge_conflicts"] = [{"hash": x} for x in err.commits]
else:
err_json["failure_reason"] = CopybotFatalError.enum_name
logger.debug("JSON response: %s", err_json)
path.write_text(json.dumps(err_json))
def main(argv=None):
"""The entry point to the program."""
parser = argparse.ArgumentParser(description="CopyBot")
parser.add_argument(
"--topic",
help="Topic to set and search in Gerrit",
default="copybot",
)
parser.add_argument(
"--label",
help="Label to set in Gerrit (can be passed multiple times)",
action="append",
dest="labels",
default=[],
)
parser.add_argument(
"--re",
help="Reviewer to set in Gerrit (can be passed multiple times)",
action="append",
dest="reviewers",
default=[],
)
parser.add_argument(
"--cc",
help="CC to set in Gerrit (can be passed multiple times)",
action="append",
dest="ccs",
default=[],
)
parser.add_argument(
"--push-option",
help="Add downstream push option (can be passed multiple times)",
action="append",
dest="push_options",
default=[],
)
parser.add_argument(
"--ht",
help="Hashtag to set in Gerrit (can be passed multiple times)",
action="append",
dest="hashtags",
default=[],
)
parser.add_argument(
"--json-out",
type=pathlib.Path,
help="Write JSON result to this file.",
)
parser.add_argument(
"--dry-run",
help="Don't push",
action="store_true",
)
parser.add_argument(
"--prepend-subject",
help="Prepend the subject of commits made with this string",
default="",
)
parser.add_argument(
"--exclude-file-pattern",
help="Exclude changes to files matched by these path regexes",
action="append",
dest="exclude_file_patterns",
default=[],
)
parser.add_argument(
"--merge-conflict-behavior",
help="How to handle merge conflicts",
default="SKIP",
choices=[behavior.name for behavior in MergeConflictBehavior],
)
parser.add_argument(
"--add-signed-off-by",
help="Add Signed-off-by pseudoheader to commit messages",
action="store_true",
)
parser.add_argument(
"--keep-pseudoheader",
help="Keep a pseudoheader from being prefixed",
action="append",
dest="keep_pseudoheaders",
default=[],
)
parser.add_argument(
"--limit",
type=int,
default=500,
help="Maximum number of CLs to downstream at once. 0 for infinite.",
)
parser.add_argument(
"--upstream-history-limit",
type=int,
default=250,
help="Maximum number of CLs in upstream history to check. 0 for"
" infinite.",
)
parser.add_argument(
"--downstream-history-limit",
type=int,
default=0,
help="Maximum number of CLs in upstream history to check. 0 for"
" infinite.",
)
parser.add_argument(
"--include-downstream",
action="append",
default=[],
help="Downstream include paths (relative to the subtree) separated by"
" colons. Note: Only supported with subtrees",
)
parser.add_argument(
"--add-pseudoheader",
action="append",
default=[],
dest="add_pseudoheaders",
help="Pseudoheaders to be added to the commit message",
)
parser.add_argument(
"--skip-job-name",
action="append",
default=[],
help="Skip CLs in upstream copied from the specified job name",
)
parser.add_argument(
"--upstream-history-starts-with",
help="Commit hash to start comparing history from",
default="",
)
parser.add_argument(
"--downstream-history-starts-with",
help="Commit hash to start comparing history from",
default="",
)
parser.add_argument(
"upstream",
help="Upstream Git URL, optionally with a branch and subtree separated"
" by colons",
)
parser.add_argument(
"downstream",
help="Downstream Git URL, optionally with a branch and subtree"
"separated by colons",
)
args = parser.parse_args(argv)
logging.basicConfig(
format="%(asctime)s %(levelname)s: %(message)s",
level=logging.INFO,
)
if 0 < args.downstream_history_limit < args.upstream_history_limit:
logger.warning(
"Using a lower downstream limit than upstream limit may cause"
" previously downstreamed changes to be chosen again."
)
err = None
try:
with tempfile.TemporaryDirectory(
".copybot"
) as git_dir, tempfile.TemporaryDirectory("_patches") as patch_dir:
run_copybot(args, git_dir, patch_dir)
except Exception as e:
err = e
raise
finally:
if args.json_out:
write_json_error(args.json_out, err)
if __name__ == "__main__":
main()