| #!/usr/bin/env python3 |
| # |
| # Copyright 2024 The ChromiumOS Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| # This script is a slightly fancier version of `git cherry`, which can be used |
| # to find commits which have been applied locally but haven't been applied |
| # upstream. |
| # |
| # Like `git cherry`, this script takes arguments for an upstream, a head, and |
| # a limit. If the head argument is omitted, HEAD is used. |
| # |
| # The script will do the following: |
| # 1. Get all of the commits which are present between limit and head. |
| # 2. For each commit: |
| # 1. Check if the commit is present upstream by: |
| # 1. Checking if a commit with an identical hash is present upstream. |
| # 2. Checking if a commit with the exact same contents is present |
| # upstream. |
| # 3. Checking if there are upstream commit hashes in the commit |
| # message. |
| # 4. Checking for an upstream commit with the exact same subject line. |
| # 2. Output the commit hash, the upstream commit hash (if present), |
| # whether an exact content match was found, and the commit subject. |
| # |
| # The output will be a tab-separated table in commit order. |
| # |
| # In order for the upstream subject lookups to be fast, the commit history |
| # between limit and upstream is read and the subjects are put into a hashmap. |
| # This default <limit>..<upstream> commit range should almost always be fine, |
| # but a different revision range can be passed via the |
| # subject-map-revision-range argument if it's expected that there are relevant |
| # commits elsewhere. |
| # |
| # Usage: |
| # If you were upgrading from kernel version 6.1 to 6.6, you could run this |
| # script as follows: |
| # |
| # python3 get_non_upstreamed_commits.py \ |
| # --upstream cos/upstream-6.6 \ |
| # --head cos-6.6 \ |
| # --limit cos/upstream-6.6 > commits.tsv |
| # |
| # Using the abbreviated arguments (--head has no short form, because -h is |
| # reserved by argparse for --help): |
| # |
| # python3 get_non_upstreamed_commits.py \ |
| # -u cos/upstream-6.6 --head cos-6.6 -l cos/upstream-6.6 > commits.tsv |
| |
| import argparse |
| import dataclasses |
| import logging |
| import re |
| from subprocess import PIPE, Popen, run |
| import sys |
| from typing import Mapping, Optional |
| |
# Splits a "<hash> <subject>" line (as produced by `git log --format='%H %s'`)
# at the first space into its hash and subject parts.
COMMIT_SUBJECT_RE = re.compile(r'(.*?) (.*)')
# Matches an "Upstream commit ... <hash>" reference on a single line of a
# commit message. The hash must be a standalone run of 7-64 hex digits;
# otherwise a stray hex letter in ordinary prose (for example the "e" in
# "Upstream commit message") would be captured as if it were a hash.
UPSTREAM_COMMIT_PATTERN_1 = re.compile(
    r'Upstream commit.*?\b([a-f0-9]{7,64})\b'
)
# Matches an "UPSTREAM(<hash>)" reference in a commit message.
UPSTREAM_COMMIT_PATTERN_2 = re.compile(r'UPSTREAM\(([a-f0-9]+)\)')
| |
| |
@dataclasses.dataclass
class Commit:
    """A local git commit together with what `git cherry` reported about it."""

    # Hash of the commit.
    _hash: str
    # Subject (first line) of the commit.
    subject: str
    # Full commit message, which is searched for upstream references.
    message: str
    # Whether an upstream commit with identical contents was found.
    has_exact_content_match: bool

    def get_upstream_commit_hashes(
        self,
        subject_to_commit_map: Mapping[str, list[str]],
    ) -> list[str]:
        """Finds the upstream commits that correspond to this commit.

        An explicit upstream reference in the commit message takes priority:
        the first pattern that matches yields a single hash. Only when the
        message has no such reference is the subject looked up in
        `subject_to_commit_map`.

        Args:
          subject_to_commit_map: Maps a commit subject to the hashes of all
            commits carrying that subject.

        Returns:
          The hashes of the matching upstream commits; empty when nothing
          matches.
        """
        logging.info(
            f'Getting upstream hashes for commit {self._hash} ({self.subject})'
        )

        explicit_ref = UPSTREAM_COMMIT_PATTERN_1.search(self.message)
        if explicit_ref is not None:
            logging.info('Matched upstream pattern 1')
            return [explicit_ref.group(1)]

        tagged_ref = UPSTREAM_COMMIT_PATTERN_2.search(self.message)
        if tagged_ref is not None:
            logging.info('Matched upstream pattern 2')
            return [tagged_ref.group(1)]

        # No reference in the message: fall back to the subject lookup, which
        # is only possible when a non-empty map was supplied.
        if not subject_to_commit_map:
            return []

        logging.info(f'Checking subject: {self.subject}')
        return subject_to_commit_map.get(self.subject, [])
| |
| |
def git_cherry(upstream: str, head: str, limit: str) -> list[Commit]:
    """Gets commits corresponding to `git cherry -v upstream head limit`.

    See https://git-scm.com/docs/git-cherry for more information.

    Args:
        upstream: The upstream revision passed to `git cherry`.
        head: The head revision passed to `git cherry`.
        limit: The limit revision passed to `git cherry`.

    Returns:
        One Commit per usable output line, in output order. Lines that cannot
        be decoded as UTF-8 or that lack a hash are logged and skipped. A
        commit with an empty subject is kept with subject ''.
    """
    commits = []

    with Popen(
        ['git', 'cherry', '-v', upstream, head, limit], stdout=PIPE
    ) as cherry_proc:
        for raw_line in cherry_proc.stdout:
            try:
                line = raw_line.decode('utf-8').strip()
            except UnicodeDecodeError as e:
                logging.warning(e)
                continue

            # Each line is "<marker> <hash> <subject>". The subject may be
            # missing entirely, so do not unpack three fields blindly.
            fields = line.split(maxsplit=2)
            if len(fields) < 2:
                if fields:
                    logging.warning(
                        'Skipping malformed git cherry line: %r', line
                    )
                continue
            marker, _hash = fields[0], fields[1]
            subject = fields[2] if len(fields) == 3 else ''

            commits.append(
                Commit(
                    _hash=_hash,
                    subject=subject,
                    message=_get_commit_message(_hash),
                    # `git cherry` prefixes commits that have an equivalent
                    # upstream with '-'.
                    has_exact_content_match=marker == '-',
                )
            )

    # Leaving the `with` block waits for git, so its exit status is known here.
    if cherry_proc.returncode != 0:
        logging.error(
            'git cherry exited with status %s; results may be incomplete',
            cherry_proc.returncode,
        )

    return commits
| |
| |
def _get_commit_message(commit_hash: str) -> str:
    """Returns the full message of a commit, as printed by `git log --format=%B`.

    Args:
        commit_hash: The revision whose message is read.

    Returns:
        The commit message. Bytes that are not valid UTF-8 are replaced with
        U+FFFD instead of raising, so one oddly encoded commit cannot abort
        the whole run.
    """
    completed = run(
        ['git', 'log', '--format=%B', '-n', '1', commit_hash], stdout=PIPE
    )
    return completed.stdout.decode('utf-8', errors='replace')
| |
| |
def get_subject_to_commit_map(revision_range: str) -> dict[str, list[str]]:
    """Gets the map from commit subjects to their hashes for a revision range.

    Args:
        revision_range: The range of commits whose subjects will be loaded. The
            interpretation is the same as `git log`, i.e., <commit1>..<commit2>
            will load the commits between commit1 and commit2.

    Returns:
        A dict from commit subjects to lists of hashes of commits with that
        subject. Most subjects will only have one corresponding hash, but
        sometimes two unrelated commits may have the same subject. Lines that
        cannot be decoded as UTF-8 or that have no subject are skipped.
    """
    mapping: dict[str, list[str]] = {}
    with Popen(
        ['git', 'log', '--format=%H %s', revision_range], stdout=PIPE
    ) as log_proc:
        for raw_line in log_proc.stdout:
            try:
                line = raw_line.decode('utf-8').strip()
            except UnicodeDecodeError as e:
                logging.warning(e)
                continue

            parsed = COMMIT_SUBJECT_RE.match(line)
            if parsed is None:
                continue
            commit_hash, subject = parsed.groups()
            # A subject can have multiple commits, so we keep them all.
            mapping.setdefault(subject, []).append(commit_hash)

    # Leaving the `with` block waits for git, so its exit status is known
    # here. A failure (e.g. an invalid revision range) would otherwise look
    # like a valid but empty history.
    if log_proc.returncode != 0:
        logging.error(
            'git log exited with status %s for range %r; subject map may be'
            ' incomplete',
            log_proc.returncode,
            revision_range,
        )

    logging.info(f'mapped {len(mapping)} commit subjects')
    return mapping
| |
| |
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # Command-line interface: --upstream and --limit are mandatory, --head
    # defaults to HEAD, and the subject-map range is optional.
    arg_parser = argparse.ArgumentParser(
        description=(
            "Finds commits which have been applied locally but haven't been"
            " applied upstream"
        ),
        prog='get-non-upstreamed-commits',
    )
    arg_parser.add_argument('-u', '--upstream', type=str, required=True)
    arg_parser.add_argument(
        '--head', type=str, default='HEAD', required=False
    )
    arg_parser.add_argument('-l', '--limit', type=str, required=True)
    arg_parser.add_argument(
        '-s', '--subject-map-revision-range', type=str, required=False
    )
    cli = arg_parser.parse_args()

    logging.info('Finding COS-specific commits...')
    local_commits = git_cherry(cli.upstream, cli.head, cli.limit)
    logging.info(f'Done. Found {len(local_commits)} commits.')

    logging.info('Getting map from commit subjects to commit hashes...')
    # Unless a range was given explicitly, index the subjects of the commits
    # in <limit>..<upstream>.
    if cli.subject_map_revision_range:
        revision_range = cli.subject_map_revision_range
    else:
        revision_range = f'{cli.limit}..{cli.upstream}'
    subjects_to_hashes = get_subject_to_commit_map(revision_range)
    logging.info('Done.')

    # Emit a tab-separated table: one row per (local commit, upstream match)
    # pair, in the order the local commits were found.
    sys.stdout.write('Hash\tUpstream Hash\tHas Exact Content Match\tSubject\n')
    for local_commit in local_commits:
        upstream_matches = local_commit.get_upstream_commit_hashes(
            subjects_to_hashes
        )
        if len(upstream_matches) > 1:
            logging.warning(
                f'Found multiple upstream commits for commit {local_commit._hash}:'
                f' {local_commit.subject}'
            )
        # We output the string 'None' if we didn't find any matching commits.
        for upstream_match in upstream_matches or ['None']:
            row = (
                local_commit._hash,
                upstream_match,
                str(local_commit.has_exact_content_match),
                local_commit.subject,
            )
            sys.stdout.write('\t'.join(row) + '\n')