blob: d26327c02f6c4be6f44afc8febafd3aff753cd7c [file] [log] [blame] [edit]
#!/usr/bin/env python3
#
# Copyright 2024 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# This script is a slightly fancier version of `git cherry`, which can be used
# to find commits which have been applied locally but haven't been applied
# upstream.
#
# Like `git cherry`, this script takes arguments for an upstream, a head, and
# a limit. If the head argument is omitted, HEAD is used.
#
# The script will do the following:
# 1. Get all of the commits which are present between limit and head.
# 2. For each commit:
# 1. Check if the commit is present upstream by:
# 1. Checking if a commit with an identical hash is present upstream.
# 2. Checking if a commit with the exact same contents is present
# upstream.
# 3. Checking if there are upstream commit hashes in the commit
# message.
# 4. Checking for an upstream commit with the exact same subject line.
# 2. Output the commit hash, the upstream commit hash (if present),
# whether an exact content match was found, and the commit subject.
#
# The output will be a tab-separated table in commit order.
#
# In order for the upstream subject lookups to be fast, the commit history
# between limit and upstream is read and the subjects are put into a hashmap.
# This default <limit>..<upstream> commit range should almost always be fine,
# but a different revision range can be passed via the
# subject-map-revision-range argument if it's expected that there are relevant
# commits elsewhere.
#
# Usage:
# If you were upgrading from kernel version 6.1 to 6.6, you could run this
# script as follows:
#
# python3 get_non_upstreamed_commits.py \
# --upstream cos/upstream-6.6 \
# --head cos-6.6 \
# --limit cos/upstream-6.6 > commits.tsv
#
# Using the abbreviated arguments (--head has no short form, because -h is
# reserved for --help):
#
# python3 get_non_upstreamed_commits.py \
# -u cos/upstream-6.6 --head cos-6.6 -l cos/upstream-6.6 > commits.tsv
import argparse
import dataclasses
import logging
import re
from subprocess import PIPE, Popen, run
import sys
from typing import Mapping, Optional
# Splits a "<hash> <subject>" line at its first space into (hash, subject).
COMMIT_SUBJECT_RE = re.compile(r'(.*?) (.*)')
# Captures the hash from text such as "Upstream commit <hash>". The hash must
# be a standalone run of at least seven hex digits; otherwise a stray hex
# letter inside an ordinary word (the "d" in "id", the "e" in "pending") would
# be reported as the upstream commit hash.
UPSTREAM_COMMIT_PATTERN_1 = re.compile(
    r'Upstream commit.*?\b([a-f0-9]{7,})\b'
)
# Captures the hash from text of the form "UPSTREAM(<hash>)".
UPSTREAM_COMMIT_PATTERN_2 = re.compile(r'UPSTREAM\(([a-f0-9]+)\)')
@dataclasses.dataclass
class Commit:
    """A git commit together with the data needed to find its upstream twin.

    Attributes:
      _hash: The hash of the commit.
      subject: The subject line of the commit.
      message: The full commit message.
      has_exact_content_match: Whether a commit with the exact same contents
        was found upstream.
    """

    _hash: str
    subject: str
    message: str
    has_exact_content_match: bool

    def get_upstream_commit_hashes(
        self,
        subject_to_commit_map: Mapping[str, list[str]],
    ) -> list[str]:
        """Finds the hashes of upstream commits that "match" this commit.

        An explicit upstream reference in the commit message wins: the first
        pattern that matches yields a single-element result. Only when the
        message carries no such reference is the subject line looked up in
        the supplied map.

        Args:
          subject_to_commit_map: A mapping from commit subjects to the list
            of hashes of the commits carrying that subject.

        Returns:
          The hashes of every matching commit; empty when nothing matches.
        """
        logging.info(
            f'Getting upstream hashes for commit {self._hash} ({self.subject})'
        )

        # Patterns are tried in priority order; the first hit short-circuits.
        reference_patterns = (
            (UPSTREAM_COMMIT_PATTERN_1, 'Matched upstream pattern 1'),
            (UPSTREAM_COMMIT_PATTERN_2, 'Matched upstream pattern 2'),
        )
        for pattern, note in reference_patterns:
            found = pattern.search(self.message)
            if found is not None:
                logging.info(note)
                return [found.group(1)]

        # No explicit reference: fall back to the subject lookup, which is
        # only possible when a non-empty map was supplied.
        if not subject_to_commit_map:
            return []
        logging.info(f'Checking subject: {self.subject}')
        return subject_to_commit_map.get(self.subject, [])
def git_cherry(upstream: str, head: str, limit: str) -> list[Commit]:
    """Gets commits corresponding to `git cherry -v upstream head limit`.

    See https://git-scm.com/docs/git-cherry for more information.

    Args:
        upstream: The <upstream> argument passed to `git cherry`.
        head: The <head> argument passed to `git cherry`.
        limit: The <limit> argument passed to `git cherry`.

    Returns:
        One Commit per parseable output line, in output order. A commit's
        has_exact_content_match is True when its line is marked with '-'.
    """
    commits = []
    with Popen(
        ['git', 'cherry', '-v', upstream, head, limit], stdout=PIPE
    ) as commit_lines:
        for raw_line in commit_lines.stdout:
            try:
                line = raw_line.decode('utf-8').strip()
            except UnicodeDecodeError as e:
                # Best effort: a line that is not valid UTF-8 is reported and
                # skipped rather than aborting the whole run.
                logging.warning(e)
                continue

            # Expected shape: "<marker> <hash> <subject>". The subject may be
            # empty, in which case only two fields are present.
            fields = line.split(maxsplit=2)
            if len(fields) < 2:
                if line:
                    logging.warning(
                        f'Skipping unparseable git cherry line: {line!r}'
                    )
                continue
            marker, _hash = fields[0], fields[1]
            subject = fields[2] if len(fields) > 2 else ''

            commits.append(
                Commit(
                    _hash=_hash,
                    subject=subject,
                    message=_get_commit_message(_hash),
                    has_exact_content_match=(marker == '-'),
                )
            )
    return commits
def _get_commit_message(commit_hash: str) -> str:
    """Returns the message of a single commit as reported by `git log`.

    Args:
        commit_hash: The revision whose message is read.

    Returns:
        The output of `git log --format=%B -n 1 <commit_hash>` decoded as
        UTF-8. Bytes that are not valid UTF-8 are replaced with U+FFFD
        instead of raising, so one oddly-encoded message cannot abort the
        whole run.
    """
    raw_message = run(
        ['git', 'log', '--format=%B', '-n', '1', commit_hash], stdout=PIPE
    ).stdout
    return raw_message.decode('utf-8', errors='replace')
def get_subject_to_commit_map(revision_range: str) -> dict[str, list[str]]:
    """Builds a lookup from commit subject to commit hashes for a range.

    Args:
      revision_range: The commits whose subjects are loaded. It is handed to
        `git log` unchanged, so <commit1>..<commit2> loads the commits
        between commit1 and commit2.

    Returns:
      A dict keyed by commit subject; each value lists the hashes of the
      commits with that subject, in the order `git log` printed them.
      Usually there is a single hash per subject, but unrelated commits can
      share a subject.
    """
    hashes_by_subject = {}
    log_command = ['git', 'log', '--format=%H %s', revision_range]
    with Popen(log_command, stdout=PIPE) as git_log:
        for raw_line in git_log.stdout:
            try:
                text = raw_line.decode('utf-8').strip()
            except Exception as e:
                # Undecodable lines are reported and skipped.
                logging.warning(e)
                continue

            parsed = COMMIT_SUBJECT_RE.match(text)
            if parsed is None:
                # Not of the form "<hash> <subject>"; nothing to record.
                continue

            commit_hash, subject = parsed.groups()
            # Several commits may share one subject, so collect all of them.
            hashes_by_subject.setdefault(subject, []).append(commit_hash)

    logging.info(f'mapped {len(hashes_by_subject)} commit subjects')
    return hashes_by_subject
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # Command-line interface: (flags, keyword options) in registration order.
    argument_specs = (
        (('-u', '--upstream'), {'type': str, 'required': True}),
        (('--head',), {'type': str, 'default': 'HEAD', 'required': False}),
        (('-l', '--limit'), {'type': str, 'required': True}),
        (
            ('-s', '--subject-map-revision-range'),
            {'type': str, 'required': False},
        ),
    )
    parser = argparse.ArgumentParser(
        prog='get-non-upstreamed-commits',
        description=(
            "Finds commits which have been applied locally but haven't been"
            " applied upstream"
        ),
    )
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)
    args = parser.parse_args()

    # Step 1: collect the local commits reported by `git cherry`.
    logging.info('Finding COS-specific commits...')
    commits = git_cherry(args.upstream, args.head, args.limit)
    logging.info(f'Done. Found {len(commits)} commits.')

    # Step 2: index upstream subjects. Unless a range was given explicitly,
    # <limit>..<upstream> is used.
    logging.info('Getting map from commit subjects to commit hashes...')
    subject_map_revision_range = args.subject_map_revision_range
    if not subject_map_revision_range:
        subject_map_revision_range = f'{args.limit}..{args.upstream}'
    subject_to_commit_map = get_subject_to_commit_map(
        subject_map_revision_range
    )
    logging.info('Done.')

    # Step 3: emit a tab-separated table with one row per
    # (commit, upstream commit) pair.
    sys.stdout.write('Hash\tUpstream Hash\tHas Exact Content Match\tSubject\n')
    for commit in commits:
        upstream_hashes = commit.get_upstream_commit_hashes(
            subject_to_commit_map
        )
        if len(upstream_hashes) > 1:
            warning_text = (
                f'Found multiple upstream commits for commit {commit._hash}:'
                f' {commit.subject}'
            )
            logging.warning(warning_text)
        # We output the string 'None' if we didn't find any matching commits.
        for upstream_hash in upstream_hashes or ['None']:
            row = (
                commit._hash,
                upstream_hash,
                str(commit.has_exact_content_match),
                commit.subject,
            )
            sys.stdout.write('\t'.join(row) + '\n')