#!/usr/bin/env python3
#
# Copyright 2024 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# This script can be used to find commits which are present in the COS kernel
# branch, but not the upstream kernel branch.
# The script takes the new kernel version and previous kernel version as
# arguments and does the following:
#
# 1. Gets all of the commits which are specific to COS by listing the commits
# which are in the `cos-$PREVIOUS_KERNEL_VERSION` branch but not in the
# `upstream-$KERNEL_VERSION` branch.
# 2. For each commit:
#    1. Checks if the commit is present upstream by:
#       1. Checking if a commit with the exact same contents is present
#          upstream.
#       2. Checking if there are upstream commit hashes in the commit
#          message.
#       3. Checking for an upstream commit with the exact same subject line.
#    2. Outputs the commit hash, the upstream commit hash (if present),
#       whether an exact content match was found, and the commit subject.
#
# The output will be a tab-separated table in commit order.
#
# Usage:
# If you were upgrading from kernel version 6.1 to 6.6, you could run this
# script as follows:
#
# python3 get_cos_specific_kernel_commits.py -p 6.1 -n 6.6 > commits.tsv
import argparse
import dataclasses
import logging
import re
from subprocess import PIPE, Popen, run
import sys
from typing import Mapping, Optional
# Splits a "<hash> <subject>" line at the first space: group 1 is everything
# before that space, group 2 is the remainder of the line.
COMMIT_SUBJECT_RE = re.compile(r'(.*?) (.*)')
# Matches "Upstream commit" followed (lazily) by the first run of lowercase hex
# characters, which is captured as the upstream hash.
# NOTE(review): because the wildcard is lazy and the hex run has no minimum
# length, a hex letter inside an intervening word would be captured instead of
# the real hash -- confirm the commit-message conventions rule this out.
UPSTREAM_COMMIT_PATTERN_1 = re.compile(r'Upstream commit.*?([a-f0-9]+)')
# Matches "UPSTREAM(<hash>)" and captures the hash. A raw string is required
# here: "\(" in a normal string literal is an invalid escape sequence.
UPSTREAM_COMMIT_PATTERN_2 = re.compile(r'UPSTREAM\(([a-f0-9]+)\)')
@dataclasses.dataclass
class Commit:
    """A git commit plus the metadata needed to look it up upstream."""

    # Hash identifying the commit.
    _hash: str
    # Subject of the commit; used as the key for subject-based lookups.
    subject: str
    # Commit message text; searched for references to upstream commits.
    message: str
    # Whether an exact content match for this commit was found.
    has_exact_content_match: bool

    def get_upstream_commit_hashes(
        self,
        subject_to_commit_map: Mapping[str, list[str]],
    ) -> list[str]:
        """Finds the upstream commits that correspond to this commit.

        The lookup stops at the first source that yields a result:

        1. A hash captured by UPSTREAM_COMMIT_PATTERN_1 in the message.
        2. A hash captured by UPSTREAM_COMMIT_PATTERN_2 in the message.
        3. Every hash recorded for this commit's subject in
           `subject_to_commit_map` (only consulted when the map is
           non-empty).

        Args:
            subject_to_commit_map: A mapping from commit subjects to the
                hashes of the commits which have that subject.

        Returns:
            The matching commit hashes, or an empty list when nothing
            matched. For a subject match this is the list object held by the
            mapping itself, not a copy.
        """
        logging.info(
            f'Getting upstream hashes for commit {self._hash} ({self.subject})'
        )

        # An explicit reference in the message wins over a subject match.
        first_reference = UPSTREAM_COMMIT_PATTERN_1.search(self.message)
        if first_reference is not None:
            logging.info('Matched upstream pattern 1')
            return [first_reference.group(1)]

        second_reference = UPSTREAM_COMMIT_PATTERN_2.search(self.message)
        if second_reference is not None:
            logging.info('Matched upstream pattern 2')
            return [second_reference.group(1)]

        # Nothing to look the subject up in.
        if not subject_to_commit_map:
            return []

        logging.info(f'Checking subject: {self.subject}')
        return subject_to_commit_map.get(self.subject, [])
def get_cos_specific_commits(
    previous_kernel_version: str, new_kernel_version: str, origin: str
) -> list[Commit]:
    """Lists commits on the previous COS branch that the new upstream lacks.

    Three branches of the `origin` remote are compared via `_git_cherry`:

      * `{origin}/cos-{previous_kernel_version}` supplies the candidates,
      * `{origin}/upstream-{previous_kernel_version}` is the limit,
      * `{origin}/upstream-{new_kernel_version}` is the upstream they are
        checked against.

    For example, this would return all of the commits which were added to
    cos/cos-6.1 starting from cos/upstream-6.1, but which have not been added
    to cos/upstream-6.6:

        get_cos_specific_commits('6.1', '6.6', 'cos')
    """
    return _git_cherry(
        upstream=f'{origin}/upstream-{new_kernel_version}',
        head=f'{origin}/cos-{previous_kernel_version}',
        limit=f'{origin}/upstream-{previous_kernel_version}',
    )
def _git_cherry(upstream: str, head: str, limit: str) -> list[Commit]:
    """Gets commits from the output of `git cherry -v upstream head limit`.

    Each output line is expected to look like `<marker> <hash> <subject>`.
    The full commit message is fetched separately for every listed commit.

    See https://git-scm.com/docs/git-cherry for more information.

    Args:
        upstream: The `<upstream>` argument passed to `git cherry`.
        head: The `<head>` argument passed to `git cherry`.
        limit: The `<limit>` argument passed to `git cherry`.

    Returns:
        One Commit per parsed output line, in output order. Lines that
        cannot be decoded or parsed are logged and skipped.
    """
    commits = []
    with Popen(
        ['git', 'cherry', '-v', upstream, head, limit], stdout=PIPE
    ) as git_cherry:
        for raw_line in git_cherry.stdout:
            try:
                line = raw_line.decode('utf-8').strip()
            except UnicodeDecodeError as e:
                logging.warning(e)
                continue
            fields = line.split(maxsplit=2)
            if len(fields) < 2:
                # Blank or truncated line: there is no hash to work with.
                if line:
                    logging.warning(
                        f'Skipping unparseable git cherry line: {line!r}'
                    )
                continue
            marker, _hash = fields[0], fields[1]
            # A commit with an empty subject yields only two fields.
            subject = fields[2] if len(fields) > 2 else ''
            commits.append(
                Commit(
                    _hash=_hash,
                    subject=subject,
                    message=_get_commit_message(_hash),
                    # `git cherry` marks commits that have an equivalent in
                    # <upstream> with '-', and all others with '+'.
                    has_exact_content_match=marker == '-',
                )
            )
    # Leaving the `with` block waits for git, so returncode is set here.
    if git_cherry.returncode != 0:
        logging.error(
            f'git cherry exited with status {git_cherry.returncode};'
            ' the list of commits may be incomplete'
        )
    return commits
def _get_commit_message(commit_hash: str) -> str:
    """Returns the raw message of a commit via `git log --format=%B -n 1`.

    Args:
        commit_hash: The commit whose message should be fetched.

    Returns:
        The text git printed for the commit. Bytes that are not valid UTF-8
        are replaced with U+FFFD rather than raising. The result is whatever
        git wrote to stdout, which may be empty if git failed.
    """
    result = run(
        ['git', 'log', '--format=%B', '-n', '1', commit_hash], stdout=PIPE
    )
    if result.returncode != 0:
        logging.warning(
            f'git log exited with status {result.returncode} for commit'
            f' {commit_hash}'
        )
    # Decode leniently so one badly encoded message cannot abort the whole
    # run; the other git readers in this file also tolerate undecodable
    # output instead of crashing.
    return result.stdout.decode('utf-8', errors='replace')
def get_subject_to_commit_map(
    branch: str, max_commits: Optional[int] = None
) -> dict[str, list[str]]:
    """Builds a lookup from commit subject to commit hashes for a branch.

    The data comes from `git log --format='%H %s' <branch>`, read line by
    line in the order git prints it.

    Args:
        branch: The branch whose log is read.
        max_commits: Upper bound on the number of log lines to read. If None,
            the whole log is read. A smaller bound saves a significant amount
            of time on large repositories.

    Returns:
        A dict from commit subjects to lists of hashes of commits with that
        subject. Most subjects will only have one corresponding hash, but
        two unrelated commits may share a subject. Lines that cannot be
        decoded, or that do not match COMMIT_SUBJECT_RE, contribute nothing.
    """
    hashes_by_subject = {}
    git_log_command = ['git', 'log', '--format=%H %s', branch]
    with Popen(git_log_command, stdout=PIPE) as git_log:
        for index, raw_line in enumerate(git_log.stdout):
            # Stop reading once the requested number of lines was consumed.
            if max_commits is not None and index >= max_commits:
                break
            try:
                text = raw_line.decode('utf-8').strip()
            except Exception as err:
                logging.warning(err)
                continue
            parsed = COMMIT_SUBJECT_RE.match(text)
            if parsed is None:
                continue
            commit_hash, subject = parsed.groups()
            # Keep every hash seen for a subject, not just the latest one.
            hashes_by_subject.setdefault(subject, []).append(commit_hash)
    logging.info(f'mapped {len(hashes_by_subject)} commit subjects')
    return hashes_by_subject
# Script entry point: collects the COS-specific commits, looks up their
# upstream counterparts and prints the result to stdout as a TSV table.
# Progress is reported through logging rather than stdout.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(
        prog='get-cos-specific-kernel-commits',
        description=(
            'Finds commits which are in the COS kernel branch, but not the'
            ' upstream kernel branch'
        ),
    )
    parser.add_argument('-n', '--new-kernel-version', type=str, required=True)
    parser.add_argument(
        '-p', '--previous-kernel-version', type=str, required=True
    )
    # nargs='?' allows a bare `-o`; const makes that case fall back to 'cos'
    # as well, instead of None (which would produce branch names such as
    # 'None/cos-6.1').
    parser.add_argument(
        '-o', '--origin', default='cos', const='cos', type=str, nargs='?'
    )
    args = parser.parse_args()

    logging.info('Finding COS-specific commits...')
    commits = get_cos_specific_commits(
        args.previous_kernel_version, args.new_kernel_version, args.origin
    )
    logging.info(f'Done. Found {len(commits)} commits.')

    logging.info('Getting map from commit subjects to commit hashes...')
    # Get the subject mapping for the 100,000 most recent commits. Hopefully
    # we're not more behind than that at any point.
    new_upstream = f'{args.origin}/upstream-{args.new_kernel_version}'
    subject_to_commit_map = get_subject_to_commit_map(new_upstream, 100000)
    logging.info('Done.')

    # Find the upstream commits for each commit, if present, and output the
    # results as a tab-separated table.
    sys.stdout.write('Hash\tUpstream Hash\tHas Exact Content Match\tSubject\n')
    for commit in commits:
        upstream_hashes = commit.get_upstream_commit_hashes(
            subject_to_commit_map
        )
        if len(upstream_hashes) > 1:
            logging.warning(
                f'Found multiple upstream commits for commit {commit._hash}:'
                f' {commit.subject}'
            )
        if not upstream_hashes:
            # We output the string 'None' if we didn't find any matching
            # commits.
            upstream_hashes = ['None']
        # One row per upstream candidate, so a commit with several candidates
        # appears on several rows.
        for upstream_hash in upstream_hashes:
            sys.stdout.write(
                f'{commit._hash}\t{upstream_hash}'
                f'\t{commit.has_exact_content_match}\t{commit.subject}\n'
            )