scripts: Add get_cos_specific_kernel_commits.py
Adds a script to get commits which are specific to the cos kernel
sources and which do not exist in the upstream kernel branch. This is
intended to be used during the annual kernel upgrade.
BUG=b/339879422
TEST=Ran `python3 get_cos_specific_kernel_commits.py -p 6.1 -n 6.6 >
commits.tsv` and manually verified output
RELEASE_NOTE=None
Change-Id: I1deee4f955d096afefa5d480d5848573009281eb
Reviewed-on: https://cos-review.googlesource.com/c/cos/scripts/+/72873
Tested-by: Cusky Presubmit Bot <presubmit@cos-infra-prod.iam.gserviceaccount.com>
Reviewed-by: Oleksandr Tymoshenko <ovt@google.com>
diff --git a/get_cos_specific_kernel_commits.py b/get_cos_specific_kernel_commits.py
new file mode 100755
index 0000000..cd45362
--- /dev/null
+++ b/get_cos_specific_kernel_commits.py
@@ -0,0 +1,231 @@
+#!/usr/bin/env python3
+#
+# Copyright 2024 The ChromiumOS Authors
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# This script can be used to find commits which are present in the COS kernel
+# branch, but not the upstream kernel branch.
+# The script takes the new kernel version and previous kernel version as
+# arguments and does the following:
+#
+# 1. Gets all of the commits which are specific to COS by listing the commits
+# which are in the `cos-$PREVIOUS_KERNEL_VERSION` branch but not in the
+# `upstream-$KERNEL_VERSION` branch.
+# 2. For each commit:
+# 1. Checks if the commit is present upstream by:
+# 1. Checking if a commit with the exact same contents is present
+# upstream.
+# 2. Checking if there are upstream commit hashes in the commit
+# message.
+# 3. Checking for an upstream commit with the exact same subject line.
+# 2. Outputs the commit hash, the upstream commit hash (if present),
+# whether an exact content match was found, and the commit subject.
+#
+# The output will be a tab-separated table in commit order.
+#
+# Usage:
+# If you were upgrading from kernel version 6.1 to 6.6, you could run this
+# script as follows:
+#
+# python3 get_cos_specific_kernel_commits.py -p 6.1 -n 6.6 > commits.tsv
+
+import argparse
+import dataclasses
+import logging
+import re
+from subprocess import PIPE, Popen, run
+import sys
+from typing import Mapping, Optional
+
+COMMIT_SUBJECT_RE = re.compile(r'(.*?) (.*)')  # '<hash> <subject>'
+UPSTREAM_COMMIT_PATTERN_1 = re.compile(r'Upstream commit.*?\b([a-f0-9]{7,})\b')
+UPSTREAM_COMMIT_PATTERN_2 = re.compile(r'UPSTREAM\(([a-f0-9]+)\)')
+
+
+@dataclasses.dataclass
+class Commit:
+  """A git commit plus whether `git cherry` found the same change upstream."""
+
+  _hash: str
+  subject: str
+  message: str
+  has_exact_content_match: bool
+
+  def get_upstream_commit_hashes(
+      self, subject_to_commit_map: Mapping[str, list[str]]
+  ) -> list[str]:
+    """Finds the upstream commits that correspond to this commit.
+
+    A hash referenced in the commit message wins: UPSTREAM_COMMIT_PATTERN_1 is
+    tried first, then UPSTREAM_COMMIT_PATTERN_2. Only when neither matches is
+    the subject looked up in `subject_to_commit_map`.
+
+    Args:
+      subject_to_commit_map: Maps a commit subject to the hashes of the
+        commits that have that subject.
+
+    Returns:
+      The matching commit hashes, or an empty list if there are none.
+    """
+    logging.info(
+        f'Getting upstream hashes for commit {self._hash} ({self.subject})'
+    )
+    referenced = UPSTREAM_COMMIT_PATTERN_1.search(self.message)
+    if referenced is not None:
+      logging.info('Matched upstream pattern 1')
+      return [referenced.group(1)]
+    referenced = UPSTREAM_COMMIT_PATTERN_2.search(self.message)
+    if referenced is not None:
+      logging.info('Matched upstream pattern 2')
+      return [referenced.group(1)]
+    if not subject_to_commit_map:
+      return []
+    logging.info(f'Checking subject: {self.subject}')
+    return subject_to_commit_map.get(self.subject, [])
+
+
+def get_cos_specific_commits(
+    previous_kernel_version: str, new_kernel_version: str, origin: str
+) -> list[Commit]:
+  """Lists what the previous COS branch added on top of its upstream base.
+
+  Every commit is flagged with whether `git cherry` found an equivalent change
+  in the new upstream branch. For example,
+  get_cos_specific_commits('6.1', '6.6', 'cos') walks the commits between
+  cos/upstream-6.1 and cos/cos-6.1 and compares them with cos/upstream-6.6.
+  """
+  # Argument roles follow `git cherry <upstream> <head> <limit>`.
+  return _git_cherry(
+      upstream=f'{origin}/upstream-{new_kernel_version}',
+      head=f'{origin}/cos-{previous_kernel_version}',
+      limit=f'{origin}/upstream-{previous_kernel_version}',
+  )
+
+
+def _git_cherry(upstream: str, head: str, limit: str) -> list[Commit]:
+  """Parses `git cherry -v upstream head limit` output into Commit objects.
+
+  Each output line is `<sign> <hash> <subject>`; a `-` sign means an equivalent
+  change already exists in `upstream`. See https://git-scm.com/docs/git-cherry.
+
+  Raises:
+    subprocess.CalledProcessError: If `git cherry` exits with a non-zero status.
+  """
+  # check=True: a failing git (e.g. an unknown branch) must not look like an
+  # empty result.
+  cmd = ['git', 'cherry', '-v', upstream, head, limit]
+  output = run(cmd, stdout=PIPE, check=True).stdout
+
+  commits = []
+  for raw_line in output.split(b'\n'):
+    try:
+      line = raw_line.decode('utf-8').strip()
+    except UnicodeDecodeError as e:
+      # Keep the commit in the report instead of silently dropping it.
+      logging.warning(e)
+      line = raw_line.decode('utf-8', errors='replace').strip()
+    fields = line.split(maxsplit=2)
+    if len(fields) < 2:
+      continue  # Blank line, e.g. after the final newline.
+    sign, _hash = fields[:2]
+    subject = fields[2] if len(fields) > 2 else ''  # Subject may be empty.
+    message = _get_commit_message(_hash)
+    commits.append(Commit(_hash, subject, message, sign == '-'))
+
+  return commits
+
+
+def _get_commit_message(commit_hash: str) -> str:
+  """Returns the full commit message; undecodable bytes become U+FFFD."""
+  cmd = ['git', 'log', '--format=%B', '-n', '1', commit_hash]
+  return run(cmd, stdout=PIPE, check=True).stdout.decode('utf-8', 'replace')
+
+
+def get_subject_to_commit_map(
+    branch: str, max_commits: Optional[int] = None
+) -> dict[str, list[str]]:
+  """Gets the map from commit subjects to their hashes on a specific branch.
+
+  Args:
+    branch: The branch for which the mapping will be collected.
+    max_commits: The maximum number of commits to read. If None, all commits
+      are read. Setting this to a smaller value saves a significant amount of
+      time for large repositories.
+
+  Returns:
+    A dict from each commit subject to the hashes of all commits with that
+    subject (usually one, but unrelated commits may share a subject).
+
+  Raises:
+    subprocess.CalledProcessError: If `git log` exits with a non-zero status.
+  """
+  from subprocess import CalledProcessError  # Not among the file's imports.
+
+  cmd = ['git', 'log', '--format=%H %s']
+  if max_commits is not None:
+    # Have git stop by itself so that its exit status stays meaningful.
+    cmd.append(f'--max-count={max(max_commits, 0)}')
+  cmd.append(branch)
+
+  mapping = {}
+  with Popen(cmd, stdout=PIPE) as git_log:
+    for raw_line in git_log.stdout:
+      # Replace undecodable bytes rather than dropping the commit.
+      line = raw_line.decode('utf-8', errors='replace').strip()
+      commit, separator, subject = line.partition(' ')
+      if separator:
+        mapping.setdefault(subject, []).append(commit)
+  if git_log.returncode != 0:
+    raise CalledProcessError(git_log.returncode, cmd)
+  logging.info(f'mapped {len(mapping)} commit subjects')
+  return mapping
+
+
+if __name__ == '__main__':
+  # Logging goes to stderr by default, so stdout carries only the table.
+  logging.basicConfig(level=logging.INFO)
+
+  parser = argparse.ArgumentParser(
+      prog='get-cos-specific-kernel-commits',
+      description=(
+          'Finds commits which are in the COS kernel branch, but not the'
+          ' upstream kernel branch'
+      ),
+  )
+  parser.add_argument('-n', '--new-kernel-version', required=True)
+  parser.add_argument('-p', '--previous-kernel-version', required=True)
+  # `const` makes a bare `-o` fall back to the default remote; without it
+  # argparse would store None and the branch names would start with 'None/'.
+  parser.add_argument('-o', '--origin', default='cos', const='cos', nargs='?')
+  args = parser.parse_args()
+
+  logging.info('Finding COS-specific commits...')
+  commits = get_cos_specific_commits(
+      args.previous_kernel_version, args.new_kernel_version, args.origin
+  )
+  logging.info(f'Done. Found {len(commits)} commits.')
+
+  logging.info('Getting map from commit subjects to commit hashes...')
+  # Get the subject mapping for the 100,000 most recent commits. Hopefully
+  # we're not more behind than that at any point.
+  new_upstream = f'{args.origin}/upstream-{args.new_kernel_version}'
+  subject_to_commit_map = get_subject_to_commit_map(new_upstream, 100000)
+  logging.info('Done.')
+
+  # Find the upstream commits for each commit, if present, and output the
+  # results as a tab-separated table.
+  sys.stdout.write('Hash\tUpstream Hash\tHas Exact Content Match\tSubject\n')
+  for commit in commits:
+    upstream_hashes = commit.get_upstream_commit_hashes(subject_to_commit_map)
+    if len(upstream_hashes) > 1:
+      logging.warning(
+          f'Found multiple upstream commits for commit {commit._hash}:'
+          f' {commit.subject}'
+      )
+    # We output the string 'None' if we didn't find any matching commits.
+    hashes_to_print = upstream_hashes or ['None']
+    for upstream_hash in hashes_to_print:
+      sys.stdout.write(
+          f'{commit._hash}\t{upstream_hash}\t{commit.has_exact_content_match}\t{commit.subject}\n'
+      )