#!/usr/bin/env python3
#
# Copyright 2024 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# This script can be used to find commits which are present in the COS kernel
# branch, but not the upstream kernel branch.
# The script takes the new kernel version and previous kernel version as
# arguments and does the following:
#
# 1.  Gets all of the commits which are specific to COS by listing the commits
#     which are in the `cos-$PREVIOUS_KERNEL_VERSION` branch but not in the
#     `upstream-$KERNEL_VERSION` branch.
# 2.  For each commit:
#     1.  Checks if the commit is present upstream by:
#         1.  Checking if a commit with the exact same contents is present
#             upstream.
#         2.  Checking if there are upstream commit hashes in the commit
#             message.
#         3.  Checking for an upstream commit with the exact same subject line.
#     2.  Outputs the commit hash, the upstream commit hash (if present),
#         whether an exact content match was found, and the commit subject.
#
# The output will be a tab-separated table in commit order.
#
# Usage:
#   If you were upgrading from kernel version 6.1 to 6.6, you could run this
#   script as follows:
#
#   python3 get_cos_specific_kernel_commits.py -p 6.1 -n 6.6 > commits.tsv

import argparse
import dataclasses
import logging
import re
from subprocess import PIPE, Popen, run
import sys
from typing import Mapping, Optional

# Splits a "<hash> <subject>" line (as produced by `git log --format='%H %s'`)
# at the first space into the hash and the rest of the line.
COMMIT_SUBJECT_RE = re.compile(r'(.*?) (.*)')
# Captures the first run of lowercase hex characters that follows the text
# "Upstream commit" on the same line, e.g. "[ Upstream commit <hash> ]".
# NOTE(review): the capture has no minimum length, so any a-f letter after
# "Upstream commit" will match; tighten if false positives show up.
UPSTREAM_COMMIT_PATTERN_1 = re.compile(r'Upstream commit.*?([a-f0-9]+)')
# Captures the hash from an "UPSTREAM(<hash>)" tag. This must be a raw string:
# "\(" is not a valid Python string escape and triggers a SyntaxWarning on
# recent interpreters when written in a plain string literal.
UPSTREAM_COMMIT_PATTERN_2 = re.compile(r'UPSTREAM\(([a-f0-9]+)\)')


@dataclasses.dataclass
class Commit:
  """A git commit plus the metadata this script reports about it.

  Attributes:
    _hash: The hash of the commit.
    subject: The subject line of the commit.
    message: The full commit message, searched for upstream references.
    has_exact_content_match: Whether a commit with the same contents was
      found upstream.
  """

  _hash: str
  subject: str
  message: str
  has_exact_content_match: bool

  def get_upstream_commit_hashes(
      self,
      subject_to_commit_map: Mapping[str, list[str]],
  ) -> list[str]:
    """Finds the upstream commits that correspond to this commit.

    The lookup stops at the first strategy that produces a result:

    1.  A hash captured from the message by UPSTREAM_COMMIT_PATTERN_1.
    2.  A hash captured from the message by UPSTREAM_COMMIT_PATTERN_2.
    3.  Every upstream commit that shares this commit's subject line.

    Args:
      subject_to_commit_map: Maps a commit subject to the hashes of all
        commits carrying that subject. Only consulted when the message does
        not reference an upstream commit explicitly.

    Returns:
      The matching upstream hashes; empty when nothing matched.
    """
    logging.info(
        f'Getting upstream hashes for commit {self._hash} ({self.subject})'
    )

    # Explicit references in the message body take priority, in this order.
    explicit_references = (
        ('Matched upstream pattern 1', UPSTREAM_COMMIT_PATTERN_1),
        ('Matched upstream pattern 2', UPSTREAM_COMMIT_PATTERN_2),
    )
    for log_line, pattern in explicit_references:
      found = pattern.search(self.message)
      if found is not None:
        logging.info(log_line)
        return [found.group(1)]

    # Without a (non-empty) subject map there is nothing left to try.
    if not subject_to_commit_map:
      return []

    logging.info(f'Checking subject: {self.subject}')
    return subject_to_commit_map.get(self.subject, [])


def get_cos_specific_commits(
    previous_kernel_version: str, new_kernel_version: str, origin: str
) -> list[Commit]:
  """Lists the commits COS added to the previous kernel branch.

  Delegates to `_git_cherry` with:
    upstream = <origin>/upstream-<new_kernel_version>
    head     = <origin>/cos-<previous_kernel_version>
    limit    = <origin>/upstream-<previous_kernel_version>

  For example, get_cos_specific_commits('6.1', '6.6', 'cos') looks at the
  commits on cos/cos-6.1 that are not on cos/upstream-6.1 and compares them
  against cos/upstream-6.6.

  Args:
    previous_kernel_version: The kernel version being upgraded from.
    new_kernel_version: The kernel version being upgraded to.
    origin: The name of the git remote that holds the branches.

  Returns:
    The commits produced by `_git_cherry` for those three refs.
  """
  return _git_cherry(
      upstream=f'{origin}/upstream-{new_kernel_version}',
      head=f'{origin}/cos-{previous_kernel_version}',
      limit=f'{origin}/upstream-{previous_kernel_version}',
  )


def _git_cherry(upstream: str, head: str, limit: str) -> list[Commit]:
  """Runs `git cherry -v upstream head limit` and parses its output.

  See https://git-scm.com/docs/git-cherry for more information.

  Args:
    upstream: The ref to search for equivalent changes.
    head: The ref whose commits are listed.
    limit: Commits reachable from this ref are not listed.

  Returns:
    One Commit per well-formed output line, in the order git printed them.
    `has_exact_content_match` is True for lines that git marked with '-'.
    Lines that cannot be decoded as UTF-8, or that do not contain at least a
    marker and a hash, are logged and skipped.
  """
  commits = []

  with Popen(
      ['git', 'cherry', '-v', upstream, head, limit], stdout=PIPE
  ) as git_cherry:
    for raw_line in git_cherry.stdout:
      try:
        line = raw_line.decode('utf-8').strip()
      except UnicodeDecodeError as e:
        logging.warning(e)
        continue

      # Each line is "<marker> <hash> <subject>". A commit with an empty
      # subject leaves only two fields after stripping, so the subject has to
      # be treated as optional instead of unpacking three values blindly.
      fields = line.split(maxsplit=2)
      if len(fields) < 2:
        if line:
          logging.warning(f'Skipping malformed `git cherry` line: {line!r}')
        continue
      marker, _hash = fields[0], fields[1]
      subject = fields[2] if len(fields) > 2 else ''

      commits.append(
          Commit(
              _hash=_hash,
              subject=subject,
              message=_get_commit_message(_hash),
              has_exact_content_match=marker == '-',
          )
      )

  # Leaving the `with` block waits for git, so the exit status is available
  # here. A failure (e.g. an unknown ref) would otherwise look exactly like
  # "no COS-specific commits".
  if git_cherry.returncode:
    logging.error(
        f'`git cherry -v {upstream} {head} {limit}` exited with status'
        f' {git_cherry.returncode}; the commit list may be incomplete'
    )

  return commits


def _get_commit_message(commit_hash: str) -> str:
  """Returns the raw commit message (`git log --format=%B`) of one commit.

  Args:
    commit_hash: The hash (or any other revision) of the commit to look up.

  Returns:
    The message decoded as UTF-8. Bytes that are not valid UTF-8 are replaced
    with U+FFFD instead of raising, so that a single oddly-encoded commit
    message cannot abort the whole run.
  """
  completed = run(
      ['git', 'log', '--format=%B', '-n', '1', commit_hash], stdout=PIPE
  )
  # The message is only searched for ASCII hash references, so a lossy decode
  # is sufficient here.
  return completed.stdout.decode('utf-8', errors='replace')


def get_subject_to_commit_map(
    branch: str, max_commits: Optional[int] = None
) -> dict[str, list[str]]:
  """Builds a lookup from commit subject to commit hashes for a branch.

  Reads the output of `git log --format='%H %s' <branch>` line by line.

  Args:
    branch: The branch whose history is indexed.
    max_commits: Stop after reading this many lines of `git log` output. None
      means the whole history is read. A smaller value saves a significant
      amount of time on large repositories.

  Returns:
    A dict whose keys are commit subjects and whose values are the hashes of
    every commit seen with that subject. Usually there is one hash per
    subject, but unrelated commits can share a subject.
  """
  hashes_by_subject = {}
  git_log_command = ['git', 'log', '--format=%H %s', branch]

  with Popen(git_log_command, stdout=PIPE) as git_log:
    for index, raw_line in enumerate(git_log.stdout):
      if max_commits is not None and index >= max_commits:
        break

      try:
        text = raw_line.decode('utf-8').strip()
      except Exception as e:
        logging.warning(e)
        continue

      parsed = COMMIT_SUBJECT_RE.match(text)
      if parsed is None:
        # No "<hash> <subject>" shape on this line; nothing to index.
        continue

      commit_hash, subject = parsed.groups()
      # Several commits may share one subject, so collect all of them.
      hashes_by_subject.setdefault(subject, []).append(commit_hash)

  logging.info(f'mapped {len(hashes_by_subject)} commit subjects')
  return hashes_by_subject


if __name__ == '__main__':
  # Script entry point: parse the flags, collect the COS-specific commits,
  # look up their upstream counterparts and print a TSV table to stdout.
  # Progress messages go through `logging`, so they do not end up in the table.
  logging.basicConfig(level=logging.INFO)

  parser = argparse.ArgumentParser(
      prog='get-cos-specific-kernel-commits',
      description=(
          'Finds commits which are in the COS kernel branch, but not the'
          ' upstream kernel branch'
      ),
  )
  parser.add_argument('-n', '--new-kernel-version', type=str, required=True)
  parser.add_argument(
      '-p', '--previous-kernel-version', type=str, required=True
  )
  parser.add_argument('-o', '--origin', default='cos', type=str, nargs='?')
  args = parser.parse_args()

  logging.info('Finding COS-specific commits...')
  commits = get_cos_specific_commits(
      args.previous_kernel_version, args.new_kernel_version, args.origin
  )
  logging.info(f'Done. Found {len(commits)} commits.')

  logging.info('Getting map from commit subjects to commit hashes...')
  # Only the 100,000 most recent commits of the new upstream branch are
  # indexed; hopefully we are never further behind than that.
  new_upstream = f'{args.origin}/upstream-{args.new_kernel_version}'
  subject_to_commit_map = get_subject_to_commit_map(new_upstream, 100000)
  logging.info('Done.')

  # Emit one header row, then one row per (commit, upstream hash) pair.
  sys.stdout.write('Hash\tUpstream Hash\tHas Exact Content Match\tSubject\n')
  for commit in commits:
    upstream_hashes = commit.get_upstream_commit_hashes(subject_to_commit_map)
    if len(upstream_hashes) > 1:
      logging.warning(
          f'Found multiple upstream commits for commit {commit._hash}:'
          f' {commit.subject}'
      )

    # We output the string 'None' if we didn't find any matching commits.
    for upstream_hash in upstream_hashes or ['None']:
      row = (
          commit._hash,
          upstream_hash,
          str(commit.has_exact_content_match),
          commit.subject,
      )
      sys.stdout.write('\t'.join(row) + '\n')
