blob: eb7a69d15ed493995939a2a5feafc73193e1a4aa [file] [log] [blame] [edit]
#!/usr/bin/env python3
# Copyright 2020 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Extract all the licenses from the man page comments.
Each man page may be licensed under a unique license, so we have to walk each
one and extract the details.
"""
import argparse
import os
from pathlib import Path
import re
import sys
from typing import List, Optional, Tuple, Union
# Extract the license name & text from a section like:
#   .\" %%%LICENSE_START(BSD_4_CLAUSE_UCB)
#   .\" ...text...
#   .\" %%%LICENSE_END
# Group 1 is the license name; group 2 is the raw text between the markers,
# still carrying its comment prefixes.
# The text group is lazy on purpose: with DOTALL a greedy group would run from
# the first START marker to the *last* END marker, folding every license
# section of a page into a single match.
EXTRACT_LICENSE = re.compile(
    r'^[^\n]*%%%LICENSE_START\(([^)]+)\)\n(.*?)%%%LICENSE_END$',
    flags=re.MULTILINE | re.DOTALL)
# Licenses that require copyright attribution.
ATTRIBUTION_LICENSES = {
    'BSD_3_CLAUSE_UCB', 'BSD_4_CLAUSE_UCB', 'BSD_ONELINE_CDROM',
    'MISC', 'MIT', 'PERMISSIVE_MISC',
    'VERBATIM', 'VERBATIM_ONE_PARA', 'VERBATIM_PROF', 'VERBATIM_TWO_PARA',
}

# Every license name this script recognizes: the attribution set above plus
# the names listed here.  A name outside this set is reported as an error so
# that someone has to evaluate the new license.
KNOWN_LICENSES = ATTRIBUTION_LICENSES.union((
    'FREELY_REDISTRIBUTABLE',
    'GPL_NOVERSION_ONELINE',
    'GPLv2+',
    'GPLv2+_DOC_FULL',
    'GPLv2+_DOC_MISC',
    'GPLv2+_DOC_ONEPARA',
    'GPLv2+_SW_3_PARA',
    'GPLv2+_SW_ONEPARA',
    'GPLv2_MISC',
    'GPLv2_ONELINE',
    'LDPv1',
    'PUBLIC_DOMAIN',
))
def line_iscomment(line: str) -> bool:
    """Whether |line| is a roff comment.

    Args:
        line: One line of man page source.

    Returns:
        True if the line begins with a comment marker (optionally preceded by
        a dot or a single quote) or consists of a lone dot; False otherwise.
    """
    # A variety of possible formats here.  This should get cleaned up in newer
    # versions, but we have to deal with it in current releases.
    #   \" ...
    #   .\" ...
    #   '\" ...
    #   .
    # The pattern holds both quote characters, so it is spelled as a triple
    # single quoted raw string.
    # pylint: disable=invalid-triple-quote
    # re.match() hands back a Match object or None; convert so callers really
    # get the bool the signature promises.
    return bool(re.match(r'''^(\.$|[.']?\\")''', line))
def extract_license(page: Path) -> Union[None, Tuple[str, str]]:
    """Extract the license from |page|.

    Args:
        page: Path of the man page source to read (decoded as UTF-8).

    Returns:
        None when the page is a stub (its content starts with ".so ") or when
        none of its licenses is in ATTRIBUTION_LICENSES.  Otherwise a
        (license_text, copyright_text) tuple for the first license in the page
        that requires attribution.

    Raises:
        AssertionError: If the page has no license section, names a license
            missing from KNOWN_LICENSES, or has no comment header in front of
            an attribution license.
    """
    with open(page, encoding='utf-8') as fp:
        data = fp.read()

    # Ignore stub pointer files.
    if data.startswith('.so '):
        return None

    # These checks validate input files, so they are raised explicitly rather
    # than written as `assert` statements (which vanish under `python -O`).
    # AssertionError is kept so the exception type seen by callers is the same.
    matches = list(EXTRACT_LICENSE.finditer(data))
    if not matches:
        raise AssertionError(f'{page}: unable to find licenses')

    for match in matches:
        name = match.group(1)
        if name not in KNOWN_LICENSES:
            raise AssertionError(
                f'{page}: {name}: unknown license; please update script')
        if name not in ATTRIBUTION_LICENSES:
            continue

        # The copyright notice is the comment block right before the license.
        # Walk backwards from the license until we:
        # (1) Hit the start of the file.
        # (2) Hit a non-comment line (as all copyrights are comments).
        # (3) Hit the previous license.
        header = data[0:match.start(0)]
        copyright_lines = []
        for line in reversed(header.splitlines()):
            if not line_iscomment(line) or '%%%LICENSE_' in line:
                break
            # Drop the 3-character comment prefix (e.g. `.\" `).
            copyright_lines.append(line[3:].strip())
        if not copyright_lines:
            raise AssertionError(f'{page}: invalid header:\n{header}')
        # Collected bottom-up; restore file order.
        copyright_lines.reverse()
        # Trim a weird leading line pending upstream cleanup.
        if copyright_lines[0] == 't':
            copyright_lines.pop(0)
        copyright_text = '\n'.join(copyright_lines).strip()

        # Format the license text: strip the comment prefix from every line,
        # merge consecutive non-empty lines into one paragraph, and separate
        # paragraphs (split on runs of empty lines) with a blank line.
        paragraphs = []
        current = []
        for raw_line in match.group(2).strip().splitlines():
            text = raw_line[3:].strip()
            if text:
                current.append(text)
            elif current:
                paragraphs.append(' '.join(current))
                current = []
        if current:
            paragraphs.append(' '.join(current))
        license_text = '\n\n'.join(paragraphs)

        return (license_text, copyright_text)

    # No licenses were found that required attribution.
    return None
def get_parser() -> argparse.ArgumentParser:
    """Build the command line parser for this script."""
    cli = argparse.ArgumentParser(description=__doc__)

    # Where the combined notice goes.
    cli.add_argument(
        '-o',
        '--output',
        help='File to write combined license to.',
        type=Path,
    )

    # Tree to scan; falls back to the S environment variable when it is set.
    cli.add_argument(
        '-d',
        '--srcdir',
        help='Source dir to walk (e.g. $S)',
        default=os.getenv('S'),
        type=Path,
    )

    # Explicit list of pages, as an alternative to scanning a tree.
    cli.add_argument(
        'files',
        help='Source files (overrides --srcdir)',
        default=[],
        nargs='*',
    )

    return cli
def main(argv: Optional[List[str]] = None) -> Optional[int]:
    """The main entry point for scripts.

    Collects the attribution licenses of the selected man pages and writes one
    combined notice to --output, or to stdout when no output file is given.

    Args:
        argv: Command line arguments, passed straight to argparse.

    Returns:
        None.  Invalid option combinations are reported via parser.error().
    """
    parser = get_parser()
    opts = parser.parse_args(argv)
    if not opts.srcdir and not opts.files:
        parser.error('--srcdir is required')
    elif opts.srcdir and opts.files:
        parser.error('--srcdir and files are mutually exclusive')
    elif opts.srcdir:
        if not opts.srcdir.is_dir():
            parser.error(f'{opts.srcdir}: --srcdir is missing or is not a dir')
        # glob() yields entries in no particular order; sort them so the page
        # order inside each license group is the same on every run.
        files = sorted(opts.srcdir.glob('man[0-9]/*.[0-9]'))
    else:
        files = [Path(x) for x in opts.files]

    # Merge pages with same licenses into one to avoid duplication.
    licenses = {}
    for file in files:
        result = extract_license(file)
        if result:
            license_text, copyright_text = result
            licenses.setdefault(license_text, []).append(
                (file.name, copyright_text))

    # Then produce the final notice lines.  The banner goes *between* license
    # groups only, so nothing has to be trimmed afterwards and an empty result
    # (no page needing attribution) is handled without error.
    lines = []
    for license_text, pages in sorted(licenses.items()):
        if lines:
            lines.append('-' * 80)
        for page, copyright_text in pages:
            lines += [page, copyright_text]
        lines += ['', license_text]

    # Then write it all out.
    data = '\n'.join(lines).strip() + '\n'
    if opts.output:
        with open(opts.output, 'w', encoding='utf-8') as fp:
            fp.write(data)
    else:
        print(data, end='')
if __name__ == '__main__':
    # Run as a script: main()'s return value becomes the process exit status.
    raise SystemExit(main(sys.argv[1:]))