sys-apps/man-pages/files/extract-licenses.py - third_party/overlays/chromiumos-overlay - Git at Google

 #!/usr/bin/env python3
 # Copyright 2020 The ChromiumOS Authors
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """Extract all the licenses from the man page comments.

 Each man page may be licensed under a unique license, so we have to walk each
 one and extract the details.
 """

 import argparse
 import os
 from pathlib import Path
 import re
 import sys
 from typing import List, Optional, Tuple, Union


 # Extract the license name & text from a section like:
 # .\" %%%LICENSE_START(BSD_4_CLAUSE_UCB)
 # .\" ...text...
 # .\" %%%LICENSE_END
 # Group 1 is the license name.  Group 2 is everything between the START line
 # and the END marker (so it ends with the comment prefix of the END line).
 # The body is matched lazily so that a page carrying several license sections
 # yields one match per section instead of a single match that swallows
 # everything from the first START to the last END.
 EXTRACT_LICENSE = re.compile(
     r'^[^\n]*%%%LICENSE_START\(([^)]+)\)\n(.*?)%%%LICENSE_END$',
     flags=re.MULTILINE | re.DOTALL)


 # License tags whose terms require us to reproduce the copyright notice.
 ATTRIBUTION_LICENSES = {
     # BSD family.
     'BSD_3_CLAUSE_UCB', 'BSD_4_CLAUSE_UCB', 'BSD_ONELINE_CDROM',
     # Assorted permissive terms.
     'MISC', 'MIT', 'PERMISSIVE_MISC',
     # "Verbatim copying" variants.
     'VERBATIM', 'VERBATIM_ONE_PARA', 'VERBATIM_PROF', 'VERBATIM_TWO_PARA',
 }

 # Every license tag this script recognizes: the attribution set above plus
 # the tags listed here.  Keeping an explicit list lets the script error out
 # on a tag it has never seen, forcing someone to evaluate the new license.
 KNOWN_LICENSES = {
     *ATTRIBUTION_LICENSES,
     'FREELY_REDISTRIBUTABLE',
     'GPL_NOVERSION_ONELINE',
     'GPLv2+',
     'GPLv2+_DOC_FULL',
     'GPLv2+_DOC_MISC',
     'GPLv2+_DOC_ONEPARA',
     'GPLv2+_SW_3_PARA',
     'GPLv2+_SW_ONEPARA',
     'GPLv2_MISC',
     'GPLv2_ONELINE',
     'LDPv1',
     'PUBLIC_DOMAIN',
 }


 def line_iscomment(line: str) -> bool:
     """Whether |line| is a roff comment.

     Args:
       line: A single line of man page source.

     Returns:
       True if |line| starts with one of the comment markers listed below or
       is a lone "." line, False otherwise.
     """
     # A variety of possible formats here.  This should get cleaned up in newer
     # versions, but we have to deal with it in current releases.
     # \" ...
     # .\" ...
     # '\" ...
     # .
     # We can't use triple double quotes here because it'll be invalid syntax,
     # so we're forced to use triple single quotes instead.
     # pylint: disable=invalid-triple-quote
     # re.match() hands back a Match object or None, so compare against None
     # to return the bool that the signature promises.
     return re.match(r'''^(\.$|[.']?\\")''', line) is not None


 def extract_license(page: Path) -> Union[None, Tuple[str, str]]:
     """Pull the attribution-requiring license out of the man page |page|.

     Args:
       page: Path to the man page source to scan.

     Returns:
       None when |page| is a ".so" stub or when none of its licenses are in
       ATTRIBUTION_LICENSES.  Otherwise a (license text, copyright text) tuple
       for the first license that is.

     Raises:
       AssertionError: |page| has no license markers, names a license that is
         not in KNOWN_LICENSES, or has no comment lines ahead of the license.
     """
     with open(page, encoding='utf-8') as fp:
         data = fp.read()

     # Stub pages only point at another page; there is nothing to extract.
     if data.startswith('.so '):
         return None

     # Every other page must carry at least one tagged license section.
     sections = list(EXTRACT_LICENSE.finditer(data))
     assert sections, f'{page}: unable to find licenses'

     for section in sections:
         name = section.group(1)
         assert name in KNOWN_LICENSES, (
             f'{page}: {name}: unknown license; please update script')

         # Only licenses that demand attribution produce a result.
         if name not in ATTRIBUTION_LICENSES:
             continue

         # The copyright notice is the run of comment lines directly above the
         # license.  Scan upwards from the license until the file starts, a
         # non-comment line shows up, or an earlier license marker is reached.
         header = data[:section.start(0)]
         notice = []
         for raw in reversed(header.splitlines()):
             if '%%%LICENSE_' in raw or not line_iscomment(raw):
                 break
             # Drop the first three characters (the comment marker).
             notice.append(raw[3:].strip())
         # Lines were gathered bottom-up; put them back in file order.
         notice.reverse()
         assert notice, f'{page}: invalid header:\n{header}'
         # Trim a weird leading line pending upstream cleanup.
         if notice[0] == 't':
             del notice[0]
         copyright_text = '\n'.join(notice).strip()

         # Strip the comment markers from the license body, then reflow it:
         # consecutive lines are joined with a space, and an empty line ends
         # the current paragraph.
         body = [x[3:].strip()
                 for x in section.group(2).strip().splitlines()]
         license_text = ''
         for index, text in enumerate(body):
             if text or index == 0:
                 license_text += text + ' '
             else:
                 license_text = license_text.strip() + '\n\n'

         return (license_text.strip(), copyright_text)

     # No licenses were found that required attribution.
     return None


 def get_parser() -> argparse.ArgumentParser:
     """Build the command line parser for this script.

     Returns:
       A parser that accepts -o/--output, -d/--srcdir, and positional files.
     """
     cli = argparse.ArgumentParser(description=__doc__)
     cli.add_argument(
         '-o', '--output',
         type=Path,
         help='File to write combined license to.',
     )
     # Fall back to $S from the environment when --srcdir is not given.
     default_srcdir = os.environ.get('S')
     cli.add_argument(
         '-d', '--srcdir',
         type=Path,
         default=default_srcdir,
         help='Source dir to walk (e.g. $S)',
     )
     cli.add_argument(
         'files',
         nargs='*',
         default=[],
         help='Source files (overrides --srcdir)',
     )
     return cli


 def main(argv: Optional[List[str]] = None) -> Optional[int]:
     """The main entry point for scripts.

     Collects the attribution-requiring license of every man page, groups pages
     that share identical license text, and writes the combined notice to
     --output (or stdout when no output file is given).

     Args:
       argv: The command line arguments to parse.

     Returns:
       None; the function falls off the end once the notice has been written.
     """
     parser = get_parser()
     opts = parser.parse_args(argv)

     if not opts.srcdir and not opts.files:
         parser.error('--srcdir is required')
     elif opts.srcdir and opts.files:
         parser.error('--srcdir and files are mutually exclusive')
     elif opts.srcdir:
         if not opts.srcdir.is_dir():
             parser.error(f'{opts.srcdir}: --srcdir is missing or is not a dir')

         # glob() yields entries in whatever order the filesystem returns them,
         # so sort to keep the generated notice reproducible between runs.
         files = sorted(opts.srcdir.glob('man[0-9]/*.[0-9]'))
     else:
         files = [Path(x) for x in opts.files]

     # Merge pages with same licenses into one to avoid duplication.
     licenses = {}
     for file in files:
         result = extract_license(file)
         if result:
             license_text, copyright_text = result
             licenses.setdefault(license_text, []).append(
                 (file.name, copyright_text))

     # Then produce one notice section per unique license: every page using it
     # (name followed by its copyright text), a blank line, the license text.
     sections = []
     for license_text, pages in sorted(licenses.items()):
         lines = []
         for page, copyright_text in pages:
             lines += [page, copyright_text]
         lines += ['', license_text]
         sections.append('\n'.join(lines))

     # Put a banner line between sections only.  Joining (instead of appending
     # a banner after each section and popping the last one) also copes with
     # the case where no page required attribution and there are no sections.
     banner = '\n' + '-' * 80 + '\n'

     # Then write it all out.
     data = banner.join(sections).strip() + '\n'
     if opts.output:
         with open(opts.output, 'w', encoding='utf-8') as fp:
             fp.write(data)
     else:
         print(data, end='')


 # Only run when executed as a script (not when imported).  The value returned
 # by main() is handed to sys.exit() as the process exit status.
 if __name__ == '__main__':
     sys.exit(main(sys.argv[1:]))
	#!/usr/bin/env python3
	# Copyright 2020 The ChromiumOS Authors
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""Extract all the licenses from the man page comments.

	Each man page may be licensed under a unique license, so we have to walk each
	one and extract the details.
	"""

	import argparse
	import os
	from pathlib import Path
	import re
	import sys
	from typing import List, Optional, Tuple, Union


	# Extract the license name & text from a section like:
	# .\" %%%LICENSE_START(BSD_4_CLAUSE_UCB)
	# .\" ...text...
	# .\" %%%LICENSE_END
	# Group 1 is the license name.  Group 2 is everything between the START line
	# and the END marker (so it ends with the comment prefix of the END line).
	# The body is matched lazily so that a page carrying several license sections
	# yields one match per section instead of a single match that swallows
	# everything from the first START to the last END.
	EXTRACT_LICENSE = re.compile(
	    r'^[^\n]*%%%LICENSE_START\(([^)]+)\)\n(.*?)%%%LICENSE_END$',
	    flags=re.MULTILINE | re.DOTALL)


	# Licenses that require copyright attribution.
	ATTRIBUTION_LICENSES = {
	    'BSD_3_CLAUSE_UCB',
	    'BSD_4_CLAUSE_UCB',
	    'BSD_ONELINE_CDROM',
	    'MISC',
	    'MIT',
	    'PERMISSIVE_MISC',
	    'VERBATIM',
	    'VERBATIM_ONE_PARA',
	    'VERBATIM_PROF',
	    'VERBATIM_TWO_PARA',
	}

	# All licenses we know about.  If a new one shows up, we'll throw an error so
	# we're forced to evaluate it.  This is the attribution set above plus the
	# tags that need no copyright attribution.
	KNOWN_LICENSES = ATTRIBUTION_LICENSES | {
	    'FREELY_REDISTRIBUTABLE',
	    'GPL_NOVERSION_ONELINE',
	    'GPLv2+',
	    'GPLv2+_DOC_FULL',
	    'GPLv2+_DOC_MISC',
	    'GPLv2+_DOC_ONEPARA',
	    'GPLv2_MISC',
	    'GPLv2_ONELINE',
	    'GPLv2+_SW_3_PARA',
	    'GPLv2+_SW_ONEPARA',
	    'LDPv1',
	    'PUBLIC_DOMAIN',
	}


	def line_iscomment(line: str) -> bool:
	    """Whether |line| is a roff comment.

	    Args:
	      line: A single line of man page source.

	    Returns:
	      True if |line| starts with one of the comment markers listed below or
	      is a lone "." line, False otherwise.
	    """
	    # A variety of possible formats here.  This should get cleaned up in newer
	    # versions, but we have to deal with it in current releases.
	    # \" ...
	    # .\" ...
	    # '\" ...
	    # .
	    # We can't use triple double quotes here because it'll be invalid syntax,
	    # so we're forced to use triple single quotes instead.
	    # pylint: disable=invalid-triple-quote
	    # The "|" must be a bare alternation (an escaped one would only match a
	    # literal pipe).  re.match() hands back a Match object or None, so compare
	    # against None to return the bool that the signature promises.
	    return re.match(r'''^(\.$|[.']?\\")''', line) is not None


	def extract_license(page: Path) -> Union[None, Tuple[str, str]]:
	    """Extract the license from |page|.

	    Args:
	      page: Path to the man page source to scan.

	    Returns:
	      None when |page| is a ".so" stub or when none of its licenses are in
	      ATTRIBUTION_LICENSES.  Otherwise a (license text, copyright text) tuple
	      for the first license that is.

	    Raises:
	      AssertionError: |page| has no license markers, names a license that is
	        not in KNOWN_LICENSES, or has no comment lines ahead of the license.
	    """
	    with open(page, encoding='utf-8') as fp:
	        data = fp.read()

	    # Ignore stub pointer files.
	    if data.startswith('.so '):
	        return None

	    # Find the name of the license to do high level checks.
	    matches = list(EXTRACT_LICENSE.finditer(data))
	    assert matches, f'{page}: unable to find licenses'
	    for match in matches:
	        name = match.group(1)
	        assert name in KNOWN_LICENSES, (
	            f'{page}: {name}: unknown license; please update script')

	        # Only licenses that demand attribution produce a result.
	        if name not in ATTRIBUTION_LICENSES:
	            continue

	        # We'll yield the entire preceding header to the license.
	        # Walk backwards to collect copyrights until we:
	        # (1) Hit the start of the file.
	        # (2) Hit a non-comment line (as all copyrights are comments).
	        # (3) Hit the previous license.
	        header = data[0:match.start(0)]
	        lines = []
	        for line in reversed(header.splitlines()):
	            if not line_iscomment(line) or '%%%LICENSE_' in line:
	                break
	            # Drop the first three characters (the comment marker).
	            lines.append(line[3:].strip())
	        # Lines were gathered bottom-up; put them back in file order.
	        lines.reverse()
	        assert lines, f'{page}: invalid header:\n{header}'
	        # Trim a weird leading line pending upstream cleanup.
	        if lines[0] == 't':
	            lines.pop(0)
	        copyright_text = '\n'.join(lines).strip()

	        # Format the license text.
	        # First remove the different comment styles.
	        lines = [x[3:].strip()
	                 for x in match.group(2).strip().splitlines()]
	        # Then merge sentences: consecutive lines are joined with a space, and
	        # an empty line ends the current paragraph.
	        license_text = ''
	        for i, line in enumerate(lines):
	            if line or i == 0:
	                license_text += line + ' '
	            else:
	                license_text = license_text.strip() + '\n\n'

	        return (license_text.strip(), copyright_text)

	    # No licenses were found that required attribution.
	    return None


	def get_parser() -> argparse.ArgumentParser:
	    """Get CLI parser.

	    Returns:
	      A parser that accepts -o/--output, -d/--srcdir, and positional files.
	    """
	    parser = argparse.ArgumentParser(description=__doc__)
	    parser.add_argument('-o', '--output', type=Path,
	                        help='File to write combined license to.')
	    # Fall back to $S from the environment when --srcdir is not given.
	    parser.add_argument('-d', '--srcdir', type=Path,
	                        default=os.environ.get('S'),
	                        help='Source dir to walk (e.g. $S)')
	    parser.add_argument('files', nargs='*', default=[],
	                        help='Source files (overrides --srcdir)')
	    return parser


	def main(argv: Optional[List[str]] = None) -> Optional[int]:
	    """The main entry point for scripts.

	    Collects the attribution-requiring license of every man page, groups pages
	    that share identical license text, and writes the combined notice to
	    --output (or stdout when no output file is given).

	    Args:
	      argv: The command line arguments to parse.

	    Returns:
	      None; the function falls off the end once the notice has been written.
	    """
	    parser = get_parser()
	    opts = parser.parse_args(argv)

	    if not opts.srcdir and not opts.files:
	        parser.error('--srcdir is required')
	    elif opts.srcdir and opts.files:
	        parser.error('--srcdir and files are mutually exclusive')
	    elif opts.srcdir:
	        if not opts.srcdir.is_dir():
	            parser.error(f'{opts.srcdir}: --srcdir is missing or is not a dir')

	        # glob() yields entries in whatever order the filesystem returns them,
	        # so sort to keep the generated notice reproducible between runs.
	        files = sorted(opts.srcdir.glob('man[0-9]/*.[0-9]'))
	    else:
	        files = [Path(x) for x in opts.files]

	    # Merge pages with same licenses into one to avoid duplication.
	    licenses = {}
	    for file in files:
	        result = extract_license(file)
	        if result:
	            license_text, copyright_text = result
	            licenses.setdefault(license_text, []).append(
	                (file.name, copyright_text))

	    # Then produce one notice section per unique license: every page using it
	    # (name followed by its copyright text), a blank line, the license text.
	    sections = []
	    for license_text, pages in sorted(licenses.items()):
	        lines = []
	        for page, copyright_text in pages:
	            lines += [page, copyright_text]
	        lines += ['', license_text]
	        sections.append('\n'.join(lines))

	    # Put a banner line between sections only.  Joining (instead of appending
	    # a banner after each section and popping the last one) also copes with
	    # the case where no page required attribution and there are no sections.
	    banner = '\n' + '-' * 80 + '\n'

	    # Then write it all out.
	    data = banner.join(sections).strip() + '\n'
	    if opts.output:
	        with open(opts.output, 'w', encoding='utf-8') as fp:
	            fp.write(data)
	    else:
	        print(data, end='')


	# Only run when executed as a script (not when imported).  The value returned
	# by main() is handed to sys.exit() as the process exit status.
	if __name__ == '__main__':
	    sys.exit(main(sys.argv[1:]))