Add license text scanning to install hook for SBOM

gen-package-licenses.sh pre-processes package
licenses so we can depend on license.yaml generated
by the script for license names.

Any license not listed in the SPDX spec needs to have
its license text scanned. Although license.yaml contains
some scanned license text, it doesn't always have
what we need, so we have to scan the source code
ourselves.

This CL adds functions to SBOM generation
for correcting license names for the SPDX spec
and for scanning license text. It also updates the code
for fetching go dependencies to make use of the package
source code fetched by gen-package-licenses.sh.

The license file of package lsof was added to the source
code on Jan 13 2023, so we need to treat it as a special
case until we update it to a newer version.

netcat is also a special case: its license is a single
paragraph inside its README. See its license text in
license_data.py.

BUG=b/254334533
TEST=presubmit
RELEASE_NOTE=None

Change-Id: I6f860766b43f2738bc933b82416086bc3ab9277f
Reviewed-on: https://cos-review.googlesource.com/c/third_party/platform/crosutils/+/41682
Reviewed-by: Robert Kolchmeyer <rkolchmeyer@google.com>
Tested-by: Cusky Presubmit Bot <presubmit@cos-infra-prod.iam.gserviceaccount.com>
Reviewed-by: Vaibhav Rustagi <vaibhavrustagi@google.com>
diff --git a/hooks/install/gen-sbom-package-info.py b/hooks/install/gen-sbom-package-info.py
index 29bdecd..4b4c6a3 100755
--- a/hooks/install/gen-sbom-package-info.py
+++ b/hooks/install/gen-sbom-package-info.py
@@ -15,32 +15,43 @@
 # This script is used to automatically generate package
 # information for SBOM of COS image bundled dependencies.
 
+import json
 import os
 import sys
 from sbom_info_lib import download_url
 from sbom_info_lib import go_dep
 from sbom_info_lib import licenses
 from chromite.lib import osutils
+from chromite.lib import portage_util
 
-SBOM_INFO_FILE_NAME = "sbom-pkg-info"
+
+SBOM_INFO_FILE_NAME = "sbom-pkg-info.json"
 
 
 class SbomPackageInfo:
     def __init__(self):
         self.download_url = ""
         self.licenses = ""
-        self.go_dep = ""
+        self.go_dep = []
+        # Format: [{
+        #     "license_name": <license_name>,
+        #     "file_names": [file_name],
+        #     "license_txt": <license_txt>
+        # },{......}]
+        self.other_license_list = []
         self.err = ""
 
     def write_to_build_info(self, build_info_dir):
-        content = (
-            f"download-url:{self.download_url}\n"
-            + f"licenses:{self.licenses}\n"
-            + f"go-dep:{self.go_dep}\n"
-            + f"err:{self.err}\n"
-        )
+        content = {
+            "download-url": self.download_url,
+            "licenses": self.licenses,
+            "go-dep": self.go_dep,
+            "other_licenses": self.other_license_list,
+            "err": self.err,
+        }
+        json_content = json.dumps(content, indent=4)
         osutils.WriteFile(
-            f"{build_info_dir}/{SBOM_INFO_FILE_NAME}", content, makedirs=True
+            f"{build_info_dir}/{SBOM_INFO_FILE_NAME}", json_content, makedirs=True
         )
 
 
@@ -64,9 +75,22 @@
     return repository, category, pf, license
 
 
+def get_src_path(sysroot, fullname):
+    # Package source code has been fetched by gen-package-licenses.sh.
+    tmpdir = portage_util.PortageqEnvvar("PORTAGE_TMPDIR", sysroot=sysroot)
+    src_dir = os.path.join(tmpdir, "portage", fullname, "work")
+
+    if not os.path.exists(src_dir):
+        raise AssertionError(
+            "Unpack of %s didn't create %s. Version mismatch" % (fullname, src_dir)
+        )
+    return src_dir
+
+
 def main():
     sbom_pkg_info = SbomPackageInfo()
     package_dir = os.getenv("PORTAGE_BUILDDIR")
+    sysroot = "/".join(package_dir.split("/")[:3])
     build_info_dir = os.path.join(package_dir, "build-info")
     private = False
     try:
@@ -80,10 +104,12 @@
         sbom_pkg_info.download_url = download_url.get_download_url(
             ebuild, repository, category, pf, license
         )
-        sbom_pkg_info.licenses = licenses.get_licenses(build_info_dir)
-        sbom_pkg_info.go_dep = go_dep.get_go_dep(
-            sbom_pkg_info.download_url, build_info_dir
-        )
+        src_path = get_src_path(sysroot, os.path.join(category, pf))
+        (
+            sbom_pkg_info.licenses,
+            sbom_pkg_info.other_license_list,
+        ) = licenses.get_licenses(build_info_dir, src_path, package_name)
+        sbom_pkg_info.go_dep = go_dep.get_go_dep(src_path)
         if not sbom_pkg_info.download_url:
             raise SBOMPkgInfoError(f"download url not found")
         if not sbom_pkg_info.licenses:
diff --git a/hooks/install/sbom_info_lib/download_url.py b/hooks/install/sbom_info_lib/download_url.py
index e29c04d..9b42dfb 100644
--- a/hooks/install/sbom_info_lib/download_url.py
+++ b/hooks/install/sbom_info_lib/download_url.py
@@ -332,7 +332,7 @@
     return ""
 
 
-def search_go_source(category):
+def search_go_source():
     res = []
     go_src = parse_var_from_env(GO_SOURCE)
     for src in go_src:
@@ -362,7 +362,7 @@
     return ""
 
 
-def search_download_location(gcs_names, category, cros_uri):
+def search_download_location(gcs_names, cros_uri):
     res = search_mirror_gcs(gcs_names)
     if res:
         return res
@@ -371,7 +371,7 @@
         return res
     if cros_uri:
         return cros_uri
-    res = search_go_source(category)
+    res = search_go_source()
     if res:
         return res
     res = search_homepage()
@@ -397,6 +397,6 @@
     env_set, cros_uri, gcs_names_ebuild = parse_ebuild(ebuild)
     gcs_names.update(gcs_names_ebuild)
     gcs_names.discard("")
-    res = search_download_location(gcs_names, category, cros_uri)
+    res = search_download_location(gcs_names, cros_uri)
     unset_env(env_set)
     return res
diff --git a/hooks/install/sbom_info_lib/go_dep.py b/hooks/install/sbom_info_lib/go_dep.py
index 79429a1..805eb3f 100644
--- a/hooks/install/sbom_info_lib/go_dep.py
+++ b/hooks/install/sbom_info_lib/go_dep.py
@@ -16,9 +16,7 @@
 
 import os
 import re
-import subprocess
-import tarfile
-import requests
+from chromite.lib import cros_build_lib
 
 # REGEX_GO_MOD_DEP finds
 # `require (
@@ -29,48 +27,23 @@
 GO_MOD_DEP_FILE = ["go.mod", "vendor.mod", "vendor.conf"]
 
 
-def download_src_code(url, build_info_dir):
-    filepath = os.path.join(build_info_dir, os.path.basename(url))
-    if url.startswith("gs://"):
-        subprocess.run(["gsutil", "cp", url, filepath])
-    else:
-        if not url.startswith("http"):
-            return ""
-        if url.startswith("https://github.com"):
-            url = f'{url.replace("@","/archive/")}.tar.gz'
-        else:
-            url = f'{url.replace("@","/+archive/").replace("#","/")}.tar.gz'
-        response = requests.get(url)
-        open(filepath, "wb").write(response.content)
-    return filepath
-
-
-def get_go_dep(download_url, build_info_dir):
+def get_go_dep(src_path):
     res = set()
-    if not download_url:
-        return res
-    for url in download_url.split(","):
-        if url.endswith(".gn"):
+    args = ["find", src_path, "-type", "f"]
+    result = cros_build_lib.run(args, stdout=True, encoding="utf-8")
+    files = result.stdout.splitlines()
+    for name in files:
+        basename = os.path.basename(name)
+        if basename not in GO_MOD_DEP_FILE:
             continue
-        filepath = download_src_code(url, build_info_dir)
-        if not filepath:
-            return res
-        try:
-            t = tarfile.open(filepath, "r:gz")
-            for filename in t.getnames():
-                if os.path.basename(filename) not in GO_MOD_DEP_FILE:
-                    continue
-                f = t.extractfile(filename)
-                content = f.read()
-                match = re.findall(REGEX_GO_MOD_DEP, content.decode("utf-8"), re.DOTALL)
-                for req in match:
-                    deps = req.strip().split("\n")
-                    for dep in deps:
-                        # remove comments.
-                        dep = dep.split("//")[0].strip()
-                        if dep:
-                            res.add(dep)
-        except:
-            pass
-        os.remove(filepath)
-    return ",".join(res)
+        f = open(name, "r")
+        content = f.read()
+        match = re.findall(REGEX_GO_MOD_DEP, content, re.DOTALL)
+        for req in match:
+            deps = req.strip().split("\n")
+            for dep in deps:
+                # remove comments.
+                dep = dep.split("//")[0].strip()
+                if dep:
+                    res.add(dep)
+    return list(res)
diff --git a/hooks/install/sbom_info_lib/license_data.py b/hooks/install/sbom_info_lib/license_data.py
new file mode 100644
index 0000000..d4a4a86
--- /dev/null
+++ b/hooks/install/sbom_info_lib/license_data.py
@@ -0,0 +1,543 @@
+# Copyright 2023 Google LLC
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# version 2 as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+
+SPDX_LICENSES = {
+    "0BSD",
+    "AAL",
+    "Abstyles",
+    "Adobe-2006",
+    "Adobe-Glyph",
+    "ADSL",
+    "AFL-1.1",
+    "AFL-1.2",
+    "AFL-2.0",
+    "AFL-2.1",
+    "AFL-3.0",
+    "Afmparse",
+    "AGPL-1.0-only",
+    "AGPL-1.0-or-later",
+    "AGPL-3.0-only",
+    "AGPL-3.0-or-later",
+    "Aladdin",
+    "AMDPLPA",
+    "AML",
+    "AMPAS",
+    "ANTLR-PD",
+    "ANTLR-PD-fallback",
+    "Apache-1.0",
+    "Apache-1.1",
+    "Apache-2.0",
+    "APAFML",
+    "APL-1.0",
+    "App-s2p",
+    "APSL-1.0",
+    "APSL-1.1",
+    "APSL-1.2",
+    "APSL-2.0",
+    "Arphic-1999",
+    "Artistic-1.0",
+    "Artistic-1.0-cl8",
+    "Artistic-1.0-Perl",
+    "Artistic-2.0",
+    "Baekmuk",
+    "Bahyph",
+    "Barr",
+    "Beerware",
+    "Bitstream-Vera",
+    "BitTorrent-1.0",
+    "BitTorrent-1.1",
+    "blessing",
+    "BlueOak-1.0.0",
+    "Borceux",
+    "BSD-1-Clause",
+    "BSD-2-Clause",
+    "BSD-2-Clause-Patent",
+    "BSD-2-Clause-Views",
+    "BSD-3-Clause",
+    "BSD-3-Clause-Attribution",
+    "BSD-3-Clause-Clear",
+    "BSD-3-Clause-LBNL",
+    "BSD-3-Clause-Modification",
+    "BSD-3-Clause-No-Military-License",
+    "BSD-3-Clause-No-Nuclear-License",
+    "BSD-3-Clause-No-Nuclear-License-2014",
+    "BSD-3-Clause-No-Nuclear-Warranty",
+    "BSD-3-Clause-Open-MPI",
+    "BSD-4-Clause",
+    "BSD-4-Clause-Shortened",
+    "BSD-4-Clause-UC",
+    "BSD-Protection",
+    "BSD-Source-Code",
+    "BSL-1.0",
+    "BUSL-1.1",
+    "bzip2-1.0.6",
+    "C-UDA-1.0",
+    "CAL-1.0",
+    "CAL-1.0-Combined-Work-Exception",
+    "Caldera",
+    "CATOSL-1.1",
+    "CC-BY-1.0",
+    "CC-BY-2.0",
+    "CC-BY-2.5",
+    "CC-BY-2.5-AU",
+    "CC-BY-3.0",
+    "CC-BY-3.0-AT",
+    "CC-BY-3.0-DE",
+    "CC-BY-3.0-IGO",
+    "CC-BY-3.0-NL",
+    "CC-BY-3.0-US",
+    "CC-BY-4.0",
+    "CC-BY-NC-1.0",
+    "CC-BY-NC-2.0",
+    "CC-BY-NC-2.5",
+    "CC-BY-NC-3.0",
+    "CC-BY-NC-3.0-DE",
+    "CC-BY-NC-4.0",
+    "CC-BY-NC-ND-1.0",
+    "CC-BY-NC-ND-2.0",
+    "CC-BY-NC-ND-2.5",
+    "CC-BY-NC-ND-3.0",
+    "CC-BY-NC-ND-3.0-DE",
+    "CC-BY-NC-ND-3.0-IGO",
+    "CC-BY-NC-ND-4.0",
+    "CC-BY-NC-SA-1.0",
+    "CC-BY-NC-SA-2.0",
+    "CC-BY-NC-SA-2.0-FR",
+    "CC-BY-NC-SA-2.0-UK",
+    "CC-BY-NC-SA-2.5",
+    "CC-BY-NC-SA-3.0",
+    "CC-BY-NC-SA-3.0-DE",
+    "CC-BY-NC-SA-3.0-IGO",
+    "CC-BY-NC-SA-4.0",
+    "CC-BY-ND-1.0",
+    "CC-BY-ND-2.0",
+    "CC-BY-ND-2.5",
+    "CC-BY-ND-3.0",
+    "CC-BY-ND-3.0-DE",
+    "CC-BY-ND-4.0",
+    "CC-BY-SA-1.0",
+    "CC-BY-SA-2.0",
+    "CC-BY-SA-2.0-UK",
+    "CC-BY-SA-2.1-JP",
+    "CC-BY-SA-2.5",
+    "CC-BY-SA-3.0",
+    "CC-BY-SA-3.0-AT",
+    "CC-BY-SA-3.0-DE",
+    "CC-BY-SA-4.0",
+    "CC-PDDC",
+    "CC0-1.0",
+    "CDDL-1.0",
+    "CDDL-1.1",
+    "CDL-1.0",
+    "CDLA-Permissive-1.0",
+    "CDLA-Permissive-2.0",
+    "CDLA-Sharing-1.0",
+    "CECILL-1.0",
+    "CECILL-1.1",
+    "CECILL-2.0",
+    "CECILL-2.1",
+    "CECILL-B",
+    "CECILL-C",
+    "CERN-OHL-1.1",
+    "CERN-OHL-1.2",
+    "CERN-OHL-P-2.0",
+    "CERN-OHL-S-2.0",
+    "CERN-OHL-W-2.0",
+    "checkmk",
+    "ClArtistic",
+    "CNRI-Jython",
+    "CNRI-Python",
+    "CNRI-Python-GPL-Compatible",
+    "COIL-1.0",
+    "Community-Spec-1.0",
+    "Condor-1.1",
+    "copyleft-next-0.3.0",
+    "copyleft-next-0.3.1",
+    "CPAL-1.0",
+    "CPL-1.0",
+    "CPOL-1.02",
+    "Crossword",
+    "CrystalStacker",
+    "CUA-OPL-1.0",
+    "Cube",
+    "curl",
+    "D-FSL-1.0",
+    "diffmark",
+    "DL-DE-BY-2.0",
+    "DOC",
+    "Dotseqn",
+    "DRL-1.0",
+    "DSDP",
+    "dvipdfm",
+    "ECL-1.0",
+    "ECL-2.0",
+    "EFL-1.0",
+    "EFL-2.0",
+    "eGenix",
+    "Elastic-2.0",
+    "Entessa",
+    "EPICS",
+    "EPL-1.0",
+    "EPL-2.0",
+    "ErlPL-1.1",
+    "etalab-2.0",
+    "EUDatagrid",
+    "EUPL-1.0",
+    "EUPL-1.1",
+    "EUPL-1.2",
+    "Eurosym",
+    "Fair",
+    "FDK-AAC",
+    "Frameworx-1.0",
+    "FreeBSD-DOC",
+    "FreeImage",
+    "FSFAP",
+    "FSFUL",
+    "FSFULLR",
+    "FSFULLRWD",
+    "FTL",
+    "GD",
+    "GFDL-1.1-invariants-only",
+    "GFDL-1.1-invariants-or-later",
+    "GFDL-1.1-no-invariants-only",
+    "GFDL-1.1-no-invariants-or-later",
+    "GFDL-1.1-only",
+    "GFDL-1.1-or-later",
+    "GFDL-1.2-invariants-only",
+    "GFDL-1.2-invariants-or-later",
+    "GFDL-1.2-no-invariants-only",
+    "GFDL-1.2-no-invariants-or-later",
+    "GFDL-1.2-only",
+    "GFDL-1.2-or-later",
+    "GFDL-1.3-invariants-only",
+    "GFDL-1.3-invariants-or-later",
+    "GFDL-1.3-no-invariants-only",
+    "GFDL-1.3-no-invariants-or-later",
+    "GFDL-1.3-only",
+    "GFDL-1.3-or-later",
+    "Giftware",
+    "GL2PS",
+    "Glide",
+    "Glulxe",
+    "GLWTPL",
+    "gnuplot",
+    "GPL-1.0-only",
+    "GPL-1.0-or-later",
+    "GPL-2.0-only",
+    "GPL-2.0-or-later",
+    "GPL-3.0-only",
+    "GPL-3.0-or-later",
+    "gSOAP-1.3b",
+    "HaskellReport",
+    "Hippocratic-2.1",
+    "HPND",
+    "HPND-sell-variant",
+    "HTMLTIDY",
+    "IBM-pibs",
+    "ICU",
+    "IJG",
+    "ImageMagick",
+    "iMatix",
+    "Imlib2",
+    "Info-ZIP",
+    "Intel",
+    "Intel-ACPI",
+    "Interbase-1.0",
+    "IPA",
+    "IPL-1.0",
+    "ISC",
+    "Jam",
+    "JasPer-2.0",
+    "JPNIC",
+    "JSON",
+    "Knuth-CTAN",
+    "LAL-1.2",
+    "LAL-1.3",
+    "Latex2e",
+    "Leptonica",
+    "LGPL-2.0-only",
+    "LGPL-2.0-or-later",
+    "LGPL-2.1-only",
+    "LGPL-2.1-or-later",
+    "LGPL-3.0-only",
+    "LGPL-3.0-or-later",
+    "LGPLLR",
+    "Libpng",
+    "libpng-2.0",
+    "libselinux-1.0",
+    "libtiff",
+    "libutil-David-Nugent",
+    "LiLiQ-P-1.1",
+    "LiLiQ-R-1.1",
+    "LiLiQ-Rplus-1.1",
+    "Linux-man-pages-copyleft",
+    "Linux-OpenIB",
+    "LPL-1.0",
+    "LPL-1.02",
+    "LPPL-1.0",
+    "LPPL-1.1",
+    "LPPL-1.2",
+    "LPPL-1.3a",
+    "LPPL-1.3c",
+    "LZMA-SDK-9.11-to-9.20",
+    "LZMA-SDK-9.22",
+    "MakeIndex",
+    "Minpack",
+    "MirOS",
+    "MIT",
+    "MIT-0",
+    "MIT-advertising",
+    "MIT-CMU",
+    "MIT-enna",
+    "MIT-feh",
+    "MIT-Modern-Variant",
+    "MIT-open-group",
+    "MITNFA",
+    "Motosoto",
+    "mpi-permissive",
+    "mpich2",
+    "MPL-1.0",
+    "MPL-1.1",
+    "MPL-2.0",
+    "MPL-2.0-no-copyleft-exception",
+    "mplus",
+    "MS-LPL",
+    "MS-PL",
+    "MS-RL",
+    "MTLL",
+    "MulanPSL-1.0",
+    "MulanPSL-2.0",
+    "Multics",
+    "Mup",
+    "NAIST-2003",
+    "NASA-1.3",
+    "Naumen",
+    "NBPL-1.0",
+    "NCGL-UK-2.0",
+    "NCSA",
+    "Net-SNMP",
+    "NetCDF",
+    "Newsletr",
+    "NGPL",
+    "NICTA-1.0",
+    "NIST-PD",
+    "NIST-PD-fallback",
+    "NLOD-1.0",
+    "NLOD-2.0",
+    "NLPL",
+    "Nokia",
+    "NOSL",
+    "Noweb",
+    "NPL-1.0",
+    "NPL-1.1",
+    "NPOSL-3.0",
+    "NRL",
+    "NTP",
+    "NTP-0",
+    "O-UDA-1.0",
+    "OCCT-PL",
+    "OCLC-2.0",
+    "ODbL-1.0",
+    "ODC-By-1.0",
+    "OFL-1.0",
+    "OFL-1.0-no-RFN",
+    "OFL-1.0-RFN",
+    "OFL-1.1",
+    "OFL-1.1-no-RFN",
+    "OFL-1.1-RFN",
+    "OGC-1.0",
+    "OGDL-Taiwan-1.0",
+    "OGL-Canada-2.0",
+    "OGL-UK-1.0",
+    "OGL-UK-2.0",
+    "OGL-UK-3.0",
+    "OGTSL",
+    "OLDAP-1.1",
+    "OLDAP-1.2",
+    "OLDAP-1.3",
+    "OLDAP-1.4",
+    "OLDAP-2.0",
+    "OLDAP-2.0.1",
+    "OLDAP-2.1",
+    "OLDAP-2.2",
+    "OLDAP-2.2.1",
+    "OLDAP-2.2.2",
+    "OLDAP-2.3",
+    "OLDAP-2.4",
+    "OLDAP-2.5",
+    "OLDAP-2.6",
+    "OLDAP-2.7",
+    "OLDAP-2.8",
+    "OML",
+    "OpenSSL",
+    "OPL-1.0",
+    "OPUBL-1.0",
+    "OSET-PL-2.1",
+    "OSL-1.0",
+    "OSL-1.1",
+    "OSL-2.0",
+    "OSL-2.1",
+    "OSL-3.0",
+    "Parity-6.0.0",
+    "Parity-7.0.0",
+    "PDDL-1.0",
+    "PHP-3.0",
+    "PHP-3.01",
+    "Plexus",
+    "PolyForm-Noncommercial-1.0.0",
+    "PolyForm-Small-Business-1.0.0",
+    "PostgreSQL",
+    "PSF-2.0",
+    "psfrag",
+    "psutils",
+    "Python-2.0",
+    "Python-2.0.1",
+    "Qhull",
+    "QPL-1.0",
+    "Rdisc",
+    "RHeCos-1.1",
+    "RPL-1.1",
+    "RPL-1.5",
+    "RPSL-1.0",
+    "RSA-MD",
+    "RSCPL",
+    "Ruby",
+    "SAX-PD",
+    "Saxpath",
+    "SCEA",
+    "SchemeReport",
+    "Sendmail",
+    "Sendmail-8.23",
+    "SGI-B-1.0",
+    "SGI-B-1.1",
+    "SGI-B-2.0",
+    "SHL-0.5",
+    "SHL-0.51",
+    "SimPL-2.0",
+    "SISSL",
+    "SISSL-1.2",
+    "Sleepycat",
+    "SMLNJ",
+    "SMPPL",
+    "SNIA",
+    "Spencer-86",
+    "Spencer-94",
+    "Spencer-99",
+    "SPL-1.0",
+    "SSH-OpenSSH",
+    "SSH-short",
+    "SSPL-1.0",
+    "SugarCRM-1.1.3",
+    "SWL",
+    "TAPR-OHL-1.0",
+    "TCL",
+    "TCP-wrappers",
+    "TMate",
+    "TORQUE-1.1",
+    "TOSL",
+    "TU-Berlin-1.0",
+    "TU-Berlin-2.0",
+    "UCL-1.0",
+    "Unicode-DFS-2015",
+    "Unicode-DFS-2016",
+    "Unicode-TOU",
+    "Unlicense",
+    "UPL-1.0",
+    "Vim",
+    "VOSTROM",
+    "VSL-1.0",
+    "W3C",
+    "W3C-19980720",
+    "W3C-20150513",
+    "Watcom-1.0",
+    "Wsuipa",
+    "WTFPL",
+    "X11",
+    "X11-distribute-modifications-variant",
+    "Xerox",
+    "XFree86-1.1",
+    "xinetd",
+    "Xnet",
+    "xpp",
+    "XSkat",
+    "YPL-1.0",
+    "YPL-1.1",
+    "Zed",
+    "Zend-2.0",
+    "Zimbra-1.3",
+    "Zimbra-1.4",
+    "Zlib",
+    "zlib-acknowledgement",
+    "ZPL-1.1",
+    "ZPL-2.0",
+    "ZPL-2.1",
+    "BSD",  # Need to identify license version.
+    "OPENLDAP",  # Need to identify license version.
+    "lsof",  # COPYING file added to source code on Jan 13 2023.
+    # One paragraph inside the 955-line README
+    # describes its license.
+    "netcat",
+}
+
+
+LSOF_LICENSE_TXT = """Copyright 2002 Purdue Research Foundation,
+West Lafayette,
+Indiana 47907.  All rights reserved.
+
+Written by Victor A. Abell
+
+This software is not subject to any license of the American
+Telephone and Telegraph Company or the Regents of the
+University of California.
+
+Permission is granted to anyone to use this software for
+any purpose on any computer system, and to alter it and
+redistribute it freely, subject to the following
+restrictions:
+
+1. Neither the authors nor Purdue University are responsible
+   for any consequences of the use of this software.
+
+2. The origin of this software must not be misrepresented,
+   either by explicit claim or by omission.  Credit to the
+   authors and Purdue University must appear in documentation
+   and sources.
+
+3. Altered versions must be plainly marked as such, and must
+   not be misrepresented as being the original software.
+
+4. This notice may not be removed or altered."""
+
+
+OTHER_LICENSE_LSOF = {
+    "license_name": "LicenseRef-lsof",
+    "file_names": ["lsof/COPYING"],
+    "license_txt": LSOF_LICENSE_TXT,
+}
+
+
+NETCAT_LICENSE_TXT = """Netcat is entirely my own creation, although
+plenty of other code was used as
+examples.  It is freely given away to the Internet community in the hope that
+it will be useful, with no restrictions except giving credit where it is due.
+No GPLs, Berkeley copyrights or any of that nonsense.  The author assumes NO
+responsibility for how anyone uses it.  If netcat makes you rich somehow and
+you're feeling generous, mail me a check.  If you are affiliated in any way
+with Microsoft Network, get a life.  Always ski in control.  Comments,
+questions, and patches to nc110-devel@lists.sourceforge.net."""
+
+
+OTHER_LICENSE_NETCAT = {
+    "license_name": "LicenseRef-netcat",
+    "file_names": ["nc110/README"],
+    "license_txt": NETCAT_LICENSE_TXT,
+}
diff --git a/hooks/install/sbom_info_lib/licenses.py b/hooks/install/sbom_info_lib/licenses.py
index cbf379d..bc6ea60 100644
--- a/hooks/install/sbom_info_lib/licenses.py
+++ b/hooks/install/sbom_info_lib/licenses.py
@@ -14,58 +14,259 @@
 
 import re
 import os
-
-# Parse LICENSE is a ebuild.
-def parse_gentoo_license(line):
-    license_set = set()
-    use_or = False
-    res = ""
-    for e in line.strip().split(" "):
-        if e == "||":
-            use_or = True
-        elif e == "(":
-            if res:
-                res += "AND ("
-            else:
-                res += "("
-        elif e == ")":
-            res = f"{res[:-1]}) "
-            use_or = False
-        else:
-            license_set.add(e)
-            if not res or res.endswith("("):
-                res += f"{e} "
-            elif use_or:
-                res += f"OR {e} "
-            else:
-                res += f"AND {e} "
-    return res.strip(), license_set
+from chromite.lib import cros_build_lib
+from chromite.licensing import licenses_lib
+from sbom_info_lib import license_data
 
 
-# If a license is in license.yaml but not LICENSE,
-# add it to the result.
-def parse_license_yaml(yaml, res, license_set):
-    lines = yaml.strip().split("\n")
-    idx = lines.index("  - license_names") + 1
-    match = re.findall("\{(.*?)\}", lines[idx], re.DOTALL)
-    if not match:
-        return res
+LICENSE_MAP = {
+    "Artistic-2": "Artistic-2.0",
+    "BSD-2": "BSD-2-Clause",
+    "BSD-4": "BSD-4-Clause",
+    "BSD-Google": "BSD-3-Clause",
+    "BZIP2": "bzip2-1.0.6",
+    "Boost-1.0": "BSL-1.0",
+    "FDL-1.1": "GFDL-1.1-only",
+    "FDL-1.2": "GFDL-1.2-only",
+    "GPL-2": "GPL-2.0-only",
+    "GPL-2+": "GPL-2.0-or-later",
+    "GPL-3": "GPL-3.0-only",
+    "GPL-3+": "GPL-3.0-or-later",
+    "LGPL-2": "LGPL-2.0-only",
+    "LGPL-2.1": "LGPL-2.1-only",
+    "LGPL-2.1+": "LGPL-2.1-or-later",
+    "LGPL-3": "LGPL-3.0-only",
+    "LGPL-3+": "LGPL-3.0-or-later",
+    "PSF-2": "PSF-2.0",
+    "RSA": "RSA-MD",
+    "UoI-NCSA": "NCSA",
+    "ZLIB": "Zlib",
+    "ZPL": "ZPL-2.1",
+    "openssl": "OpenSSL",
+    "vim": "Vim",
+}
+
+
+LICENSE_NAMES_REGEX = [
+    r"^copyright$",
+    r"^copyright[.]txt$",
+    r"^copyright[.]regex$",  # llvm
+    r"^copying.*$",
+    r"^licen[cs]e.*$",
+    r"^licensing.*$",  # libatomic_ops
+    r"^ipa_font_license_agreement_v1[.]0[.]txt$",  # ja-ipafonts
+    r"^MIT-LICENSE$",  # rake
+    r"^PKG-INFO$",  # copyright assignment for
+    # some python packages
+    # (netifaces, unittest2)
+    r"^NOTICE$",  # mit-krb5
+]
+
+LICENSE_REF = "LicenseRef-{}"
+
+
+BSD_VERSION_IDENTIFIER = [
+    "",
+    "Redistributions of source code must retain",
+    "Redistributions in binary form must reproduce",
+    "endorse or promote products derived",
+    "All advertising materials mentioning features",
+]
+
+REGEX_LICENSE_NAME = "- license_names\n  - !!set \{(.*?)\}"
+REGEX_LICENSE_TEXT = "- license_text_scanned\n  - \[(.*?)\]\n- !!python/tuple"
+REGEX_LICENSE_FILE_NAME = "Scanned Source License ([^\n]*?):"
+
+
+SPECIAL_LICENSE_MAP = {
+    "lsof": license_data.OTHER_LICENSE_LSOF,
+    "netcat": license_data.OTHER_LICENSE_NETCAT,
+}
+
+
+class UnknownLicenseError(Exception):
+    def __init__(self, msg):
+        super().__init__(msg)
+
+
+# Parse license.yaml.
+def parse_license_yaml(yaml, pkg_name):
+    # Try to find scanned license text in license.yaml.
+    saved_scanned_txt = ""
+    saved_license_files = []
+    scanned_txt_match = re.findall(REGEX_LICENSE_TEXT, yaml, re.DOTALL)
+    if scanned_txt_match:
+        for m in scanned_txt_match:
+            if not m.strip():
+                continue
+            saved_scanned_txt = m.strip()
+            license_file_match = re.findall(REGEX_LICENSE_FILE_NAME, m, re.DOTALL)
+            for n in license_file_match:
+                saved_license_files.append(n.strip())
+
+    # Try to find scanned license names in license.yaml.
     found = []
-    for m in match:
-        for part in m.split(","):
-            found.append(part.split(":")[0])
-    for license in found:
-        license = license.strip()
-        if license and not license in license_set:
-            license_set.add(license)
-            res += f" AND {license}"
-    return res
+    license_match = re.findall(REGEX_LICENSE_NAME, yaml, re.DOTALL)
+    if license_match:
+        for m in license_match:
+            for part in m.replace("\n", " ").split(","):
+                license = part.split(":")[0].strip()
+                # Being in the public domain is not a license.
+                if (
+                    not license
+                    or license == "public-domain"
+                    or license == "metapackage"
+                ):
+                    continue
+                if license in LICENSE_MAP:
+                    license = LICENSE_MAP[license]
+                found.append(license)
+    # There are cases where license.yaml contains no license
+    # but only scanned license text e.g. dev-libs/libpcre.
+    if not found and saved_license_files:
+        found.append(pkg_name)
+    return found, saved_scanned_txt, saved_license_files
 
 
-def get_licenses(build_info_dir):
-    if not os.path.exists(os.path.join(build_info_dir, "LICENSE")):
+def extract_other_licenses(licenses, src_path, saved_scanned_txt, saved_license_files):
+    # other_license_list format: [
+    # {
+    #     "license_name": <license_name>,
+    #     "file_names": [file_name],
+    #     "license_txt": <license_txt>
+    # },{
+    #     ......
+    # }]
+    other_license_list = []
+    all_license_files = list_all_license_files(src_path)
+    for license in licenses:
+        if license not in license_data.SPDX_LICENSES:
+            license_file_names = []
+            # Find license files same as license names
+            # e.g. LICENCE.ralink-firmware.txt.
+            license_txt, license_file_names = find_license_file(
+                src_path, all_license_files, license
+            )
+            if not license_txt:
+                if not saved_scanned_txt:
+                    # Find general license files e.g. COPYING.
+                    (
+                        saved_scanned_txt,
+                        saved_license_files,
+                    ) = find_general_license_txt(src_path, all_license_files)
+                    if not saved_scanned_txt:
+                        raise UnknownLicenseError(
+                            f"unknown license without scanned text: {license}"
+                        )
+                license_txt = saved_scanned_txt
+                license_file_names = saved_license_files
+
+            # Mark licenses not listed by SPDX spec as license reference.
+            license_ref = LICENSE_REF.format(license)
+            other_license_list.append(
+                {
+                    "license_name": license_ref,
+                    "file_names": license_file_names,
+                    "license_txt": license_txt,
+                }
+            )
+            licenses[licenses.index(license)] = license_ref
+        # Identify BSD version.
+        if "BSD" in licenses:
+            if not saved_scanned_txt:
+                saved_scanned_txt, _ = find_general_license_txt(src_path)
+                if not saved_scanned_txt:
+                    raise UnknownLicenseError(
+                        f"no license file found, cannot identify BSD version"
+                    )
+            bsd_v = find_bsd_version(saved_scanned_txt)
+            licenses[licenses.index("BSD")] = bsd_v
+        # Identify OPENLDAP version.
+        if "OPENLDAP" in licenses:
+            # Currently, OPENLDAP is only used in mit-krb5.
+            if "krb5" in saved_scanned_txt:
+                licenses[licenses.index("OPENLDAP")] = "OLDAP-2.8"
+            else:
+                raise UnknownLicenseError(f"license OPENLDAP doesn't have a version")
+        # Handle special licenses if present.
+        for license, license_content in SPECIAL_LICENSE_MAP.items():
+            if license in licenses:
+                license_ref = LICENSE_REF.format(license)
+                licenses[licenses.index(license)] = license_ref
+                other_license_list.append(license_content)
+    return other_license_list
+
+
+def find_bsd_version(saved_scanned_txt):
+    i = len(BSD_VERSION_IDENTIFIER) - 1
+    while i > 0:
+        if BSD_VERSION_IDENTIFIER[i] in saved_scanned_txt:
+            return f"BSD-{i}-Clause"
+        i -= 1
+    raise UnknownLicenseError(f"failed to identify BSD version")
+
+
+def list_all_license_files(src_path):
+    args = ["find", src_path, "-type", "f"]
+    result = cros_build_lib.run(args, stdout=True, encoding="utf-8")
+    # Truncate results to look like this: swig-2.0.4/COPYRIGHT
+    files = [x[len(src_path) :].lstrip("/") for x in result.stdout.splitlines()]
+    license_files = []
+    for name in files:
+        if ".git/" in name:
+            continue
+        basename = os.path.basename(name)
+        # Looking for license.* brings up things like license.gpl, and we
+        # never want a GPL license when looking for copyright attribution,
+        # so we skip them here. We also skip regexes that can return
+        # license.py (seen in some code).
+        if re.search(r".*GPL.*", basename) or re.search(r"\.py$", basename):
+            continue
+        for regex in LICENSE_NAMES_REGEX:
+            if re.search(regex, basename, re.IGNORECASE):
+                license_files.append(name)
+                break
+    return license_files
+
+
+# Find license files same as license names
+# e.g. LICENCE.ralink-firmware.txt
+# in sys-kernel/linux-firmware.
+def find_license_file(src_path, license_files, license):
+    for name in license_files:
+        basename = os.path.basename(name)
+        if os.path.splitext(basename)[0] == license or basename == license:
+            license_path = os.path.join(src_path, name)
+            return licenses_lib.ReadUnknownEncodedFile(
+                license_path, "Adding License"
+            ), [name]
+    return "", []
+
+
+# Find general license files e.g. COPYING.
+# Partially copy-pasted from chromite.licensing.licenses_lib._ExtractLicenses().
+def find_general_license_txt(src_path, license_files):
+    res = ""
+    for license_file in sorted(license_files):
+        # Joy and pink ponies. Some license_files are encoded as latin1 while
+        # others are utf-8 and of course you can't know but only guess.
+        license_path = os.path.join(src_path, license_file)
+        license_txt = licenses_lib.ReadUnknownEncodedFile(
+            license_path, "Adding License"
+        )
+        res += "\n\nScanned Source License %s:\n\n%s" % (license_file, license_txt)
+    return res, license_files
+
+
+def get_licenses(build_info_dir, src_path, pkg_name):
+    if not os.path.exists(os.path.join(build_info_dir, "license.yaml")):
         return ""
-    with open(os.path.join(build_info_dir, "LICENSE"), "r") as f:
-        res, license_set = parse_gentoo_license(f.read())
-        with open(os.path.join(build_info_dir, "license.yaml"), "r") as y:
-            return parse_license_yaml(y.read(), res, license_set)
+    with open(os.path.join(build_info_dir, "license.yaml"), "r") as l:
+        licenses, saved_scanned_txt, saved_license_files = parse_license_yaml(
+            l.read(), pkg_name
+        )
+
+    other_license_list = extract_other_licenses(
+        licenses, src_path, saved_scanned_txt, saved_license_files
+    )
+    return " AND ".join(licenses), other_license_list