Add SBOM info generation to install hook

It writes the info to the file `sbom-pkg-info`
for each package merged.
Example output: https://paste.googleplex.com/5945476472897536

chromite/lib/gs.py cannot be used because the class
cannot be initialized due to permission denied when creating
cache dir.

BUG=b/254334533
TEST=presubmit
RELEASE_NOTE=None

Change-Id: Iaafc26f1d9726f41d342376c971955cf2dc7c68d
Reviewed-on: https://cos-review.googlesource.com/c/third_party/platform/crosutils/+/39267
Reviewed-by: Robert Kolchmeyer <rkolchmeyer@google.com>
Tested-by: Cusky Presubmit Bot <presubmit@cos-infra-prod.iam.gserviceaccount.com>
diff --git a/hooks/install/gen-sbom-package-info.py b/hooks/install/gen-sbom-package-info.py
new file mode 100755
index 0000000..59e5ffd
--- /dev/null
+++ b/hooks/install/gen-sbom-package-info.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+#
+# Copyright 2022 Google LLC
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# version 2 as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+
+# This script is used to automatically generate package
+# information for SBOM of COS image bundled dependencies.
+
+import os
+import sys
+from sbom_info_lib import download_url
+from sbom_info_lib import go_dep
+from sbom_info_lib import licenses
+
+SBOM_INFO_FILE_NAME = "sbom-pkg-info"
+
+
+class SbomPackageInfo:
+    """Download URL, licenses and Go dependencies of one package."""
+
+    def __init__(self, url, license, go_dep):
+        self.download_url, self.licenses, self.go_dep = url, license, go_dep
+
+    def write_to_build_info(self, build_info_dir):
+        """Write the fields as `key:value` lines into `build_info_dir`."""
+        fields = ("download-url", self.download_url), ("licenses", self.licenses), ("go-dep", self.go_dep)
+        with open(f"{build_info_dir}/{SBOM_INFO_FILE_NAME}", "w") as out:
+            out.writelines(f"{key}:{value}\n" for key, value in fields)
+
+
+class SBOMPkgInfoError(Exception):
+    """Raised when the download URL or the license of a package
+    cannot be found."""
+
+
+def main():
+    """Generate `sbom-pkg-info` for the package in $PORTAGE_BUILDDIR."""
+    package_dir = os.getenv("PORTAGE_BUILDDIR")
+    if not package_dir:
+        raise SBOMPkgInfoError("PORTAGE_BUILDDIR is not set")
+    build_info_dir = os.path.join(package_dir, "build-info")
+    ebuild = os.path.join(build_info_dir, os.path.basename(package_dir) + ".ebuild")
+    url = download_url.get_download_url(build_info_dir, ebuild)
+    # Validate first: get_go_dep() downloads sources and needs a URL.
+    if not url and "private-overlays" not in ebuild:
+        raise SBOMPkgInfoError("download url not found")
+    license_expr = licenses.get_licenses(build_info_dir)
+    if not license_expr:
+        raise SBOMPkgInfoError("license not found")
+    deps = go_dep.get_go_dep(url, build_info_dir) if url else ""
+    SbomPackageInfo(url, license_expr, deps).write_to_build_info(build_info_dir)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/hooks/install/sbom_info_lib/download_url.py b/hooks/install/sbom_info_lib/download_url.py
new file mode 100644
index 0000000..9df3912
--- /dev/null
+++ b/hooks/install/sbom_info_lib/download_url.py
@@ -0,0 +1,389 @@
+# Copyright 2022 Google LLC
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# version 2 as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+
+# get_download_url() in this script is used to
+# find download location for a COS package.
+
+import subprocess
+import re
+import os
+import requests
+
+
+CROS_GCS_MIRRORS = [
+    "gs://chromeos-mirror/gentoo/distfiles/",
+    "gs://chromeos-localmirror/distfiles/",
+]
+# An allow-list for variables parsed in an ebuild file.
+EBUILD_VARS = {
+    "MY_P",
+    "MY_PV",
+    "MY_PN",
+    "PARCH",
+    "SRC_PV",
+    "code_ver",
+    "RE2_VER",
+    "MODULE_VERSION",
+    "GIT_COMMIT",
+    "SRC_URI",
+    "EGIT_REPO_URI",
+    "EGIT_COMMIT",
+    "CROS_WORKON_COMMIT",
+    "CROS_WORKON_PROJECT",
+    "CROS_WORKON_SUBTREE",
+    "HOMEPAGE",
+    "CROS_GO_SOURCE",
+    "GN_X64_SHA1",
+    "LLVM_HASH",
+    "CROS_WORKON_REPO",
+    "GNOME_ORG_MODULE",
+}
+# For packages whose package names are hard to parse or not defined in ebuilds.
+PN_REPLACE_DICT = {
+    "Locale-gettext": lambda x: "gettext",
+    "systemd": lambda x: "systemd-stable" if "." in x else "systemd",
+    "perf": lambda x: "patch",
+}
+SRC_URI_VARS = ["SRC_URI", "EGIT_REPO_URI"]
+COMMIT_VARS = ["GIT_COMMIT", "EGIT_COMMIT", "LLVM_HASH"]
+# REGEX_STRING_VAR finds `var_name=var_value` and `var_name="var_value"` (no new line) in ebuilds.
+REGEX_STRING_VAR = r'([^\n]*?)="?([^\n]*?)"?\n'
+# REGEX_ARRAY_VAR finds `var_name=("var_value1" "var_value2" ...)` (allow new lines) in ebuilds.
+REGEX_ARRAY_VAR = r"([^\n]*?)=(\(.*?\))"
+# REGEX_SRC_URI finds `SRC_URI="uri1 uri2 ..."` (allow new lines) in ebuilds.
+REGEX_SRC_URI = r'SRC_URI="(.*?)"'
+# REGEX_SRC_URI_PLUS finds `SRC_URI+="uri1 uri2 ..."` (allow new lines) in ebuilds.
+REGEX_SRC_URI_PLUS = r'SRC_URI\+="(.*?)"'
+# REGEX_PKG_REVISION finds package revision like `-r12` in package full name.
+REGEX_PKG_REVISION = r"-r[0-9]+$"
+# REGEX_PKG_VERSION finds package version like `-1` or `-1.2.3.4` in package full name.
+REGEX_PKG_VERSION = r"-[0-9]+(\.[0-9]*)*"
+# REGEX_FIND_STRING finds string inside double quotes like "string1".
+REGEX_FIND_STRING = r'"(.*?)"'
+# REGEX_EBUILD_REPLACE finds ebuild replacement string `(ver_rs 1- some_string)`.
+REGEX_EBUILD_REPLACE = r"\$\(ver_rs 1- (.*?)\)"
+GNOME_PN = "GNOME_ORG_MODULE"
+GO_SOURCE = "CROS_GO_SOURCE"
+CROS_REPO = "CROS_WORKON_REPO"
+CROS_COMMIT = "CROS_WORKON_COMMIT"
+CROS_PROJECT = "CROS_WORKON_PROJECT"
+CROS_SUBTREE = "CROS_WORKON_SUBTREE"
+CROS_GIT_HOST_URL = "https://chromium.googlesource.com"
+CROS_GIT_AOSP_URL = "https://android.googlesource.com"
+CROS_HOMEPAGE = "HOMEPAGE"
+GOB_REPO_DICT = {
+    "project-lakitu": "https://cos.googlesource.com/cos/overlays/board-overlays/+/master/project-lakitu/",
+    "chromiumos-overlay": "https://cos.googlesource.com/third_party/overlays/chromiumos-overlay/+/master/",
+    "portage-stable": "https://cos.googlesource.com/third_party/overlays/portage-stable/+/master/",
+    "eclass-overlay": "https://cos.googlesource.com/third_party/overlays/eclass-overlay/+/master/",
+}
+# Packages that use `MODULE_VERSION` as package version.
+KEYWORDS_FOR_MODULE_VERSION = ["dev-perl", "perl-core"]
+PACKAGES_FROM_GOB = {
+    # portage-stable
+    "dev-util/meson-format-array",
+    "sys-devel/autoconf-wrapper",
+    "sys-devel/automake-wrapper",
+    "dev-python/namespace-zope",
+    # project-lakitu
+    "app-admin/cgroup-helper",
+    "app-admin/extensions-manager",
+    "app-admin/kdump-helper",
+    "app-admin/stackdriver",
+    "app-admin/toolbox-config",
+    "app-emulation/cloud-init-config",
+    "chromeos-base/chromeos-auth-config-lakitu",
+    "chromeos-base/chromeos-base",
+    "chromeos-base/chromeos-bsp-lakitu-common",
+    "chromeos-base/chromeos-firewall-init-lakitu",
+    "chromeos-base/chromeos-init-systemd",
+    "chromeos-base/cloud-audit-config",
+    "chromeos-base/cloud-filesystem-init",
+    "chromeos-base/cloud-network-init",
+    "net-misc/chrony-config",
+    "sys-apps/loadpin-trigger",
+    "sys-apps/system-sysdaemons",
+    "sys-libs/lakitu-custom-locales",
+    # chromiumos-overlay
+    "chromeos-base/chromeos-ca-certificates",
+    "chromeos-base/chromeos-sshd-init",
+    "chromeos-base/tty",
+    "chromeos-base/update-policy-embedded",
+    "dev-util/glib-utils",
+    "chromeos-base/openssh-server-init",
+}
+
+
+def is_uri_valid(uri):
+    """Return True if `uri` is an http(s) URL that answers a GET with 200."""
+    if not uri.strip().startswith("http"):
+        return False
+    # stream=True skips the body; `with` releases the connection afterwards.
+    with requests.get(uri, stream=True) as response:
+        return response.status_code == 200
+
+
+def parse_var(s):
+    """Expand the bash expression `s` with `echo` and return its output.
+
+    Returns "" if bash writes anything to stderr.
+    """
+    # Keep only the part before `->` (avoid downloading packages).
+    s = s.split("->")[0]
+    # Quote array literals so that bash echoes them and evaluates nothing.
+    if s.startswith("("):
+        s = f"'{s}'"
+    # NOTE(review): `s` is ebuild text that is run through a shell.
+    res = subprocess.run(["bash", "-c", f"echo {s}"], capture_output=True)
+    if res.stderr:
+        return ""
+    return res.stdout.decode("utf-8").rstrip()
+
+
+def parse_var_from_env(key):
+    """Parse environment variable `key` and return its values as a list.
+
+    A bash array value `("a" "b")` yields one expanded item per quoted
+    string; any other non-empty value yields a one-element list.
+    """
+    val = os.getenv(key)
+    if not val:
+        return []
+    if not val.startswith("("):
+        return [val]
+    # in some cases, go src version cannot be parsed in array
+    # e.g. chromiumos-overlay/dev-go/protobuf
+    return [parse_var(m) for m in re.findall(REGEX_FIND_STRING, val, re.DOTALL)]
+
+
+def find_var_and_set_env(regex, content):
+    """Export allow-listed variables matched by `regex` to os.environ."""
+    exported = set()
+    for found in re.findall(regex, content, re.DOTALL):
+        name = found[0].strip()
+        if name not in EBUILD_VARS:
+            continue
+        value = parse_var(found[1]).strip()
+        if value:
+            os.environ[name] = value
+            exported.add(name)
+    return exported
+
+
+def parse_vars_in_ebuild(content):
+    """Export the variables of ebuild text `content`; return their names."""
+    # Replace ebuild replacement grammar with bash format,
+    # i.e. `$(ver_rs 1- X)` becomes `${PV//./X}`.
+    for sep in re.findall(REGEX_EBUILD_REPLACE, content, re.DOTALL):
+        content = content.replace(f"$(ver_rs 1- {sep})", f"${{PV//./{sep}}}")
+    exported = set()
+    for regex in (REGEX_STRING_VAR, REGEX_ARRAY_VAR):
+        exported.update(find_var_and_set_env(regex, content))
+    return exported
+
+
+def parse_pkg_name(pf):
+    """Split full package name `pf` and export PN, PV and P to os.environ.
+
+    Returns `(package name, candidate archive names in the GCS mirrors)`.
+    """
+    revision = re.search(REGEX_PKG_REVISION, pf)
+    p = pf[: revision.start()] if revision else pf
+    version = re.search(REGEX_PKG_VERSION, p)
+    if not version:
+        raise ValueError(f"no version found in package name: {pf}")
+    p_name = pn = p[: version.start()]
+    pv = p[version.start() + 1 :]
+    if pn in PN_REPLACE_DICT:
+        pn = PN_REPLACE_DICT[pn](pv)
+        p = f"{pn}-{pv}"
+    os.environ.update(PN=pn, PV=pv, P=p)
+    # possible package names in CROS GCS mirror buckets.
+    return p_name, {f"{p}.tar.gz", f"{p}.tar.xz", f"{p}.tgz", f"{p}.xz"}
+
+
+def search_pkg_from_gob(repository, category, p_name):
+    """Return the GoB URL of a package kept in an overlay repo, else "".
+
+    Looks up allow-listed, `virtual` and `eclass-overlay` packages only.
+    """
+    pkg = f"{category}/{p_name}"
+    from_gob = pkg in PACKAGES_FROM_GOB or category == "virtual" or repository == "eclass-overlay"
+    # A repository without a known GoB base URL cannot be looked up.
+    if not from_gob or repository not in GOB_REPO_DICT:
+        return ""
+    uri = os.path.join(GOB_REPO_DICT[repository], pkg)
+    return uri if is_uri_valid(uri) else ""
+
+
+def find_cros_uri():
+    """Return `repo/project@commit[#subtree]` URIs from CROS_WORKON_* vars."""
+    res = []
+    cros_repo = parse_var_from_env(CROS_REPO)
+    cros_proj = parse_var_from_env(CROS_PROJECT)
+    cros_subtree = parse_var_from_env(CROS_SUBTREE)
+    cros_commit = parse_var_from_env(CROS_COMMIT)
+    if len(cros_proj) != len(cros_commit):
+        return res
+    for i, (proj, commit) in enumerate(zip(cros_proj, cros_commit)):
+        # Entries missing from the shorter arrays fall back to the defaults.
+        repo = cros_repo[i] if i < len(cros_repo) else CROS_GIT_HOST_URL
+        subtree = cros_subtree[i] if i < len(cros_subtree) else ""
+        uri = os.path.join(repo, proj)
+        if not is_uri_valid(uri):
+            continue
+        if subtree:
+            res.extend(f"{uri}@{commit}#{s}" for s in subtree.split(" "))
+        else:
+            res.append(f"{uri}@{commit}")
+    return res
+
+
+def get_gcs_name_from_src_uri(regex, content):
+    """Return base names of URIs in the SRC_URI values matched by `regex`."""
+    gcs_names = set()
+    for src_uri_group in re.findall(regex, content, re.DOTALL):
+        # split() handles spaces, tabs and new lines, and drops the empty
+        # tokens that would otherwise each cost a bash call in parse_var().
+        for uri in src_uri_group.split():
+            if uri == "->":
+                continue
+            gcs_names.add(os.path.basename(parse_var(uri)))
+    return gcs_names
+
+
+# Parse ebuild and set environment variables.
+# Find possible CROS gcs mirror package names,
+# and cros download url.
+def parse_ebuild(ebuild):
+    """Return `(exported env names, cros uris, candidate GCS names)`."""
+    gcs_names = set()
+    with open(ebuild) as eb:
+        content = eb.read()
+    env_set = parse_vars_in_ebuild(content)
+    cros_uri = find_cros_uri()
+    if any(keyword in ebuild for keyword in KEYWORDS_FOR_MODULE_VERSION):
+        gcs_names.add(f'{os.getenv("PN")}-{os.getenv("MODULE_VERSION")}.tar.gz')
+    gnome_pn = os.getenv(GNOME_PN)
+    if gnome_pn:
+        gcs_names.add(f'{gnome_pn}-{os.getenv("PV")}')
+    # Use `SRC_URI+=` names only when `SRC_URI=` yields none.
+    gcs_names_src = get_gcs_name_from_src_uri(REGEX_SRC_URI, content)
+    gcs_names_src.discard("")
+    if not gcs_names_src:
+        gcs_names_src = get_gcs_name_from_src_uri(REGEX_SRC_URI_PLUS, content)
+    gcs_names.update(gcs_names_src)
+    return env_set, cros_uri, gcs_names
+
+
+def search_mirror_gcs(gcs_names):
+    """Return the first `gs://` URL of `gcs_names` found in a mirror, else ""."""
+    for name in gcs_names:
+        name = name.replace("?", "%3f")
+        for bucket in CROS_GCS_MIRRORS:
+            link = os.path.join(bucket, name)
+            res = subprocess.run(["gsutil", "ls", link], capture_output=True)
+            # Rely on the exit status: output on stderr alone does not
+            # prove that the listing failed.
+            found = res.stdout.decode("utf-8").rstrip()
+            if res.returncode == 0 and found:
+                return found
+    return ""
+
+
+def search_src_uri():
+    """Return the first valid source URI in the env, `@commit`-pinned if known."""
+    for uri in map(os.getenv, SRC_URI_VARS):
+        if not uri or not is_uri_valid(uri):
+            continue
+        # Pin the URI to the first commit variable that is set,
+        # or return it as is when none is.
+        commit = next(filter(None, map(os.getenv, COMMIT_VARS)), None)
+        return f"{uri}@{commit}" if commit else uri
+    return ""
+
+
+def search_go_source(category):
+    """Return comma-joined `https://source@version` of CROS_GO_SOURCE items."""
+    found = []
+    for src in parse_var_from_env(GO_SOURCE):
+        fields = src.split(" ")
+        # Only items of the form `source[:source...] version` are used.
+        if len(fields) != 2:
+            continue
+        candidates = ("https://" + part for part in fields[0].split(":"))
+        # Keep the first candidate that is reachable.
+        uri = next((c for c in candidates if is_uri_valid(c)), None)
+        if uri is not None:
+            found.append(f"{uri}@{fields[1]}")
+    return ",".join(found)
+
+
+def search_homepage():
+    """Return the chromium.googlesource.com HOMEPAGE (`@commit` if set), else ""."""
+    # HOMEPAGE may be unset; default to "" so that the `in` test cannot fail.
+    homepage = os.getenv(CROS_HOMEPAGE, "")
+    if "chromium.googlesource.com" not in homepage or not is_uri_valid(homepage):
+        return ""
+    commit = os.getenv(CROS_COMMIT)
+    return f"{homepage}@{commit}" if commit else homepage
+
+
+def search_download_location(gcs_names, category, cros_uri):
+    """Return the first download location found, as a string, else "".
+
+    Tried in order: CROS GCS mirrors, SRC_URI variables, `cros_uri`,
+    CROS_GO_SOURCE and HOMEPAGE.
+    """
+    res = search_mirror_gcs(gcs_names) or search_src_uri()
+    if res:
+        return res
+    if cros_uri:
+        # find_cros_uri() returns a list; join it so that every path
+        # returns a comma-separated string like search_go_source().
+        if isinstance(cros_uri, str):
+            return cros_uri
+        return ",".join(cros_uri)
+    return search_go_source(category) or search_homepage() or ""
+
+
+def unset_env(env_set):
+    """Reset every variable named in `env_set` to "" in os.environ."""
+    os.environ.update(dict.fromkeys(env_set, ""))
+
+
+def read_build_info(build_info_dir):
+    """Return the stripped `repository`, `CATEGORY` and `PF` file contents."""
+
+    def read(name):
+        with open(os.path.join(build_info_dir, name), "r") as f:
+            return f.read().strip()
+
+    return read("repository"), read("CATEGORY"), read("PF")
+
+
+def get_download_url(build_info_dir, ebuild):
+    """Return the download location of a package, or "" if none is found."""
+    repository, category, pf = read_build_info(build_info_dir)
+    if repository == "private-overlays":
+        return ""
+    # Presumably referenced by ebuild variables expanded in bash - TODO confirm.
+    os.environ.update(CROS_GIT_HOST_URL=CROS_GIT_HOST_URL, CROS_GIT_AOSP_URL=CROS_GIT_AOSP_URL)
+    p_name, gcs_names = parse_pkg_name(pf)
+    gob_res = search_pkg_from_gob(repository, category, p_name)
+    if gob_res:
+        return gob_res
+    env_set, cros_uri, gcs_names_ebuild = parse_ebuild(ebuild)
+    candidates = gcs_names.union(gcs_names_ebuild) - {""}
+    res = search_download_location(candidates, category, cros_uri)
+    unset_env(env_set)
+    return res
diff --git a/hooks/install/sbom_info_lib/go_dep.py b/hooks/install/sbom_info_lib/go_dep.py
new file mode 100644
index 0000000..da05c9c
--- /dev/null
+++ b/hooks/install/sbom_info_lib/go_dep.py
@@ -0,0 +1,70 @@
+# Copyright 2022 Google LLC
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# version 2 as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+
+# This script is used to find go dependencies
+# of a go package. It reads 'go.mod', 'vendor.mod'
+# or 'vendor.conf' in the source code.
+
+import os
+import re
+import subprocess
+import tarfile
+import requests
+
+# REGEX_GO_MOD_DEP finds
+# `require (
+#   go-pkg1 v1.2.3
+#   go-pkg2 v4.5.6 ...
+# )` in go.mod or other mod file.
+REGEX_GO_MOD_DEP = r"require \((.*?)\)"
+GO_MOD_DEP_FILE = ["go.mod", "vendor.mod", "vendor.conf"]
+
+
+def download_src_code(url, build_info_dir):
+    """Download the source archive of `url` into `build_info_dir`; return its path."""
+    filepath = os.path.join(build_info_dir, os.path.basename(url))
+    if url.startswith("gs://"):
+        subprocess.run(["gsutil", "cp", url, filepath])
+    else:
+        github = url.startswith("https://github.com")
+        url = url.replace("@", "/archive/") if github else url.replace("@", "/+archive/").replace("#", "/")
+        response = requests.get(f"{url}.tar.gz")
+        with open(filepath, "wb") as archive:
+            archive.write(response.content)
+    return filepath
+
+
+def get_go_dep(download_url, build_info_dir):
+    """Return the sorted, comma-joined Go deps found in the package sources."""
+    res = set()
+    for url in download_url.split(","):
+        # An empty URL (nothing was found) has no source to download.
+        if not url or url.endswith(".gn"):
+            continue
+        filepath = download_src_code(url, build_info_dir)
+        try:
+            with tarfile.open(filepath, "r:gz") as t:
+                for filename in t.getnames():
+                    # Dependencies are listed in go.mod, vendor.mod or vendor.conf.
+                    if os.path.basename(filename) not in GO_MOD_DEP_FILE:
+                        continue
+                    content = t.extractfile(filename).read().decode("utf-8")
+                    for req in re.findall(REGEX_GO_MOD_DEP, content, re.DOTALL):
+                        # Drop `//` comments; there is one dependency per line.
+                        res.update(d.split("//")[0].strip() for d in req.strip().split("\n"))
+        except Exception:
+            # Best effort: skip sources that are not readable gzipped tarballs.
+            print(f"{url} is not a .gz file.")
+        finally:
+            os.remove(filepath)
+    res.discard("")
+    return ",".join(sorted(res))
diff --git a/hooks/install/sbom_info_lib/licenses.py b/hooks/install/sbom_info_lib/licenses.py
new file mode 100644
index 0000000..cbf379d
--- /dev/null
+++ b/hooks/install/sbom_info_lib/licenses.py
@@ -0,0 +1,71 @@
+# Copyright 2022 Google LLC
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# version 2 as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+
+# This script is used to parse licenses of a package.
+
+import re
+import os
+
+def parse_gentoo_license(line):
+    """Convert the LICENSE string of an ebuild to an `AND`/`OR` expression.
+
+    Returns `(expression, set of license names)`, for example
+    `GPL-2 || ( MIT BSD )` gives `GPL-2 AND (MIT OR BSD)`.
+    """
+    license_set = set()
+    use_or = False
+    res = ""
+    # split() also copes with tabs, new lines and repeated spaces.
+    for e in line.split():
+        if e == "||":
+            use_or = True
+        elif e == "(":
+            res += "AND (" if res else "("
+        elif e == ")":
+            res = f"{res[:-1]}) "
+            use_or = False
+        else:
+            license_set.add(e)
+            if not res or res.endswith("("):
+                res += f"{e} "
+            else:
+                res += f"{'OR' if use_or else 'AND'} {e} "
+    return res.strip(), license_set
+
+
+def parse_license_yaml(yaml, res, license_set):
+    """Add licenses that are in license.yaml but not in LICENSE to `res`.
+
+    `license_set` is updated in place. `res` is returned unchanged when the
+    yaml text has no `license_names` entry.
+    """
+    lines = yaml.strip().split("\n")
+    try:
+        names_line = lines[lines.index("  - license_names") + 1]
+    except (ValueError, IndexError):
+        return res
+    for group in re.findall(r"\{(.*?)\}", names_line, re.DOTALL):
+        for part in group.split(","):
+            name = part.split(":")[0].strip()
+            if name and name not in license_set:
+                license_set.add(name)
+                res = f"{res} AND {name}" if res else name
+    return res
+
+
+def get_licenses(build_info_dir):
+    """Return the license expression of a package, or "" without a LICENSE file."""
+    license_file = os.path.join(build_info_dir, "LICENSE")
+    if not os.path.exists(license_file):
+        return ""
+    with open(license_file, "r") as f, open(os.path.join(build_info_dir, "license.yaml"), "r") as y:
+        return parse_license_yaml(y.read(), *parse_gentoo_license(f.read()))