| # Copyright 2020 The ChromiumOS Authors |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Module for collecting CVE data.""" |
| |
| import logging |
| import os |
| import string |
| from urllib.parse import parse_qs |
| from urllib.parse import urlparse |
| |
| from bs4 import BeautifulSoup |
| from cvelib import logutils |
| import requests |
| |
| |
| LOGGER = logutils.setuplogging(loglvl=logging.DEBUG, name="WebScraper") |
| |
| CVE_URL = "https://cve.mitre.org/cgi-bin/cvename.cgi" |
| KERNEL_ORG = "git.kernel.org" |
| KERNEL_PATH = [ |
| "/cgit/linux/kernel/git/torvalds", |
| "/pub/scm/linux/kernel/git/torvalds/", |
| ] |
| GITHUB_COM = "github.com" |
| GITHUB_PATH = "/torvalds/linux/" |
| |
| |
| class WebScraperException(Exception): |
| """Exception class for web scraper.""" |
| |
| |
| def make_cve_request(cve_number): |
| """Generates CVE url.""" |
| cve = {"name": cve_number} |
| r = requests.get(CVE_URL, params=cve) |
| |
| if r.status_code != 200: |
| raise WebScraperException("Status code is not 200 OK.") |
| |
| # Checks page for an invalid CVE number. This is only done because the page |
| # is still existent even if the CVE number is not, therefore it returns a |
| # 200 status code and passes the first check. |
| soup = BeautifulSoup(r.text, "html.parser") |
| tag = soup.find("div", attrs={"id": "CenterPane"}) |
| |
| for d in tag.descendants: |
| if d.name == "h2" and d.string.startswith("ERROR:"): |
| raise WebScraperException("CVE number is invalid.") |
| |
| return r |
| |
| |
| def is_kernel_org(netloc, path): |
| """Check if is useful git.kernel.org link.""" |
| if netloc != KERNEL_ORG: |
| return False |
| |
| for link_path in KERNEL_PATH: |
| if path.startswith(link_path): |
| return True |
| |
| return False |
| |
| |
| def is_github_com(netloc, path): |
| """Check if is useful github.com link.""" |
| return netloc == GITHUB_COM and path.startswith(GITHUB_PATH) |
| |
| |
| def find_cve_description(cve_html): |
| """Returns given CVE's description.""" |
| soup = BeautifulSoup(cve_html, "html.parser") |
| |
| tag = soup.find("div", attrs={"id": "GeneratedTable"}) |
| |
| for t in tag.descendants: |
| if t.name == "th" and t.text == "Description": |
| description = t.parent.find_next_sibling().get_text() |
| |
| return description.replace("\n", "") |
| |
| |
| def find_commit_links(cve_html): |
| """Returns commit links from given CVE's webpage.""" |
| # TODO: Additional pattern to look for might be: |
| # https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19076 |
| commits = [] |
| soup = BeautifulSoup(cve_html, "html.parser") |
| |
| # Searches through link tags. |
| tag = soup.find("div", attrs={"id": "GeneratedTable"}) |
| for l in tag.descendants: |
| if l.name == "a": |
| link = l.get("href") |
| parsed_link = urlparse(link) |
| netloc, path = parsed_link.netloc, parsed_link.path |
| |
| if is_kernel_org(netloc, path): |
| commits.append(link) |
| |
| elif is_github_com(netloc, path): |
| commits.append(link) |
| |
| return commits |
| |
| |
| def is_valid(sha): |
| """Returns True if sha is a hexidecimal string.""" |
| if not sha: |
| return False |
| return set(sha).issubset(string.hexdigits) |
| |
| |
| def find_sha_from_link(link): |
| """Returns sha, if it exists, based on link given.""" |
| parsed_link = urlparse(link) |
| netloc, path = parsed_link.netloc, parsed_link.path |
| |
| sha = None |
| |
| if is_kernel_org(netloc, path): |
| try: |
| sha = parse_qs(parsed_link.query)["id"][0] |
| except KeyError: |
| LOGGER.error(f"Sha not found in {link}") |
| |
| elif is_github_com(netloc, path): |
| sha = os.path.basename(path) |
| |
| return sha if is_valid(sha) else None |
| |
| |
| def find_relevant_commits(cve_number): |
| """Looks for the fix commit(s) given the CVE.""" |
| commits = set() |
| |
| req = make_cve_request(cve_number) |
| |
| cve_description = find_cve_description(req.text) |
| LOGGER.info(f"CVE Description: {cve_description}") |
| |
| commit_links = find_commit_links(req.text) |
| |
| # Collects fix commit sha(s) from links. |
| for link in commit_links: |
| LOGGER.debug(f"Looking for sha in {link}") |
| |
| sha = find_sha_from_link(link) |
| if sha: |
| commits.add(sha) |
| |
| return commits |