# Copyright 2020 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Module for collecting CVE data."""
import logging
import os
import string
from urllib.parse import parse_qs
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from cvelib import logutils
import requests
LOGGER = logutils.setuplogging(loglvl=logging.DEBUG, name="WebScraper")
CVE_URL = "https://cve.mitre.org/cgi-bin/cvename.cgi"
KERNEL_ORG = "git.kernel.org"
KERNEL_PATH = [
"/cgit/linux/kernel/git/torvalds",
"/pub/scm/linux/kernel/git/torvalds/",
]
GITHUB_COM = "github.com"
GITHUB_PATH = "/torvalds/linux/"


class WebScraperException(Exception):
    """Exception class for the web scraper."""


def make_cve_request(cve_number):
    """Fetches and validates the CVE page for the given CVE number."""
    cve = {"name": cve_number}
    r = requests.get(CVE_URL, params=cve)
    if r.status_code != 200:
        raise WebScraperException("Status code is not 200 OK.")
    # Checks the page for an invalid CVE number. This is needed because the
    # page still exists even when the CVE number does not, so the request
    # returns a 200 status code and passes the first check.
    soup = BeautifulSoup(r.text, "html.parser")
    tag = soup.find("div", attrs={"id": "CenterPane"})
    for d in tag.descendants:
        if d.name == "h2" and d.string and d.string.startswith("ERROR:"):
            raise WebScraperException("CVE number is invalid.")
    return r


def is_kernel_org(netloc, path):
    """Checks if the link is a useful git.kernel.org link."""
    if netloc != KERNEL_ORG:
        return False
    for link_path in KERNEL_PATH:
        if path.startswith(link_path):
            return True
    return False


def is_github_com(netloc, path):
    """Checks if the link is a useful github.com link."""
    return netloc == GITHUB_COM and path.startswith(GITHUB_PATH)


def find_cve_description(cve_html):
    """Returns the given CVE's description."""
    soup = BeautifulSoup(cve_html, "html.parser")
    tag = soup.find("div", attrs={"id": "GeneratedTable"})
    for t in tag.descendants:
        if t.name == "th" and t.text == "Description":
            description = t.parent.find_next_sibling().get_text()
            return description.replace("\n", "")


def find_commit_links(cve_html):
    """Returns commit links from the given CVE's webpage."""
    # TODO: Additional pattern to look for might be:
    # https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19076
    commits = []
    soup = BeautifulSoup(cve_html, "html.parser")
    # Searches through link tags in the GeneratedTable div.
    tag = soup.find("div", attrs={"id": "GeneratedTable"})
    for link_tag in tag.descendants:
        if link_tag.name == "a":
            link = link_tag.get("href")
            parsed_link = urlparse(link)
            netloc, path = parsed_link.netloc, parsed_link.path
            if is_kernel_org(netloc, path) or is_github_com(netloc, path):
                commits.append(link)
    return commits


def is_valid(sha):
    """Returns True if sha is a non-empty hexadecimal string."""
    if not sha:
        return False
    return set(sha).issubset(string.hexdigits)


def find_sha_from_link(link):
    """Returns the sha, if it exists, based on the given link."""
    parsed_link = urlparse(link)
    netloc, path = parsed_link.netloc, parsed_link.path
    sha = None
    if is_kernel_org(netloc, path):
        try:
            sha = parse_qs(parsed_link.query)["id"][0]
        except KeyError:
            LOGGER.error(f"Sha not found in {link}")
    elif is_github_com(netloc, path):
        sha = os.path.basename(path)
    return sha if is_valid(sha) else None


def find_relevant_commits(cve_number):
    """Looks for the fix commit(s) for the given CVE."""
    commits = set()
    req = make_cve_request(cve_number)
    cve_description = find_cve_description(req.text)
    LOGGER.info(f"CVE Description: {cve_description}")
    commit_links = find_commit_links(req.text)
    # Collects fix commit sha(s) from links.
    for link in commit_links:
        LOGGER.debug(f"Looking for sha in {link}")
        sha = find_sha_from_link(link)
        if sha:
            commits.add(sha)
    return commits
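

if __name__ == "__main__":
    # Minimal usage sketch (illustrative; requires network access): print
    # the fix commit sha(s) found for the CVE referenced in the TODO above.
    for fix_sha in find_relevant_commits("CVE-2019-19076"):
        print(fix_sha)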