# Copyright 2020 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Module for collecting CVE data."""

import logging
import os
import string
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup

from cvelib import logutils

LOGGER = logutils.setuplogging(loglvl=logging.DEBUG, name='WebScraper')

CVE_URL = 'https://cve.mitre.org/cgi-bin/cvename.cgi'
KERNEL_ORG = 'git.kernel.org'
KERNEL_PATH = ['/cgit/linux/kernel/git/torvalds', '/pub/scm/linux/kernel/git/torvalds/']
GITHUB_COM = 'github.com'
GITHUB_PATH = '/torvalds/linux/'
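
# Illustrative commit-link shapes the constants above are meant to match
# (assumed typical forms, not an exhaustive list):
#   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=<sha>
#   https://github.com/torvalds/linux/commit/<sha>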


class WebScraperException(Exception):
    """Exception class for web scraper."""


def make_cve_request(cve_number):
    """Requests and validates the given CVE's page."""
    cve = {'name': cve_number}
    r = requests.get(CVE_URL, params=cve)
    if r.status_code != 200:
        raise WebScraperException('Status code is not 200 OK.')
    # Checks the page for an invalid CVE number. This is needed because the
    # page still exists even if the CVE number does not, so the request
    # returns a 200 status code and passes the first check.
    soup = BeautifulSoup(r.text, 'html.parser')
    tag = soup.find('div', attrs={'id': 'CenterPane'})
    for d in tag.descendants:
        if d.name == 'h2' and d.string and d.string.startswith('ERROR:'):
            raise WebScraperException('CVE number is invalid.')
    return r


def is_kernel_org(netloc, path):
    """Checks whether this is a useful git.kernel.org link."""
    if netloc != KERNEL_ORG:
        return False
    for link_path in KERNEL_PATH:
        if path.startswith(link_path):
            return True
    return False


def is_github_com(netloc, path):
    """Checks whether this is a useful github.com link."""
    return netloc == GITHUB_COM and path.startswith(GITHUB_PATH)


def find_cve_description(cve_html):
    """Returns the given CVE's description."""
    soup = BeautifulSoup(cve_html, 'html.parser')
    tag = soup.find('div', attrs={'id': 'GeneratedTable'})
    for t in tag.descendants:
        if t.name == 'th' and t.text == 'Description':
            description = t.parent.find_next_sibling().get_text()
            return description.replace('\n', '')


def find_commit_links(cve_html):
    """Returns commit links from the given CVE's webpage."""
    # TODO: Additional pattern to look for might be:
    # https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19076
    commits = []
    soup = BeautifulSoup(cve_html, 'html.parser')
    # Searches through anchor tags for recognized commit links.
    tag = soup.find('div', attrs={'id': 'GeneratedTable'})
    for anchor in tag.descendants:
        if anchor.name == 'a':
            link = anchor.get('href')
            parsed_link = urlparse(link)
            netloc, path = parsed_link.netloc, parsed_link.path
            if is_kernel_org(netloc, path) or is_github_com(netloc, path):
                commits.append(link)
    return commits


def is_valid(sha):
    """Returns True if sha is a non-empty hexadecimal string."""
    if not sha:
        return False
    return set(sha).issubset(string.hexdigits)
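
# For illustration: is_valid('deadbeef') is True, while is_valid('') and
# is_valid('g123') are False ('g' is not a hex digit).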


def find_sha_from_link(link):
    """Returns the sha, if it exists, based on the given link."""
    parsed_link = urlparse(link)
    netloc, path = parsed_link.netloc, parsed_link.path
    sha = None
    if is_kernel_org(netloc, path):
        # On git.kernel.org the sha is carried in the 'id' query parameter.
        try:
            sha = parse_qs(parsed_link.query)['id'][0]
        except KeyError:
            LOGGER.error(f'Sha not found in {link}')
    elif is_github_com(netloc, path):
        # On github.com the sha is the final component of the path.
        sha = os.path.basename(path)
    return sha if is_valid(sha) else None


def find_relevant_commits(cve_number):
    """Looks for the fix commit(s) for the given CVE."""
    commits = set()
    req = make_cve_request(cve_number)
    cve_description = find_cve_description(req.text)
    LOGGER.info(f'CVE Description: {cve_description}')
    commit_links = find_commit_links(req.text)
    # Collects fix commit sha(s) from the links.
    for link in commit_links:
        LOGGER.debug(f'Looking for sha in {link}')
        sha = find_sha_from_link(link)
        if sha:
            commits.add(sha)
    return commits
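

# Minimal usage sketch, assuming network access to cve.mitre.org. The CVE
# number below is the one mentioned in the TODO above and is purely
# illustrative.
if __name__ == '__main__':
    fix_shas = find_relevant_commits('CVE-2019-19076')
    LOGGER.info(f'Fix commit sha(s): {fix_shas}')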