# Copyright 2020 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Module for collecting CVE data."""

import logging
import os
import string
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup

from cvelib import logutils

LOGGER = logutils.setuplogging(loglvl=logging.DEBUG, name='WebScraper')

CVE_URL = 'https://cve.mitre.org/cgi-bin/cvename.cgi'
KERNEL_ORG = 'git.kernel.org'
KERNEL_PATH = ['/cgit/linux/kernel/git/torvalds', '/pub/scm/linux/kernel/git/torvalds/']
GITHUB_COM = 'github.com'
GITHUB_PATH = '/torvalds/linux/'
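
# Illustrative commit-link shapes the constants above are meant to match
# (assumed typical forms, not an exhaustive list):
#   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=<sha>
#   https://github.com/torvalds/linux/commit/<sha>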


class WebScraperException(Exception):
    """Exception class for web scraper."""


def make_cve_request(cve_number):
    """Requests and validates the given CVE's page."""
    cve = {'name': cve_number}
    r = requests.get(CVE_URL, params=cve)
    if r.status_code != 200:
        raise WebScraperException('Status code is not 200 OK.')
    # Checks the page for an invalid CVE number. This is needed because the
    # page still exists even if the CVE number does not, so the request
    # returns a 200 status code and passes the first check.
    soup = BeautifulSoup(r.text, 'html.parser')
    tag = soup.find('div', attrs={'id': 'CenterPane'})
    for d in tag.descendants:
        if d.name == 'h2' and d.string and d.string.startswith('ERROR:'):
            raise WebScraperException('CVE number is invalid.')
    return r


def is_kernel_org(netloc, path):
    """Checks whether this is a useful git.kernel.org link."""
    if netloc != KERNEL_ORG:
        return False
    for link_path in KERNEL_PATH:
        if path.startswith(link_path):
            return True
    return False


def is_github_com(netloc, path):
    """Checks whether this is a useful github.com link."""
    return netloc == GITHUB_COM and path.startswith(GITHUB_PATH)


def find_cve_description(cve_html):
    """Returns the given CVE's description."""
    soup = BeautifulSoup(cve_html, 'html.parser')
    tag = soup.find('div', attrs={'id': 'GeneratedTable'})
    for t in tag.descendants:
        if t.name == 'th' and t.text == 'Description':
            description = t.parent.find_next_sibling().get_text()
            return description.replace('\n', '')


def find_commit_links(cve_html):
    """Returns commit links from the given CVE's webpage."""
    # TODO: Additional pattern to look for might be:
    # https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19076
    commits = []
    soup = BeautifulSoup(cve_html, 'html.parser')
    # Searches through anchor tags for recognized commit links.
    tag = soup.find('div', attrs={'id': 'GeneratedTable'})
    for anchor in tag.descendants:
        if anchor.name == 'a':
            link = anchor.get('href')
            parsed_link = urlparse(link)
            netloc, path = parsed_link.netloc, parsed_link.path
            if is_kernel_org(netloc, path) or is_github_com(netloc, path):
                commits.append(link)
    return commits


def is_valid(sha):
    """Returns True if sha is a non-empty hexadecimal string."""
    if not sha:
        return False
    return set(sha).issubset(string.hexdigits)
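
# For illustration: is_valid('deadbeef') is True, while is_valid('') and
# is_valid('g123') are False ('g' is not a hex digit).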


def find_sha_from_link(link):
    """Returns the sha, if it exists, based on the given link."""
    parsed_link = urlparse(link)
    netloc, path = parsed_link.netloc, parsed_link.path
    sha = None
    if is_kernel_org(netloc, path):
        # On git.kernel.org the sha is carried in the 'id' query parameter.
        try:
            sha = parse_qs(parsed_link.query)['id'][0]
        except KeyError:
            LOGGER.error(f'Sha not found in {link}')
    elif is_github_com(netloc, path):
        # On github.com the sha is the final component of the path.
        sha = os.path.basename(path)
    return sha if is_valid(sha) else None


def find_relevant_commits(cve_number):
    """Looks for the fix commit(s) for the given CVE."""
    commits = set()
    req = make_cve_request(cve_number)
    cve_description = find_cve_description(req.text)
    LOGGER.info(f'CVE Description: {cve_description}')
    commit_links = find_commit_links(req.text)
    # Collects fix commit sha(s) from the links.
    for link in commit_links:
        LOGGER.debug(f'Looking for sha in {link}')
        sha = find_sha_from_link(link)
        if sha:
            commits.add(sha)
    return commits
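

# Minimal usage sketch, assuming network access to cve.mitre.org. The CVE
# number below is the one mentioned in the TODO above and is purely
# illustrative.
if __name__ == '__main__':
    fix_shas = find_relevant_commits('CVE-2019-19076')
    LOGGER.info(f'Fix commit sha(s): {fix_shas}')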