#!/usr/bin/env python3
# Copyright 2023 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import os
import re
import sys
from typing import Optional, List
from urllib.parse import urlparse, urlunparse
from itertools import filterfalse
# Absolute path of the directory that contains this file.
_THIS_DIR = os.path.abspath(os.path.dirname(__file__))
# The repo's root directory: three levels above this file's directory.
_ROOT_DIR = os.path.abspath(
    os.path.join(_THIS_DIR, os.pardir, os.pardir, os.pardir))
# Put the repo root first on the import path so the `metadata.*` imports
# below can be written as absolute imports.
sys.path.insert(0, _ROOT_DIR)
import metadata.fields.field_types as field_types
import metadata.fields.util as util
import metadata.validation_result as vr
# Matches a URL field value stating that this repository is itself the
# canonical source, e.g. "This is the canonical repo." or
# "This is the canonical public repository" (case-insensitive, trailing
# period optional).
_PATTERN_URL_CANONICAL_REPO = re.compile(
    r"^This is the canonical (public )?repo(sitory)?\.?$",
    flags=re.IGNORECASE,
)
# URL schemes (protocols) accepted in the URL field.
_SUPPORTED_SCHEMES = {"http", "https", "git", "ftp"}
# URLs can't contain whitespace, so whitespace is treated as a delimiter in
# addition to the field's value delimiter. This handles URL fields that list
# one URL per line without the delimiter between them.
# The value delimiter is escaped so that it is always matched literally,
# even if it is (or is later changed to) a regex metacharacter.
_PATTERN_URL_DELIMITER = re.compile(r"\s+|{}".format(
    re.escape(field_types.MetadataField.VALUE_DELIMITER)))
def _split_urls(value: str) -> List[str]:
    """Break a URL field value into its individual, non-empty URLs.

    The value is split on `_PATTERN_URL_DELIMITER`; each piece is stripped
    of surrounding whitespace and empty pieces are discarded.
    """
    pieces = map(str.strip, _PATTERN_URL_DELIMITER.split(value))
    return [piece for piece in pieces if piece]
def _url_canonicalize(url: str) -> str:
    """Return `url` in canonical form.

    The URL is parsed into its components and reassembled, which
    normalizes it (e.g. the scheme is made lower case).
    """
    components = urlparse(url)
    return urlunparse(components)
def _url_is_canonical(url: str) -> bool:
    """Return True when `url` is already identical to its canonical form."""
    canonical = _url_canonicalize(url)
    return canonical == url
def _url_is_valid(url: str) -> bool:
    """Checks whether the given `url` is acceptable:
    * url can be parsed without an error.
    * url uses a supported scheme / protocol.

    Returns:
        True if both conditions hold, False otherwise.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse() reports malformed input (e.g. an unbalanced IPv6
        # bracket in the host) with ValueError. Catch only that, so that
        # KeyboardInterrupt, SystemExit and genuine bugs are not swallowed.
        return False
    return parsed.scheme in _SUPPORTED_SCHEMES
class URLField(field_types.MetadataField):
    """Custom field for the package URL(s).

    The value is either a note stating that this repository is the
    canonical one, or one or more URLs separated by `VALUE_DELIMITER`
    and/or whitespace.
    """

    def __init__(self):
        super().__init__(name="URL")

    def repo_is_canonical(self, value: str):
        """Returns if `value` indicates this repository is the canonical
        repository.

        Surrounding whitespace in `value` is ignored.
        """
        return util.matches(_PATTERN_URL_CANONICAL_REPO, value.strip())

    def validate(self, value: str) -> Optional[vr.ValidationResult]:
        """Checks the given value has acceptable URL values only.

        Note: this field supports multiple values.

        Returns:
            None if `value` is the canonical-repo note, or if every URL in
            it is valid and canonical.
            A ValidationError if no URL is given or any URL is invalid.
            A ValidationWarning if all URLs are valid but at least one is
            not in canonical form.
        """
        if self.repo_is_canonical(value):
            return None

        urls = _split_urls(value)
        if not urls:
            return vr.ValidationError(reason=f"{self._name} must be provided.")

        invalid_values = [url for url in urls if not _url_is_valid(url)]
        if invalid_values:
            return vr.ValidationError(
                reason=f"{self._name} is invalid.",
                additional=[
                    "URLs must use a protocol scheme in "
                    "[http, https, ftp, git].",
                    f"Separate URLs using a '{self.VALUE_DELIMITER}'.",
                    f"Invalid values: {util.quoted(invalid_values)}.",
                ])

        non_canon_values = [url for url in urls if not _url_is_canonical(url)]
        if non_canon_values:
            canon_values = [_url_canonicalize(url) for url in non_canon_values]
            return vr.ValidationWarning(
                reason=f"{self._name} contains non-canonical URLs.",
                additional=[
                    # Each entry is a separate message; the comma after the
                    # first one is required, otherwise Python concatenates
                    # the two adjacent string literals into one.
                    "URLs should be canonical and well-formed.",
                    f"Non canonical values: {util.quoted(non_canon_values)}.",
                    f"Canonicalized URLs should be: {util.quoted(canon_values)}.",
                ])

        return None

    def narrow_type(self, value) -> Optional[List[str]]:
        """Returns the valid URLs found in `value`, in canonical form.

        Returns:
            None if `value` is empty or is the canonical-repo note.
            Otherwise a list of canonicalized URLs; invalid URLs are
            dropped, so the list may be empty.
        """
        if not value:
            return None

        if self.repo_is_canonical(value):
            return None

        # Filter out invalid URLs, and canonicalize the URLs.
        return [
            _url_canonicalize(url) for url in _split_urls(value)
            if _url_is_valid(url)
        ]