blob: 35b9c1657c87107569673aa87836e9d63fd472c9 [file] [log] [blame]
# Copyright 2023 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Library to make common google storage operations more reliable."""
from typing import TYPE_CHECKING, Union
import urllib.parse
if TYPE_CHECKING:
import os
# Public path, only really works for files.
PUBLIC_BASE_HTTPS_URL = "https://storage.googleapis.com/"
# Private path for files.
PRIVATE_BASE_HTTPS_URL = "https://storage.cloud.google.com/"
# Private path for directories.
# TODO(akeshet): this is a workaround for b/27653354. If that is ultimately
# fixed, revisit this workaround.
PRIVATE_BASE_HTTPS_DOWNLOAD_URL = "https://stainless.corp.google.com/browse/"
BASE_GS_URL = "gs://"
def PathIsGs(path: Union[str, "os.PathLike[str]"]) -> bool:
"""Determine if |path| is a Google Storage URI.
We accept pathlib objects because our GS APIs handle local filesystem paths.
"""
return isinstance(path, str) and path.startswith(BASE_GS_URL)
def CanonicalizeURL(url: str, strict: bool = False) -> str:
"""Convert provided URL to gs:// URL, if it follows a known format.
Args:
url: URL to canonicalize.
strict: Raises exception if URL cannot be canonicalized.
"""
for prefix in (
PUBLIC_BASE_HTTPS_URL,
PRIVATE_BASE_HTTPS_URL,
PRIVATE_BASE_HTTPS_DOWNLOAD_URL,
"https://pantheon.corp.google.com/storage/browser/",
"https://commondatastorage.googleapis.com/",
):
if url.startswith(prefix):
return url.replace(prefix, BASE_GS_URL, 1)
if not PathIsGs(url) and strict:
raise ValueError("Url %r cannot be canonicalized." % url)
return url
def GetGsURL(
bucket: str, for_gsutil: bool = False, public: bool = True, suburl: str = ""
) -> str:
"""Construct a Google Storage URL
Args:
bucket: The Google Storage bucket to use
for_gsutil: Do you want a URL for passing to `gsutil`?
public: Do we want the public or private url
suburl: A url fragment to tack onto the end
Returns:
The fully constructed URL
"""
url = "gs://%s/%s" % (bucket, suburl)
if for_gsutil:
return url
else:
return GsUrlToHttp(url, public=public)
def GsUrlToHttp(path: str, public: bool = True, directory: bool = False) -> str:
"""Convert a GS URL to a HTTP URL for the same resource.
Because the HTTP Urls are not fixed (and may not always be simple prefix
replacements), use this method to centralize the conversion.
Directories need to have different URLs from files, because the Web UIs for
GS are weird and really inconsistent. Also, public directories probably
don't work, and probably never will (permissions as well as UI).
e.g. 'gs://chromeos-image-archive/path/file' ->
'https://pantheon/path/file'
Args:
path: GS URL to convert.
public: Is this URL for Googler access, or publicly visible?
directory: Force this URL to be treated as a directory?
We try to autodetect on False.
Returns:
https URL as a string.
"""
assert PathIsGs(path)
directory = directory or path.endswith("/")
# Public HTTP URls for directories don't work'
# assert not public or not directory,
if public:
return path.replace(BASE_GS_URL, PUBLIC_BASE_HTTPS_URL, 1)
else:
if directory:
return path.replace(BASE_GS_URL, PRIVATE_BASE_HTTPS_DOWNLOAD_URL, 1)
else:
return path.replace(BASE_GS_URL, PRIVATE_BASE_HTTPS_URL, 1)
def extract_gs_bucket(uri: str) -> str:
"""Extract the Google Storage bucket from an ambiguously formatted URI.
All of the following inputs should return the same output (my_bucket):
gs://my_bucket
gs://my_bucket
gs://my_bucket/my_resource.txt
my_bucket
my_bucket/my_resource.txt
Raises:
ValueError: The URI uses a scheme other than gs://, such as https://.
"""
parsed_uri = urllib.parse.urlparse(uri)
if parsed_uri.scheme == "gs":
return parsed_uri.netloc
if not parsed_uri.scheme:
return uri.split("/")[0]
raise ValueError(f"Unexpected scheme {parsed_uri.scheme} in URI {uri}.")