#!/bin/bash
#
# Copyright 2020 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
# Utility to manage COS extensions.
set -o errexit
set -o pipefail
set -o nounset
PROG_NAME=$(basename "$0")
readonly PROG_NAME
readonly OS_RELEASE="/etc/os-release"
readonly EXTENSIONS_CACHE="/var/lib/cos-extensions"
readonly ARTIFACT_REGISTRY_REGIONS=("us" "eu" "asia")
DEFAULT_GPU_INSTALLER="gcr.io/cos-cloud/cos-gpu-installer:v2.4.0"
COS_GPU_INSTALLER="${COS_GPU_INSTALLER:-}"
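# The installer image can be overridden by setting COS_GPU_INSTALLER in the
# environment before invoking this script, e.g. (assuming the script is
# installed as cos-extensions; the tag is DEFAULT_GPU_INSTALLER above):
#   COS_GPU_INSTALLER=gcr.io/cos-cloud/cos-gpu-installer:v2.4.0 cos-extensions install gpu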
set_cos_gpu_installer() {
if [[ -n "${COS_GPU_INSTALLER}" ]]; then
return
fi
# Change gpu container image name if location of the vm is available
# and it's from cos-cloud project.
local instance_zone
instance_zone="$(curl -s "http://metadata.google.internal/computeMetadata/v1/instance/zone?alt=text" -H "Metadata-Flavor: Google" || true)"
if [[ -n "${instance_zone}" ]]; then
# Syntax of instance_zone: projects/<project-id>/zones/<region>
# Example: projects/475556798229/zones/us-central1-a
local instance_region
instance_region="$(echo "${instance_zone}"|cut -d/ -f4|cut -d- -f1)"
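# If the VM's region matches a mirrored Artifact Registry region, prefix it
# to the image, e.g. "us" -> "us.gcr.io/cos-cloud/cos-gpu-installer:v2.4.0".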
for region in "${ARTIFACT_REGISTRY_REGIONS[@]}"; do
if [[ "${region}" == "${instance_region}" ]]; then
DEFAULT_GPU_INSTALLER="${instance_region}.${DEFAULT_GPU_INSTALLER}"
break
fi
done
fi
COS_GPU_INSTALLER="${DEFAULT_GPU_INSTALLER}"
}
usage() {
cat <<EOF
${PROG_NAME}: Utility to manage COS extensions.
Usage:
${PROG_NAME} [OPTIONS] COMMAND [ARGS]...
Options:
-h, --help print help message
Commands:
list [-- --gpu-installer]              list all available COS extensions and
                                       their versions.
install <extension> [-- [options]]     install a COS extension. If no version
                                       is given via --version, the default
                                       version will be installed.
Additional Description:
${PROG_NAME} install gpu -- [options]
--version
The gpu extension can be invoked with --version. The possible values are
'latest', 'default', 'R<major-version>' (e.g. 'R470', 'R535'), or a
precise driver version retrievable by running ${PROG_NAME} list.
If unspecified, the default driver version is installed.
--force-fallback
This flag indicates whether to use the fallback mechanism when the
specified GPU driver is not compatible with the GPU device. If this flag
is not set, fallback does not apply to
--version=R<major-version> (e.g. 'R470', 'R525') or
--version=<precise-version> (e.g. '535.129.03', '525.147.05').
Fallback is active when --version is unset, or set to 'default' or 'latest'.
When fallback is active, the installer finds a compatible driver to
install for the GPU detected on the VM.
--target-gpu
This flag specifies the GPU device for driver installation.
If specified, it must be one of: NVIDIA_TESLA_K80, NVIDIA_TESLA_P4,
NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, NVIDIA_L4, NVIDIA_H100_80GB,
NVIDIA_TESLA_A100, NVIDIA_A100_80GB, NVIDIA_TESLA_T4.
If not specified, the GPU device will be auto-detected by the installer.
--prepare-build-tools
The gpu extension can be invoked with the optional --prepare-build-tools
argument to cache the toolchain used by the installer.
Caching the toolchain takes roughly 1GB of disk space on the
stateful partition.
This command only populates the cache and does NOT install the GPU
drivers; it may save time on downloading the toolchain during subsequent
installations.
--clean-build-tools
Use this optional argument to delete the toolchain cache stored on the
stateful partition.
-test
Pass this flag to allow the gpu extension to install drivers on a dev
channel image.
-no-verify
Skip kernel module loading, installation verification, and enabling of
NVIDIA persistence mode.
Useful for preloading drivers on a VM without an attached GPU.
-debug
Pass this flag to increase the logging verbosity of the gpu extension.
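Example invocations (illustrative, using only the options documented above):
${PROG_NAME} install gpu
${PROG_NAME} install gpu -- --version=latest
${PROG_NAME} install gpu -- --version=R535 --force-fallback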
${PROG_NAME} list -- [options]
--target-gpu
This flag specifies the GPU device whose compatible drivers should be listed.
If specified, it must be one of: NVIDIA_TESLA_K80, NVIDIA_TESLA_P4,
NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, NVIDIA_L4, NVIDIA_H100_80GB,
NVIDIA_TESLA_A100, NVIDIA_A100_80GB, NVIDIA_TESLA_T4.
If not specified, the GPU device will be auto-detected by the installer.
--gpu-proto-cache-dir
The directory in which the GPU driver versions proto file is cached.
If unspecified, the GPU driver versions proto file will not be cached.
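Example invocations (illustrative):
${PROG_NAME} list
${PROG_NAME} list -- --gpu-installer
${PROG_NAME} list -- --target-gpu=NVIDIA_L4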
EOF
exit "${1}"
}
parse_args() {
local args
if ! args=$(getopt --options "h" --longoptions "help" -- "$@"); then
usage 1
fi
eval set -- "${args}"
while true; do
case "$1" in
-h|--help)
usage 0
;;
--)
shift
break
;;
*)
usage 1
;;
esac
done
if [[ "$#" -eq 0 ]]; then
usage 1
fi
case "$1" in
list)
shift
list "$@"
;;
install)
if [[ "$#" -eq 2 ]]; then
install "$2"
elif [[ "$#" -ge 3 ]]; then
extension="$2"
shift 2
install "${extension}" "$@"
else
usage 1
fi
;;
*)
usage 1
;;
esac
}
list() {
local list_installer=false
local installer_args=()
for i in "$@"; do
if [[ ${i} == '--gpu-installer' ]]; then
list_installer=true
else
installer_args+=("${i}")
fi
done
if [[ "${list_installer}" = true ]]; then
echo "${DEFAULT_GPU_INSTALLER}"
else
# shellcheck disable=SC2154
printf "Available extensions for COS version %s-%s:\n\n" \
"${VERSION_ID}" "${BUILD_ID}"
echo "[gpu]"
echo "gpu installer: ${DEFAULT_GPU_INSTALLER}"
run_gpu_installer list "${installer_args[@]}" 2>/dev/null
fi
}
install() {
case "$1" in
gpu)
shift
run_gpu_installer install "-host-dir=/var/lib/nvidia" "$@"
echo "GPU drivers successfully installed."
if [[ "$@" =~ (--no-verify|-no-verify) ]]; then
echo "Skipping post_install due to no-verify flag."
else
post_install
fi
;;
*)
echo "Unsupported extension $1"
exit 1
;;
esac
}
post_install() {
echo "Making the GPU driver installation path executable by re-mounting it."
if ! sudo -n -- sh -c 'mount --bind /var/lib/nvidia /var/lib/nvidia; mount -o remount,exec /var/lib/nvidia'; then
cat << EOF
Automatic remount failed. Before deploying GPU workloads, please make the driver installation path executable by re-mounting it:
sudo mount --bind /var/lib/nvidia /var/lib/nvidia
sudo mount -o remount,exec /var/lib/nvidia
EOF
fi
echo "Enabling NVIDIA persistence mode."
# Enable persistence mode. Whenever the NVIDIA device resources are no longer
# in use, the NVIDIA kernel driver tears down the device state.
# When persistence mode is enabled, the daemon holds the NVIDIA character
# device files open, preventing the NVIDIA kernel driver from tearing down
# device state when no other process is using the device. The daemon does not
# actually use any device resources itself; it simply sleeps while
# maintaining a reference to the NVIDIA device state.
if ! sudo /var/lib/nvidia/bin/nvidia-persistenced; then
echo "Failed to run nvidia-persistenced daemon."
fi
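# Note: a typical manual sanity check after installation (not performed by
# this script) is to run the bundled nvidia-smi, assuming the installer
# staged it under /var/lib/nvidia/bin next to nvidia-persistenced:
#   sudo /var/lib/nvidia/bin/nvidia-smi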
}
check_arch() {
arch=$(uname -m)
if [[ ${arch} != "x86_64" && "${arch}" != "aarch64" ]]; then
echo "GPU installation is only supported on X86 and ARM64 for now.
Current architecture detected: ${arch}"
exit 1
fi
}
run_gpu_installer() {
check_arch
local use_build_cache=false
local clean_cache=false
local installer_args=()
for i in "$@"; do
if [[ ${i} == '--clean-build-tools' ]]; then
clean_cache=true
else
if [[ ${i} == '--prepare-build-tools' ]]; then
use_build_cache=true
fi
installer_args+=("${i}")
fi
done
local docker_args=(
--rm
--name="cos-gpu-installer"
--privileged
--net=host
--pid=host
--volume /dev:/dev
--volume /:/root
--log-driver journald
)
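# The resulting invocation (cache volume added only when used) is roughly:
#   docker run --rm --name=cos-gpu-installer --privileged --net=host --pid=host \
#     --volume /dev:/dev --volume /:/root --log-driver journald \
#     [--volume ${EXTENSIONS_CACHE}/:/build/] ${COS_GPU_INSTALLER} <installer args>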
if [[ "${clean_cache}" = true ]]; then
echo "Cleaning cache present at: ${EXTENSIONS_CACHE}"
rm -rf "${EXTENSIONS_CACHE}"
else
# Use the extensions cache (if it exists) by default.
if [[ "${use_build_cache}" = true || -d ${EXTENSIONS_CACHE} ]]; then
docker_args+=(--volume "${EXTENSIONS_CACHE}/:/build/")
fi
/usr/bin/docker run "${docker_args[@]}" "${COS_GPU_INSTALLER}" \
"${installer_args[@]}"
fi
}
main() {
# shellcheck source=/etc/os-release
source "${OS_RELEASE}"
set_cos_gpu_installer
parse_args "$@"
}
main "$@"