| #!/bin/bash |
| # |
| # Copyright 2020 The Chromium OS Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| # |
| # Utility to manage COS extensions. |
| |
| set -o errexit |
| set -o pipefail |
| set -o nounset |
| |
| PROG_NAME=$(basename "$0") |
| readonly PROG_NAME |
| readonly OS_RELEASE="/etc/os-release" |
| readonly EXTENSIONS_CACHE="/var/lib/cos-extensions" |
| readonly ARTIFACT_REGISTRY_REGIONS=("us" "eu" "asia") |
| DEFAULT_GPU_INSTALLER="gcr.io/cos-cloud/cos-gpu-installer:v2.4.0" |
| |
| COS_GPU_INSTALLER="${COS_GPU_INSTALLER:-}" |
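
# The installer image can be overridden by exporting COS_GPU_INSTALLER before
# running this script, in which case set_cos_gpu_installer() leaves it
# untouched. Illustrative example (assuming the script is installed as
# "cos-extensions"; the tag here just mirrors the default above):
#   COS_GPU_INSTALLER=gcr.io/cos-cloud/cos-gpu-installer:v2.4.0 cos-extensions install gpu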
| |
| set_cos_gpu_installer() { |
| if [[ -n "${COS_GPU_INSTALLER}" ]]; then |
| return |
| fi |
| |
  # Switch to a regional mirror of the default cos-cloud installer image when
  # the VM's location can be read from the metadata server. A user-supplied
  # COS_GPU_INSTALLER (handled above) is never rewritten.
| local instance_zone |
| instance_zone="$(curl -s "http://metadata.google.internal/computeMetadata/v1/instance/zone?alt=text" -H "Metadata-Flavor: Google" || true)" |
| if [[ -n "${instance_zone}" ]]; then |
    # Syntax of instance_zone: projects/<project-number>/zones/<zone>
    # Example: projects/475556798229/zones/us-central1-a
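    # A region match below simply prefixes the default image with that
    # multi-region, e.g. the example zone above would yield
    # us.gcr.io/cos-cloud/cos-gpu-installer:<tag>.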
| local instance_region |
    instance_region="$(echo "${instance_zone}" | cut -d/ -f4 | cut -d- -f1)"
| for region in "${ARTIFACT_REGISTRY_REGIONS[@]}"; do |
| if [[ "${region}" == "${instance_region}" ]]; then |
| DEFAULT_GPU_INSTALLER="${instance_region}.${DEFAULT_GPU_INSTALLER}" |
| break |
| fi |
| done |
| fi |
| COS_GPU_INSTALLER="${DEFAULT_GPU_INSTALLER}" |
| } |
| |
| usage() { |
| cat <<EOF |
| |
| ${PROG_NAME}: Utility to manage COS extensions. |
| |
| Usage: |
| ${PROG_NAME} [OPTIONS] COMMAND [ARGS]... |
| |
| Options: |
| -h, --help print help message |
| |
| Commands: |
| list [-- --gpu-installer] list all available COS extensions and |
| their versions. |
| |
  install <extension> [-- [options]]   install a COS extension. If no version
                                       is given, the default version will be
                                       installed (see Additional Description
                                       below for the supported options).
| |
| Additional Description: |
| ${PROG_NAME} install gpu -- [options] |
| |
| --version |
    The gpu extension can be invoked with --version to select the driver
    version. The possible values are 'latest', 'default',
    'R<major-version>' (e.g. 'R470', 'R535'), or a precise driver version
    as listed by '${PROG_NAME} list'.
    If unspecified, the default driver version is installed.
| |
| --force-fallback |
    This flag controls whether the installer falls back to a compatible
    driver when the specified GPU driver is not compatible with the GPU
    device. If unspecified, fallback is not applied for
    --version=R<major-version> (e.g. 'R470', 'R525') or
    --version=<precise-version> (e.g. '535.129.03', '525.147.05');
    it is active when --version is unset, --version=default, or
    --version=latest.
    When fallback is active, the installer finds a compatible driver to
    install for the GPU detected on the VM.
| |
| --target-gpu |
| This flag specifies the GPU device for driver installation. |
| If specified, it must be one of: NVIDIA_TESLA_K80, NVIDIA_TESLA_P4, |
| NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, NVIDIA_L4, NVIDIA_H100_80GB, |
| NVIDIA_TESLA_A100, NVIDIA_A100_80GB, NVIDIA_TESLA_T4. |
| If not specified, the GPU device will be auto-detected by the installer. |
| |
| --prepare-build-tools |
    The gpu extension can be invoked with the optional --prepare-build-tools
    argument to cache the toolchain used by the installer. Caching the
    toolchain carries an overhead of roughly 1GB of disk space on the
    stateful partition.
    This command only populates the cache and does NOT install the GPU
    drivers; it can save time downloading the toolchain during subsequent
    installations.
| |
  --clean-build-tools
    Use this optional command to delete the toolchain cache present on the
    stateful partition.
| |
| -test |
    Allows the gpu extension to install drivers on a dev channel image.
| |
| -no-verify |
    Skip loading the kernel module, verifying the installation, and enabling
    NVIDIA persistence mode. Useful for preloading drivers on a VM without an
    attached GPU.
| |
| -debug |
    Increases the logging verbosity of the gpu extension.
| |
| ${PROG_NAME} list -- [options] |
| |
| --target-gpu |
    This flag specifies the GPU device whose compatible drivers will be listed.
| If specified, it must be one of: NVIDIA_TESLA_K80, NVIDIA_TESLA_P4, |
| NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, NVIDIA_L4, NVIDIA_H100_80GB, |
| NVIDIA_TESLA_A100, NVIDIA_A100_80GB, NVIDIA_TESLA_T4. |
| If not specified, the GPU device will be auto-detected by the installer. |
| |
| --gpu-proto-cache-dir |
    The directory in which the GPU driver versions proto file is cached.
    If unspecified, the GPU driver versions proto file will not be cached.
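
Examples:
  The invocations below are illustrative; the driver versions shown are
  placeholders, so use '${PROG_NAME} list' to see the versions available on
  this image.

  ${PROG_NAME} list
  ${PROG_NAME} list -- --gpu-installer
  ${PROG_NAME} install gpu
  ${PROG_NAME} install gpu -- --version=latest
  ${PROG_NAME} install gpu -- --version=R535 --force-fallback
  ${PROG_NAME} install gpu -- --prepare-build-tools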
| EOF |
| exit "${1}" |
| } |
| |
| parse_args() { |
| local args |
| if ! args=$(getopt --options "h" --longoptions "help" -- "$@"); then |
| usage 1 |
| fi |
| |
| eval set -- "${args}" |
| while true; do |
| case "$1" in |
| -h|--help) |
| usage 0 |
| ;; |
| --) |
| shift |
| break |
| ;; |
| *) |
| usage 1 |
| ;; |
| esac |
| done |
| |
| if [[ "$#" -eq 0 ]]; then |
| usage 1 |
| fi |
| |
| case "$1" in |
| list) |
| shift |
| list "$@" |
| ;; |
| install) |
| if [[ "$#" -eq 2 ]]; then |
| install "$2" |
| elif [[ "$#" -ge 3 ]]; then |
| extension="$2" |
| shift 2 |
| install "${extension}" "$@" |
| else |
| usage 1 |
| fi |
| ;; |
| *) |
| usage 1 |
| ;; |
| esac |
| } |
| |
| list() { |
  local list_installer=false
  local installer_args=()
| for i in "$@"; do |
| if [[ ${i} == '--gpu-installer' ]]; then |
| list_installer=true |
| else |
| installer_args+=("${i}") |
| fi |
| done |
| |
| if [[ "${list_installer}" = true ]]; then |
| echo "${DEFAULT_GPU_INSTALLER}" |
| else |
| # shellcheck disable=SC2154 |
| printf "Available extensions for COS version %s-%s:\n\n" \ |
| "${VERSION_ID}" "${BUILD_ID}" |
| echo "[gpu]" |
| echo "gpu installer: ${DEFAULT_GPU_INSTALLER}" |
| run_gpu_installer list "${installer_args[@]}" 2>/dev/null |
| fi |
| } |
| |
| install() { |
| case "$1" in |
| gpu) |
| shift |
| run_gpu_installer install "-host-dir=/var/lib/nvidia" "$@" |
| echo "GPU drivers successfully installed." |
| if [[ "$@" =~ (--no-verify|-no-verify) ]]; then |
| echo "Skipping post_install due to no-verify flag." |
| else |
| post_install |
| fi |
| ;; |
| *) |
| echo "Unsupported extension $1" |
| exit 1 |
| ;; |
| esac |
| } |
| |
| post_install() { |
| echo "Making the GPU driver installation path executable by re-mounting it." |
| if ! sudo -n -- sh -c 'mount --bind /var/lib/nvidia /var/lib/nvidia; mount -o remount,exec /var/lib/nvidia'; then |
| cat << EOF |
| Automatic remount failed: Before deploying GPU workloads - please make the driver installation path executable by re-mounting it using: |
| sudo mount --bind /var/lib/nvidia /var/lib/nvidia |
| sudo mount -o remount,exec /var/lib/nvidia |
| EOF |
| fi |
| |
| echo "Enabling NVIDIA persistence mode." |
  # Enable persistence mode - whenever the NVIDIA device resources are no
  # longer in use, the NVIDIA kernel driver tears down the device state.
  # When persistence mode is enabled, the nvidia-persistenced daemon holds the
  # NVIDIA character device files open, preventing the kernel driver from
  # tearing down device state while no other process is using the device.
  # The daemon does not use any device resources itself - it simply sleeps
  # while maintaining a reference to the NVIDIA device state.
| if ! sudo /var/lib/nvidia/bin/nvidia-persistenced; then |
| echo "Failed to run nvidia-persistenced daemon." |
| fi |
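
  # Note: after a successful installation and remount, a quick manual sanity
  # check is typically running the nvidia-smi binary bundled by the installer
  # (path assumed from the -host-dir used above):
  #   /var/lib/nvidia/bin/nvidia-smi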
| } |
| |
| check_arch() { |
| arch=$(uname -m) |
| if [[ ${arch} != "x86_64" && "${arch}" != "aarch64" ]]; then |
| echo "GPU installation is only supported on X86 and ARM64 for now. |
| Current architecture detected: ${arch}" |
| exit 1 |
| fi |
| } |
| |
| run_gpu_installer() { |
| check_arch |
| local use_build_cache=false |
| local clean_cache=false |
| |
| local installer_args=() |
| for i in "$@"; do |
| if [[ ${i} == '--clean-build-tools' ]]; then |
| clean_cache=true |
| else |
| if [[ ${i} == '--prepare-build-tools' ]]; then |
| use_build_cache=true |
| fi |
| installer_args+=("${i}") |
| fi |
| done |
| |
| local docker_args=( |
| --rm |
| --name="cos-gpu-installer" |
| --privileged |
| --net=host |
| --pid=host |
| --volume /dev:/dev |
| --volume /:/root |
| --log-driver journald |
| ) |
| |
| if [[ "${clean_cache}" = true ]]; then |
| echo "Cleaning cache present at: ${EXTENSIONS_CACHE}" |
| rm -rf "${EXTENSIONS_CACHE}" |
| else |
    # Use the extensions cache (if it exists) by default.
| if [[ "${use_build_cache}" = true || -d ${EXTENSIONS_CACHE} ]]; then |
| docker_args+=(--volume "${EXTENSIONS_CACHE}/:/build/") |
| fi |
| |
    /usr/bin/docker run "${docker_args[@]}" "${COS_GPU_INSTALLER}" \
| "${installer_args[@]}" |
| fi |
| } |
| |
| main() { |
| # shellcheck source=/etc/os-release |
| source "${OS_RELEASE}" |
| set_cos_gpu_installer |
| parse_args "$@" |
| } |
| |
| main "$@" |