blob: d7e5dc71d7afa3acb564e6f2a77da1ccada42748 [file] [log] [blame] [edit]
// Package gpu provides the tools needed to install a gpu extension.
package gpu
import (
"errors"
"fmt"
"io"
"net/http"
"os"
"slices"
"strings"
"cos-extensions/tools/osutils"
deviceInfo "cos.googlesource.com/cos/tools.git/src/cmd/cos_gpu_installer/deviceinfo"
"github.com/golang/glog"
)
// Fixed paths, lookup keys and endpoints used throughout this package.
const (
// osReleasePath is the file parsed by List to obtain the COS version and build.
osReleasePath = "/etc/os-release"
// versionID and buildID are the keys read from the parsed os-release data.
versionID = "VERSION_ID"
buildID = "BUILD_ID"
// apiDomain, artifactRegistryDomain and projectPrefix are the keys read from
// the parsed apiDomainsPath file when selecting the installer repository.
apiDomain = "API_DOMAIN"
artifactRegistryDomain = "ARTIFACT_REGISTRY_DOMAIN"
projectPrefix = "PROJECT_PREFIX"
// instanceURL is the metadata server endpoint queried for the instance zone.
instanceURL = "http://metadata.google.internal/computeMetadata/v1/instance/zone?alt=text"
// extensionCache is the host directory handed to runInstaller as its cache
// directory.
extensionCache = "/var/lib/cos-extensions"
// gcrPrefix is the part of DefaultInstaller that GetCosInstaller replaces with
// the in-universe repository when running in TPC.
gcrPrefix = "gcr.io/cos-cloud"
)
var (
// run and parse are package-level aliases for the osutils helpers.
// NOTE(review): presumably kept as variables so tests can swap them — confirm.
run = osutils.Run
parse = osutils.Parse
// artifactRegions lists the zone prefixes for which GetCosInstaller prepends
// "<prefix>." to the default installer image name.
artifactRegions = []string{"us", "eu", "asia"}
// apiDomainsPath is the optional file whose API_DOMAIN entry decides whether
// the TPC installer repository is used.
apiDomainsPath = "/etc/cloud-api-domains"
// tpcGPUInstallerRepo is the format string for the TPC installer repository;
// its two verbs are filled with the artifact registry domain and the project
// prefix (with any trailing ":" removed).
tpcGPUInstallerRepo = "docker.%s/%s/cos-cloud/cos-gpu-installer"
// DefaultInstaller is the gpu installer image used as the starting point by
// GetCosInstaller.
// Note: Please update the app-admin/extensions-manager ebuild file.
DefaultInstaller = "gcr.io/cos-cloud/cos-gpu-installer:v2.4.4"
)
// Installers holds the names of the installer images that may be used during
// program execution. runInstaller prefers SelectedInstaller when it is
// non-empty and otherwise falls back to DefaultInstaller.
type Installers struct {
// DefaultInstaller is the default gpu driver installer image
// (cos-gpu-installer).
DefaultInstaller string
// SelectedInstaller is the installer image taken from the COS_GPU_INSTALLER
// environment variable; it is empty when the variable is unset.
SelectedInstaller string
}
// writer is the destination for user-facing output produced by this package;
// it defaults to os.Stdout.
var writer io.Writer = os.Stdout
// List prints the available COS extensions and their versions to writer.
//
// The os-release file is always parsed first. When printInstaller is true only
// the default installer image name is printed. Otherwise a header with the COS
// version and build is printed, followed by the output of the installer's
// "list" command, which is invoked with installerArgs.
func List(installers Installers, printInstaller bool, installerArgs []string) error {
	release, parseErr := parse(osReleasePath)
	if parseErr != nil {
		return fmt.Errorf("couldn't parse os release file: %v", parseErr)
	}

	if printInstaller {
		// Output goes through writer (stdout by default) because logging is
		// directed to stderr.
		fmt.Fprintln(writer, installers.DefaultInstaller)
		return nil
	}

	fmt.Fprintf(writer, "Available extensions for COS version %s-%s:\n\n", release[versionID], release[buildID])
	fmt.Fprintln(writer, "[gpu]")
	fmt.Fprintf(writer, "gpu installer: %s\n", installers.DefaultInstaller)

	if listErr := runInstaller("list", true, extensionCache, installers, installerArgs...); listErr != nil {
		return fmt.Errorf("couldn't retrieve available GPU driver versions: %v", listErr)
	}
	return nil
}
// Install installs the GPU drivers as specified by installerArgs.
//
// The installer's "install" command is always invoked with
// -host-dir=/var/lib/nvidia followed by installerArgs. Unless -no-verify or
// --no-verify is present in installerArgs, the GPU type is probed and, for
// GB200/GB300, a module argument enabling IMEX channel creation is appended.
// After a successful install postInstall is run, again unless the no-verify
// flag was given. The caller's installerArgs slice is never modified.
//
// It returns an error only when the installer run fails.
func Install(installers Installers, installerArgs []string) error {
	// Either spelling of the no-verify flag skips GPU probing and postInstall.
	skipVerify := slices.Contains(installerArgs, "-no-verify") || slices.Contains(installerArgs, "--no-verify")

	// Assemble the final argument list in a fresh slice: appending to
	// installerArgs directly could write into spare capacity of the caller's
	// backing array.
	args := make([]string, 0, len(installerArgs)+3)
	args = append(args, "-host-dir=/var/lib/nvidia")
	args = append(args, installerArgs...)

	if !skipVerify {
		gpuType, err := deviceInfo.GetGPUTypeInfo("/sys/bus/pci/devices")
		if err != nil {
			// GPU detection is best effort; a failure must not block the
			// installation, but it should not go unnoticed either.
			glog.Warningf("couldn't determine GPU type: %v", err)
		}
		if gpuType == deviceInfo.GB200 || gpuType == deviceInfo.GB300 {
			glog.Infof("%s GPU detected. Enabling IMEX channel creation via NVreg_CreateImexChannel0=1", gpuType)
			args = append(args, "-module-arg", "nvidia.NVreg_CreateImexChannel0=1")
		}
	}

	if err := runInstaller("install", false, extensionCache, installers, args...); err != nil {
		return fmt.Errorf("could not install GPU drivers: %v", err)
	}
	glog.Info("GPU drivers successfully installed")

	if skipVerify {
		fmt.Fprintln(writer, "Skipping post_install due to no-verify flag.")
		return nil
	}
	postInstall()
	return nil
}
// postInstall performs the best-effort steps that follow a driver install:
// it re-mounts /var/lib/nvidia so that it is executable and then starts the
// nvidia-persistenced daemon. Failures are logged and never returned.
func postInstall() {
	const (
		remountCmd  = `sudo -n -- sh -c "mount --bind /var/lib/nvidia /var/lib/nvidia; mount -o remount,exec /var/lib/nvidia"`
		remountHelp = `Before deploying GPU workloads - please make the driver installation path executable by re-mounting it using:
sudo mount --bind /var/lib/nvidia /var/lib/nvidia
sudo mount -o remount,exec /var/lib/nvidia`
		persistencedCmd = `sudo /var/lib/nvidia/bin/nvidia-persistenced`
	)

	glog.Info("Making the GPU driver installation path executable by re-mounting it.")
	if _, remountErr := run("/bin/sh", []string{"-c", remountCmd}, false); remountErr != nil {
		// Tell the user how to perform the remount by hand.
		glog.Errorf("Automatic remount failed: %v\n%v\n", remountHelp, remountErr)
	}

	glog.Info("Enabling NVIDIA persistence mode.")
	// Without persistence mode the NVIDIA kernel driver tears down the device
	// state whenever the device resources are no longer in use. With it, the
	// daemon holds the NVIDIA character device files open, which prevents that
	// teardown while no other process is using the device. The daemon does not
	// use any device resources itself; it simply sleeps while maintaining a
	// reference to the NVIDIA device state.
	if _, daemonErr := run("/bin/sh", []string{"-c", persistencedCmd}, false); daemonErr != nil {
		glog.Errorf("Failed to run nvidia-persistenced daemon.\n%v\n", daemonErr)
	}
}
// Client is the minimal HTTP client interface needed by this package: a single
// Do method with the same signature as (*http.Client).Do. It is what
// GetCosInstaller uses to query the metadata server.
type Client interface {
// Do sends the request r and returns the response or an error.
Do(r *http.Request) (*http.Response, error)
}
// retrieveInstanceZone issues a GET request for url through c, with the
// "Metadata-Flavor: Google" header set, and derives a location prefix from the
// response body: the text after the last "/" and before the first "-" that
// follows it (for example "us" for "projects/475556798229/zones/us-central1-a").
//
// An error is returned when the request cannot be built or sent, or when the
// response body cannot be read.
func retrieveInstanceZone(c Client, url string) (string, error) {
	request, buildErr := http.NewRequest(http.MethodGet, url, nil)
	if buildErr != nil {
		return "", fmt.Errorf("failed to create request to retrieve instance zone: %v", buildErr)
	}
	request.Header.Set("Metadata-Flavor", "Google")

	response, sendErr := c.Do(request)
	if sendErr != nil {
		return "", fmt.Errorf("failed to retrieve instance zone: %v", sendErr)
	}
	defer response.Body.Close()

	payload, readErr := io.ReadAll(response.Body)
	if readErr != nil {
		return "", fmt.Errorf("failed to read response body: %v", readErr)
	}

	// Response syntax: projects/<project-id>/zones/<zone>
	// Example:         projects/475556798229/zones/us-central1-a
	zonePath := string(payload)
	zoneName := zonePath[strings.LastIndex(zonePath, "/")+1:]
	prefix, _, _ := strings.Cut(zoneName, "-")
	return prefix, nil
}
// checkArch runs "uname -m" and returns an error unless the reported machine
// architecture contains "x86_64" or "aarch64". A failure to run the command is
// reported as an error as well.
func checkArch() error {
	machine, unameErr := run("uname", []string{"-m"}, false)
	if unameErr != nil {
		return fmt.Errorf("failed to retrieve machine architecture: %v", unameErr)
	}

	switch {
	case strings.Contains(machine, "x86_64"), strings.Contains(machine, "aarch64"):
		return nil
	default:
		return fmt.Errorf("GPU installation is only supported on X86 and ARM64 for now.\n Current architecture detected: %s", machine)
	}
}
// GetCosInstaller determines the installer images to be used by the program's
// commands.
//
// The default installer starts as DefaultInstaller. If apiDomainsPath exists
// and its API_DOMAIN entry is not "googleapis.com" (a TPC environment), the
// gcr.io prefix is replaced by the in-universe repository built from
// tpcGPUInstallerRepo. Then:
//
//   - if COS_GPU_INSTALLER is set, it is returned as SelectedInstaller
//     alongside the default installer;
//   - otherwise the instance zone is queried through c; when its prefix is one
//     of artifactRegions the default installer is prefixed with "<prefix>.";
//   - if the zone cannot be retrieved, the unmodified default installer is
//     used.
//
// An error is returned only when apiDomainsPath exists but cannot be stat'ed
// or parsed.
func GetCosInstaller(c Client) (Installers, error) {
	defaultInstaller := DefaultInstaller

	// When running in a TPC environment the installer has to be fetched from
	// an internal repository. A missing apiDomainsPath simply means the
	// default applies.
	if _, statErr := os.Stat(apiDomainsPath); statErr == nil {
		domains, err := parse(apiDomainsPath)
		if err != nil {
			return Installers{}, fmt.Errorf("failed to parse %q, err: %w", apiDomainsPath, err)
		}
		if domains[apiDomain] != "googleapis.com" {
			// We are running in TPC, set default installer to the in-universe
			// repo.
			universeRepo := fmt.Sprintf(tpcGPUInstallerRepo, domains[artifactRegistryDomain], strings.TrimSuffix(domains[projectPrefix], ":"))
			defaultInstaller = strings.ReplaceAll(defaultInstaller, gcrPrefix, universeRepo)
			glog.Infof("Using TPC Default installer: %q", defaultInstaller)
		}
	} else if !errors.Is(statErr, os.ErrNotExist) {
		return Installers{}, fmt.Errorf("failed to stat %q, err: %w", apiDomainsPath, statErr)
	}

	// An installer given through the environment takes precedence; no zone
	// lookup is needed in that case.
	if installer := os.Getenv("COS_GPU_INSTALLER"); installer != "" {
		return Installers{
			DefaultInstaller:  defaultInstaller,
			SelectedInstaller: installer,
		}, nil
	}

	// Change the gpu installer container image name if the location of the VM
	// is available and it maps to one of the known artifact regions.
	zone, err := retrieveInstanceZone(c, instanceURL)
	if err != nil {
		// It's OK if the metadata server returns an error: fall back to the
		// non-regional default installer rather than returning an empty
		// image name that no later docker invocation could use.
		glog.Errorf("couldn't retrieve instance zone: %v", err)
		return Installers{DefaultInstaller: defaultInstaller}, nil
	}
	if zone != "" && slices.Contains(artifactRegions, zone) {
		return Installers{
			DefaultInstaller: fmt.Sprintf("%s.%s", zone, defaultInstaller),
		}, nil
	}

	installers := Installers{DefaultInstaller: defaultInstaller}
	glog.Infof("Using installers: %v", installers)
	return installers, nil
}
// runInstaller runs the gpu installer container with the given cmd and
// installerArgs.
//
// After checkArch passes, the image is installers.SelectedInstaller when set
// and installers.DefaultInstaller otherwise. If installerArgs contains
// --clean-build-tools, cacheDir is removed and nothing else happens.
// Otherwise docker is invoked; cacheDir is mounted at /build/ when
// --prepare-build-tools is given or when stat'ing cacheDir does not report
// "not exist". The command output is written to writer. A docker failure is
// returned only when hideStderr is false.
func runInstaller(cmd string, hideStderr bool, cacheDir string, installers Installers, installerArgs ...string) error {
	if archErr := checkArch(); archErr != nil {
		return archErr
	}

	// The explicitly selected installer wins over the default one.
	image := installers.DefaultInstaller
	if installers.SelectedInstaller != "" {
		image = installers.SelectedInstaller
	}

	if slices.Contains(installerArgs, "--clean-build-tools") {
		glog.Infof("Cleaning cache present at: %s", cacheDir)
		if rmErr := os.RemoveAll(cacheDir); rmErr != nil {
			return fmt.Errorf("failed to clean cache present at %s: %v", cacheDir, rmErr)
		}
		return nil
	}

	dockerArgs := []string{
		"run",
		"--rm",
		"--name=cos-gpu-installer",
		"--privileged",
		"--net=host",
		"--pid=host",
		"--volume", "/dev:/dev",
		"--volume", "/:/root",
		"--log-driver", "journald",
	}

	// Use the extension cache by default if it exists, or if the build flag
	// was passed in.
	_, statErr := os.Stat(cacheDir)
	if !os.IsNotExist(statErr) || slices.Contains(installerArgs, "--prepare-build-tools") {
		dockerArgs = append(dockerArgs, "--volume", cacheDir+"/:/build/")
	}

	dockerArgs = append(dockerArgs, image, cmd)
	dockerArgs = append(dockerArgs, installerArgs...)

	output, dockerErr := run("/usr/bin/docker", dockerArgs, hideStderr)
	if dockerErr != nil && !hideStderr {
		return fmt.Errorf("failed to complete installation using installer '%s': %v", image, dockerErr)
	}
	fmt.Fprint(writer, output)
	return nil
}