// Package gpu provides the tools needed to install a GPU extension.
| package gpu |
| |
| import ( |
| "errors" |
| "fmt" |
| "io" |
| "net/http" |
| "os" |
| "slices" |
| "strings" |
| |
| "cos-extensions/tools/osutils" |
| |
| deviceInfo "cos.googlesource.com/cos/tools.git/src/cmd/cos_gpu_installer/deviceinfo" |
| |
| "github.com/golang/glog" |
| ) |
| |
// Constants for OS metadata lookup, the GCE metadata server, and the
// container registry locations used to fetch the GPU installer image.
const (
	// osReleasePath is the file parsed for the COS version and build IDs.
	osReleasePath = "/etc/os-release"
	// versionID and buildID are keys expected in the parsed os-release file.
	versionID = "VERSION_ID"
	buildID   = "BUILD_ID"
	// Keys expected in the cloud-api-domains file (present in TPC environments).
	apiDomain              = "API_DOMAIN"
	artifactRegistryDomain = "ARTIFACT_REGISTRY_DOMAIN"
	projectPrefix          = "PROJECT_PREFIX"
	// instanceURL is the GCE metadata endpoint that reports the instance zone.
	instanceURL = "http://metadata.google.internal/computeMetadata/v1/instance/zone?alt=text"
	// extensionCache is bind-mounted into the installer container as /build.
	extensionCache = "/var/lib/cos-extensions"
	// gcrPrefix is the public registry prefix of the default installer image.
	gcrPrefix = "gcr.io/cos-cloud"
)
| |
var (
	// run and parse are indirections over osutils.
	// NOTE(review): presumably package variables so tests can stub them — confirm.
	run   = osutils.Run
	parse = osutils.Parse
	// artifactRegions are the multi-regional Artifact Registry prefixes that
	// may be prepended to the installer image name (see GetCosInstaller).
	artifactRegions = []string{"us", "eu", "asia"}
	// apiDomainsPath, when it exists, marks a TPC environment.
	apiDomainsPath = "/etc/cloud-api-domains"
	// tpcGPUInstallerRepo is the in-universe repo template:
	// docker.<registry-domain>/<project-prefix>/cos-cloud/cos-gpu-installer.
	tpcGPUInstallerRepo = "docker.%s/%s/cos-cloud/cos-gpu-installer"
	// DefaultInstaller is the pinned default GPU driver installer image.
	// Note: Please update the app-admin/extensions-manager ebuild file.
	DefaultInstaller = "gcr.io/cos-cloud/cos-gpu-installer:v2.4.4"
)
| |
// Installers represent the names of the possible installers to be used during the program execution.
type Installers struct {
	// DefaultInstaller is the default gpu driver installer image
	// (cos-gpu-installer), possibly region- or universe-prefixed.
	DefaultInstaller string
	// SelectedInstaller is the installer set in the environmental variable
	// COS_GPU_INSTALLER; when non-empty it takes precedence over DefaultInstaller.
	SelectedInstaller string
}
| |
// writer is the destination for user-facing output (logging goes to stderr).
// NOTE(review): a package variable, presumably so tests can redirect output — confirm.
var writer io.Writer = os.Stdout
| |
| // List returns all available COS extensions and their versions. |
| func List(installers Installers, printInstaller bool, installerArgs []string) error { |
| osReleaseFile, err := parse(osReleasePath) |
| if err != nil { |
| return fmt.Errorf("couldn't parse os release file: %v", err) |
| } |
| if printInstaller { |
| // Using Fprint as this displays output to the stdout |
| // Logging is set to stderr |
| fmt.Fprintln(writer, installers.DefaultInstaller) |
| } else { |
| fmt.Fprintf(writer, "Available extensions for COS version %s-%s:\n\n", osReleaseFile[versionID], osReleaseFile[buildID]) |
| fmt.Fprintln(writer, "[gpu]") |
| fmt.Fprintf(writer, "gpu installer: %s\n", installers.DefaultInstaller) |
| |
| err := runInstaller("list", true, extensionCache, installers, installerArgs...) |
| if err != nil { |
| return fmt.Errorf("couldn't retrieve available GPU driver versions: %v", err) |
| } |
| } |
| return nil |
| } |
| |
| // Install installs the GPU drivers according to specifications |
| // from the installerArgs. |
| func Install(installers Installers, installerArgs []string) error { |
| // Check for --no-verify flag |
| skipKernelModuleLoadingArg := slices.Contains(installerArgs, "-no-verify") || slices.Contains(installerArgs, "--no-verify") |
| |
| args := []string{"-host-dir=/var/lib/nvidia"} |
| if !skipKernelModuleLoadingArg { |
| gpuType, _ := deviceInfo.GetGPUTypeInfo("/sys/bus/pci/devices") |
| if gpuType == deviceInfo.GB200 || gpuType == deviceInfo.GB300 { |
| glog.Infof("%s GPU detected. Enabling IMEX channel creation via NVreg_CreateImexChannel0=1", gpuType) |
| installerArgs = append(installerArgs, "-module-arg", "nvidia.NVreg_CreateImexChannel0=1") |
| } |
| } |
| args = append(args, installerArgs...) |
| err := runInstaller("install", false, extensionCache, installers, args...) |
| if err != nil { |
| return fmt.Errorf("could not install GPU drivers: %v", err) |
| } |
| glog.Info("GPU drivers successfully installed") |
| if skipKernelModuleLoadingArg { |
| fmt.Fprintln(writer, "Skipping post_install due to no-verify flag.") |
| } else { |
| postInstall() |
| } |
| return nil |
| } |
| |
| func postInstall() { |
| glog.Info("Making the GPU driver installation path executable by re-mounting it.") |
| cmd := `sudo -n -- sh -c "mount --bind /var/lib/nvidia /var/lib/nvidia; mount -o remount,exec /var/lib/nvidia"` |
| errMessage := `Before deploying GPU workloads - please make the driver installation path executable by re-mounting it using: |
| sudo mount --bind /var/lib/nvidia /var/lib/nvidia |
| sudo mount -o remount,exec /var/lib/nvidia` |
| _, err := run("/bin/sh", []string{"-c", cmd}, false) |
| if err != nil { |
| glog.Errorf("Automatic remount failed: %v\n%v\n", errMessage, err) |
| } |
| |
| glog.Info("Enabling NVIDIA persistence mode.") |
| |
| // Enable persistence mode - Whenever the NVIDIA device resources are no longer |
| // in use, the NVIDIA kernel driver will tear down the device state. |
| // When persistence mode is enabled, the daemon holds the NVIDIA character |
| // device files open, preventing the NVIDIA kernel driver from tearing down |
| // device state when no other process is using the device This utility does not |
| // actually use any device resources itself - it will simply sleep while |
| // maintaining a reference to the NVIDIA device state. |
| cmd = `sudo /var/lib/nvidia/bin/nvidia-persistenced` |
| _, err = run("/bin/sh", []string{"-c", cmd}, false) |
| if err != nil { |
| glog.Errorf("Failed to run nvidia-persistenced daemon.\n%v\n", err) |
| } |
| } |
| |
// Client represents an http.Client interface. It abstracts the Do method so
// callers can substitute alternative implementations (e.g. for testing).
type Client interface {
	Do(r *http.Request) (*http.Response, error)
}
| |
| func retrieveInstanceZone(c Client, url string) (string, error) { |
| req, err := http.NewRequest("GET", url, nil) |
| if err != nil { |
| return "", fmt.Errorf("failed to create request to retrieve instance zone: %v", err) |
| } |
| req.Header.Set("Metadata-Flavor", "Google") |
| resp, err := c.Do(req) |
| if err != nil { |
| return "", fmt.Errorf("failed to retrieve instance zone: %v", err) |
| } |
| defer resp.Body.Close() |
| body, err := io.ReadAll(resp.Body) |
| if err != nil { |
| return "", fmt.Errorf("failed to read response body: %v", err) |
| } |
| |
| // Syntax of response: projects/<project-id>/zones/<region> |
| // Example: projects/475556798229/zones/us-central1-a |
| zone := strings.Split(string(body), "/") |
| zone = strings.Split(zone[len(zone)-1], "-") |
| return zone[0], nil |
| } |
| |
| func checkArch() error { |
| arch, err := run("uname", []string{"-m"}, false) |
| if err != nil { |
| return fmt.Errorf("failed to retrieve machine architecture: %v", err) |
| } |
| if !strings.Contains(arch, "x86_64") && !strings.Contains(arch, "aarch64") { |
| return fmt.Errorf("GPU installation is only supported on X86 and ARM64 for now.\n Current architecture detected: %s", arch) |
| } |
| return nil |
| } |
| |
| // GetCosInstaller retrieves the installers to be used in execution of the program's commands. |
| func GetCosInstaller(c Client) (Installers, error) { |
| var installers Installers |
| defaultInstaller := DefaultInstaller |
| |
| // Modify DefaultInstaller based on APIDomainsPath. If |
| // APIDomainsPath does not exist, just use DefaultInstaller. This |
| // adjustment is required when running in a TPC environment, |
| // where the installer needs to be fetched from an |
| // internal repository. |
| if _, err := os.Stat(apiDomainsPath); err == nil { |
| domains, err := parse(apiDomainsPath) |
| if err != nil { |
| return installers, fmt.Errorf("failed to parse %q, err: %v", apiDomainsPath, err) |
| } |
| if domains[apiDomain] != "googleapis.com" { |
| // We are running in TPC, set default installer to in-universe |
| // repo. |
| universeRepo := fmt.Sprintf(tpcGPUInstallerRepo, domains[artifactRegistryDomain], strings.TrimSuffix(domains[projectPrefix], ":")) |
| defaultInstaller = strings.Replace(defaultInstaller, gcrPrefix, universeRepo, -1) |
| glog.Infof("Using TPC Default installer: %q", defaultInstaller) |
| } |
| } else if !errors.Is(err, os.ErrNotExist) { |
| return installers, fmt.Errorf("failed to stat %q, err: %v", apiDomainsPath, err) |
| } |
| |
| // Checks if it is in the environmental variables and returns the value. |
| if installer := os.Getenv("COS_GPU_INSTALLER"); installer != "" { |
| installers = Installers{ |
| DefaultInstaller: defaultInstaller, |
| SelectedInstaller: installer, |
| } |
| return installers, nil |
| } |
| |
| // Change gpu installer container image name if location of the vm is available |
| // and it's from cos-cloud project. |
| zone, err := retrieveInstanceZone(c, instanceURL) |
| if err != nil { |
| // It's OK if metadata server returns error. |
| glog.Errorf("couldn't retrieve instance zone: %v", err) |
| return installers, nil |
| } |
| if zone != "" && slices.Contains(artifactRegions, zone) { |
| installers = Installers{ |
| DefaultInstaller: fmt.Sprintf("%s.%s", zone, defaultInstaller), |
| } |
| return installers, nil |
| } |
| installers = Installers{ |
| DefaultInstaller: defaultInstaller, |
| } |
| glog.Infof("Using installers: %v", installers) |
| return installers, nil |
| } |
| |
| // runInstaller runs the gpu installer with the cmd and installerArgs. |
| func runInstaller(cmd string, hideStderr bool, cacheDir string, installers Installers, installerArgs ...string) error { |
| err := checkArch() |
| if err != nil { |
| return err |
| } |
| var cosInstaller string |
| |
| // Checks if selectedInstaller is set and uses that, else it defaults to the |
| // default installer. |
| if installers.SelectedInstaller != "" { |
| cosInstaller = installers.SelectedInstaller |
| } else { |
| cosInstaller = installers.DefaultInstaller |
| } |
| |
| args := []string{ |
| "run", |
| "--rm", |
| "--name=cos-gpu-installer", |
| "--privileged", |
| "--net=host", |
| "--pid=host", |
| "--volume", "/dev:/dev", |
| "--volume", "/:/root", |
| "--log-driver", "journald", |
| } |
| |
| if slices.Contains(installerArgs, "--clean-build-tools") { |
| glog.Infof("Cleaning cache present at: %s", cacheDir) |
| |
| if err := os.RemoveAll(cacheDir); err != nil { |
| return fmt.Errorf("failed to clean cache present at %s: %v", cacheDir, err) |
| } |
| } else { |
| _, err := os.Stat(cacheDir) |
| |
| // Use extension cache if exists by default or if build flag passed in |
| if slices.Contains(installerArgs, "--prepare-build-tools") || !os.IsNotExist(err) { |
| buildDir := fmt.Sprintf("%s/:/build/", cacheDir) |
| args = append(args, []string{"--volume", buildDir}...) |
| } |
| args = append(args, cosInstaller) |
| args = append(args, cmd) |
| args = append(args, installerArgs...) |
| res, err := run("/usr/bin/docker", args, hideStderr) |
| if !hideStderr && err != nil { |
| return fmt.Errorf("failed to complete installation using installer '%s': %v", cosInstaller, err) |
| } |
| fmt.Fprint(writer, res) |
| } |
| return nil |
| } |