| // Copyright 2021 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package provisioner |
| |
| import ( |
| "context" |
| "errors" |
| "fmt" |
| "log" |
| "os" |
| "os/exec" |
| "path/filepath" |
| "runtime" |
| "strings" |
| "text/template" |
| "time" |
| |
| "cos.googlesource.com/cos/tools.git/src/pkg/tools/partutil" |
| "cos.googlesource.com/cos/tools.git/src/pkg/utils" |
| |
| "golang.org/x/mod/semver" |
| ) |
| |
// InstallGPUStep is a provisioner step that installs an NVIDIA GPU driver by
// invoking `cos-extensions install gpu`.
type InstallGPUStep struct {
	// NvidiaDriverVersion is passed to the installer as --version. Required.
	NvidiaDriverVersion string
	// GCSBucket and GCSPrefix are passed to the installer as the
	// --gcs-download-bucket / --gcs-download-prefix values (and as the
	// corresponding -nvidia variants when the installer is new enough).
	// Both are required.
	GCSBucket, GCSPrefix string
	// VerifyInstall, when false, makes the installer run with --no-verify and
	// a placeholder --target-gpu device.
	VerifyInstall bool
}
| |
| func (s *InstallGPUStep) validate() error { |
| if s.NvidiaDriverVersion == "" { |
| return errors.New("invalid args: NvidiaDriverVersion is required in InstallGPU") |
| } |
| if s.GCSBucket == "" { |
| return errors.New("invalid args: GCSBucket is required in InstallGPU") |
| } |
| if s.GCSPrefix == "" { |
| return errors.New("invalid args: GCSPrefix is required in InstallGPU") |
| } |
| return nil |
| } |
| |
| func (s *InstallGPUStep) installScript(path, driverVersion string) (err error) { |
| if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { |
| return err |
| } |
| f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0744) |
| if err != nil { |
| return err |
| } |
| defer utils.CheckClose(f, fmt.Sprintf("error closing %q", path), &err) |
| t, err := template.New("gpu-script").Parse(gpuSetupScriptTemplate) |
| if err != nil { |
| return err |
| } |
| if err := t.Execute(f, &InstallGPUStep{ |
| NvidiaDriverVersion: utils.QuoteForShell(driverVersion), |
| }); err != nil { |
| return fmt.Errorf("error installing %q: %v", path, err) |
| } |
| return nil |
| } |
| |
| func (s *InstallGPUStep) setKernelArgs() error { |
| log.Println("Setting kernel parameters for custom GPU driver") |
| cmdLine, err := os.ReadFile("/proc/cmdline") |
| if err != nil { |
| return err |
| } |
| log.Printf("Found cmdline: %q", string(cmdLine)) |
| if !strings.Contains(string(cmdLine), "module.sig_enforce=1") && strings.Contains(string(cmdLine), "ima_appraise=off") { |
| log.Println("No cmdline changes needed") |
| return nil |
| } |
| grubCfgPath, err := partutil.MountEFIPartition() |
| if err != nil { |
| return err |
| } |
| grubCfg, err := os.ReadFile(grubCfgPath) |
| if err != nil { |
| umountErr := partutil.UnmountEFIPartition() |
| if umountErr != nil { |
| log.Printf("ERROR: failed to umount EFI partition: %v", umountErr) |
| } |
| return err |
| } |
| if strings.Contains(string(cmdLine), "module.sig_enforce=1") { |
| log.Println("Found module.sig_enforce=1 on cmdline, disabling") |
| newGrubCfg := strings.Replace(string(grubCfg), "module.sig_enforce=1", "module.sig_enforce=0", -1) |
| if err := os.WriteFile(grubCfgPath, []byte(newGrubCfg), 0644); err != nil { |
| umountErr := partutil.UnmountEFIPartition() |
| if umountErr != nil { |
| log.Printf("ERROR: failed to umount EFI partition: %v", umountErr) |
| } |
| return err |
| } |
| } |
| if !strings.Contains(string(cmdLine), "ima_appraise=off") { |
| log.Println("Did not find ima_appraise=off on cmdline, setting") |
| if err := partutil.AddCmdToGRUB(grubCfgPath, "ima_appraise=off"); err != nil { |
| umountErr := partutil.UnmountEFIPartition() |
| if umountErr != nil { |
| log.Printf("ERROR: failed to umount EFI partition: %v", umountErr) |
| } |
| return err |
| } |
| } |
| if err := partutil.UnmountEFIPartition(); err != nil { |
| return err |
| } |
| log.Println("Rebooting for kernel command line changes") |
| if err := exec.Command("reboot").Start(); err != nil { |
| return err |
| } |
| for { |
| time.Sleep(10 * time.Minute) |
| } |
| // Deliberately unreachable |
| return nil |
| } |
| |
| func (s *InstallGPUStep) getInstallerVersion() (string, error) { |
| imageURI, err := exec.Command("cos-extensions", "list", "gpu", "--gpu-installer").Output() |
| if err != nil { |
| if exitErr, ok := err.(*exec.ExitError); ok { |
| fmt.Println(exitErr.Stderr) |
| } |
| return "", fmt.Errorf("could not get cos-gpu-installer version from cos-extensions: %v", err) |
| } |
| parts := strings.Split(strings.TrimSpace(string(imageURI)), ":") |
| if len(parts) != 2 { |
| return "", fmt.Errorf("unrecognized container image format: %v", imageURI) |
| } |
| return parts[1], nil |
| } |
| |
| func (s *InstallGPUStep) runInstaller() error { |
| version, err := s.getInstallerVersion() |
| if err != nil { |
| log.Printf("WARNING: failed to get cos-gpu-installer version: %v", err) |
| } |
| cmd := []string{ |
| "cos-extensions", "install", "gpu", "--", |
| "--version", s.NvidiaDriverVersion, |
| "--gcs-download-bucket", s.GCSBucket, "--gcs-download-prefix", s.GCSPrefix, |
| } |
| // Custom nvidia runfile locations only available starting in v2.3.0. On older |
| // COS images, try whatever the installer finds and hope for the best... |
| if version != "" && semver.Compare(version, "v2.3.0") >= 0 { |
| cmd = append(cmd, "--gcs-download-bucket-nvidia", s.GCSBucket, "--gcs-download-prefix-nvidia", s.GCSPrefix) |
| } |
| if !s.VerifyInstall { |
| // These devices are arbitrary choices to force the kernelOpen code path in |
| // cos-gpu-installer. |
| var dummyDevice string |
| if runtime.GOARCH == "arm64" { |
| dummyDevice = "NVIDIA_GB300" |
| } else { |
| dummyDevice = "NVIDIA_L4" |
| } |
| cmd = append(cmd, "--no-verify", "--target-gpu", dummyDevice) |
| } |
| if err := utils.RunCommand(cmd, "", os.Environ()); err != nil { |
| return err |
| } |
| return nil |
| } |
| |
| func (s *InstallGPUStep) run(ctx context.Context, runState *state, deps *stepDeps) error { |
| if err := s.validate(); err != nil { |
| return err |
| } |
| log.Println("Installing GPU drivers...") |
| if err := s.setKernelArgs(); err != nil { |
| return err |
| } |
| if err := s.installScript("/var/lib/nvidia/setup_gpu.sh", s.NvidiaDriverVersion); err != nil { |
| return err |
| } |
| if err := s.runInstaller(); err != nil { |
| log.Println("Installing GPU drivers failed") |
| return err |
| } |
| log.Println("Done installing GPU drivers") |
| return nil |
| } |