Trigger daemons from cos-gpu-installer (instead of cos-extensions) BUG=b/502284908 TEST=Run custom cos-extensions and custom cos-gpu-installer on VMs, installing grid and non-grid GPU drivers. On grid: test license and CUDA workload. Run `sudo systemctl list-units --type=path | grep nvidia` and verify the right services are running. RELEASE_NOTE=None Change-Id: I6778d527d76b90654b56cc778f76a5401ea165eb Reviewed-on: https://cos-review.googlesource.com/c/cos/tools/+/149184 Reviewed-by: Robert Kolchmeyer <rkolchmeyer@google.com> Cloud-Build: 228075978874@cloudbuild.gserviceaccount.com <228075978874@cloudbuild.gserviceaccount.com> Tested-by: Miri Amarilio <mirilio@google.com>
diff --git a/src/cmd/cos_gpu_installer/internal/commands/install.go b/src/cmd/cos_gpu_installer/internal/commands/install.go index 8d9bd90..68191d4 100644 --- a/src/cmd/cos_gpu_installer/internal/commands/install.go +++ b/src/cmd/cos_gpu_installer/internal/commands/install.go
@@ -317,6 +317,14 @@ c.hostInstallDir = os.Getenv("NVIDIA_INSTALL_DIR_HOST") } hostInstallDir := filepath.Join(hostRootPath, c.hostInstallDir) + if err := os.MkdirAll(hostInstallDir, 0755); err != nil { + c.logError(errors.Wrap(err, "failed to create host install dir")) + return subcommands.ExitFailure + } + if err := installer.EnsureExecutable(c.hostInstallDir, hostRootPath); err != nil { + c.logError(errors.Wrap(err, "failed to ensure installation directory is executable")) + return subcommands.ExitFailure + } var downloader *cos.GCSDownloader if c.localArtifactsDir != "" { @@ -416,6 +424,10 @@ c.logError(errors.Wrap(err, "failed to update host ld cache")) return subcommands.ExitFailure } + if err := installer.SetupDaemons(c.hostInstallDir, hostRootPath); err != nil { + c.logError(errors.Wrap(err, "failed to setup daemons")) + return subcommands.ExitFailure + } return subcommands.ExitSuccess } } @@ -440,6 +452,10 @@ c.logError(err) return subcommands.ExitFailure } + if err := installer.SetupDaemons(c.hostInstallDir, hostRootPath); err != nil { + c.logError(errors.Wrap(err, "failed to setup daemons")) + return subcommands.ExitFailure + } return subcommands.ExitSuccess } @@ -457,6 +473,11 @@ return subcommands.ExitFailure } + if err := installer.SetupDaemons(c.hostInstallDir, hostRootPath); err != nil { + c.logError(errors.Wrap(err, "failed to setup daemons")) + return subcommands.ExitFailure + } + return subcommands.ExitSuccess }
diff --git a/src/cmd/cos_gpu_installer/internal/installer/daemon.go b/src/cmd/cos_gpu_installer/internal/installer/daemon.go new file mode 100644 index 0000000..fabf8ea --- /dev/null +++ b/src/cmd/cos_gpu_installer/internal/installer/daemon.go
@@ -0,0 +1,169 @@
+package installer
+
+import (
+	"bytes"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"text/template"
+
+	log "github.com/golang/glog"
+	"golang.org/x/sys/unix"
+)
+
+// Systemd unit templates written by SetupDaemons through writeTemplate. Each
+// one is a text/template whose only placeholder, {{.InstallDir}}, is filled
+// from sysdTemplateData.InstallDir.
+const (
+	// griddPathTmpl is the nvidia-gridd.path unit. It watches
+	// {{.InstallDir}}/bin/nvidia-gridd with PathExists= and names
+	// nvidia-gridd.service as the unit to activate.
+	// NOTE(review): the Description mentions "changes", but only PathExists=
+	// is configured (no PathChanged=/PathModified=) — confirm the intent.
+	griddPathTmpl = `[Unit]
+Description=Trigger the nvidia-gridd binary if it exists/changes
+
+[Path]
+PathExists={{.InstallDir}}/bin/nvidia-gridd
+Unit=nvidia-gridd.service
+
+[Install]
+WantedBy=multi-user.target
+`
+
+	// griddServiceTmpl is the nvidia-gridd.service unit. It runs
+	// {{.InstallDir}}/bin/nvidia-gridd as a forking service with
+	// LD_LIBRARY_PATH pointing at {{.InstallDir}}/gridd-libs, is conditioned
+	// on the binary existing, and restarts on failure after 6s.
+	griddServiceTmpl = `[Unit]
+Description=Launches nvidia-gridd
+ConditionPathExists={{.InstallDir}}/bin/nvidia-gridd
+
+[Service]
+Type=forking
+Environment="LD_LIBRARY_PATH={{.InstallDir}}/gridd-libs"
+ExecStart={{.InstallDir}}/bin/nvidia-gridd
+Restart=on-failure
+RestartSec=6s
+
+[Install]
+WantedBy=multi-user.target
+`
+
+	// persistencedPathTmpl is the nvidia-persistenced.path unit. It watches
+	// {{.InstallDir}}/bin/nvidia-persistenced with PathExists= and names
+	// nvidia-persistenced.service as the unit to activate.
+	// NOTE(review): same "exists/changes" wording as the gridd path unit,
+	// with only PathExists= configured — confirm the intent.
+	persistencedPathTmpl = `[Unit]
+Description=Trigger the nvidia-persistenced binary if it exists/changes
+
+[Path]
+PathExists={{.InstallDir}}/bin/nvidia-persistenced
+Unit=nvidia-persistenced.service
+
+[Install]
+WantedBy=multi-user.target
+`
+
+	// persistencedServiceTmpl is the nvidia-persistenced.service unit. It
+	// runs {{.InstallDir}}/bin/nvidia-persistenced as a forking service, is
+	// conditioned on the binary existing, and restarts on failure after 6s.
+	persistencedServiceTmpl = `[Unit]
+Description=Launches nvidia-persistenced
+ConditionPathExists={{.InstallDir}}/bin/nvidia-persistenced
+
+[Service]
+Type=forking
+ExecStart={{.InstallDir}}/bin/nvidia-persistenced
+Restart=on-failure
+RestartSec=6s
+
+[Install]
+WantedBy=multi-user.target
+`
+)
+
+// sysdTemplateData holds the values substituted into the unit templates.
+type sysdTemplateData struct {
+	// InstallDir replaces {{.InstallDir}} in the templates; SetupDaemons
+	// sets it to its hostInstallDir argument.
+	InstallDir string
+}
+
+// writeTemplate parses tmplStr as a text/template (named after the base name
+// of path), executes it with data, and writes the result to path with mode
+// 0644. The template is rendered fully into memory first, so a parse or
+// execution error is returned before anything is written. Errors from
+// parsing, execution, and the file write are returned unwrapped.
+func writeTemplate(path string, tmplStr string, data sysdTemplateData) error {
+	tmpl, err := template.New(filepath.Base(path)).Parse(tmplStr)
+	if err != nil {
+		return err
+	}
+	var buf bytes.Buffer
+	if err := tmpl.Execute(&buf, data); err != nil {
+		return err
+	}
+	return os.WriteFile(path, buf.Bytes(), 0644)
+}
+
+// EnsureExecutable ensures that the installation directory on the host
+// is executable by ensuring any noexec mounts are remounted with exec.
+func EnsureExecutable(hostInstallDir, hostRootPath string) error {
+	// hostInstallDir is passed as-is to the mount commands, which run in the
+	// host's mount namespace; this process reaches the same directory by
+	// prefixing it with hostRootPath.
+	containerViewOfHostDir := filepath.Join(hostRootPath, hostInstallDir)
+	var stat unix.Statfs_t
+	if err := unix.Statfs(containerViewOfHostDir, &stat); err != nil {
+		return fmt.Errorf("failed to statfs %s: %w", containerViewOfHostDir, err)
+	}
+
+	// Nothing to do when the filesystem is not flagged noexec.
+	if stat.Flags&unix.ST_NOEXEC == 0 {
+		log.V(2).Infof("Target installation directory %s is already executable.", hostInstallDir)
+		return nil
+	}
+
+	log.Infof("Target installation directory %s is mounted with noexec. Remounting as executable on the host.", hostInstallDir)
+
+	// Step 1: bind-mount the directory onto itself so it becomes its own
+	// mount point. nsenter -t 1 -m runs mount in PID 1's mount namespace.
+	cmd := exec.Command("nsenter", "-t", "1", "-m", "mount", "--bind", hostInstallDir, hostInstallDir)
+	if out, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("failed to bind mount %s on host: %w, out: %s", hostInstallDir, err, string(out))
+	}
+
+	// Step 2: remount that mount point with exec.
+	// NOTE(review): if this step fails the bind mount from step 1 is left in
+	// place, and a retry would bind-mount again on top of it.
+	cmd = exec.Command("nsenter", "-t", "1", "-m", "mount", "-o", "remount,exec", hostInstallDir)
+	if out, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("failed to remount %s as executable on host: %w, out: %s", hostInstallDir, err, string(out))
+	}
+
+	return nil
+}
+
+// SetupDaemons deploys systemd units for nvidia-persistenced and conditionally nvidia-gridd.
+func SetupDaemons(hostInstallDir, hostRootPath string) error {
+	// Unit files are written under hostRootPath so they land in the host's
+	// etc/systemd/system directory.
+	sysdDir := filepath.Join(hostRootPath, "etc/systemd/system")
+	if err := os.MkdirAll(sysdDir, 0755); err != nil {
+		return fmt.Errorf("failed to create systemd directory: %w", err)
+	}
+
+	// The gridd units are deployed only when the nvidia-gridd binary is
+	// present in the install dir. A stat error other than "not exist" is
+	// still treated as "not grid", but is logged instead of being swallowed.
+	griddBin := filepath.Join(hostRootPath, hostInstallDir, "bin", "nvidia-gridd")
+	isGrid := false
+	switch _, err := os.Stat(griddBin); {
+	case err == nil:
+		isGrid = true
+	case !os.IsNotExist(err):
+		log.Warningf("failed to stat %s, skipping nvidia-gridd units: %v", griddBin, err)
+	}
+
+	type unitDef struct {
+		name    string
+		content string
+	}
+
+	// units are the files to write; startUnits are the path units to start.
+	// The .service units are not started directly: each path unit names its
+	// service in Unit=.
+	units := []unitDef{
+		{name: "nvidia-persistenced.path", content: persistencedPathTmpl},
+		{name: "nvidia-persistenced.service", content: persistencedServiceTmpl},
+	}
+	startUnits := []string{"nvidia-persistenced.path"}
+	if isGrid {
+		units = append(units,
+			unitDef{name: "nvidia-gridd.path", content: griddPathTmpl},
+			unitDef{name: "nvidia-gridd.service", content: griddServiceTmpl},
+		)
+		startUnits = append(startUnits, "nvidia-gridd.path")
+	}
+
+	// The templates get the host-side install dir (without hostRootPath).
+	data := sysdTemplateData{InstallDir: hostInstallDir}
+	for _, u := range units {
+		if err := writeTemplate(filepath.Join(sysdDir, u.name), u.content, data); err != nil {
+			return fmt.Errorf("failed to write %s: %w", u.name, err)
+		}
+	}
+
+	// Reloading systemd and starting the units is best-effort: failures are
+	// logged as warnings and do not fail the call.
+	reload := exec.Command("chroot", hostRootPath, "systemctl", "daemon-reload")
+	if out, err := reload.CombinedOutput(); err != nil {
+		log.Warningf("failed to daemon-reload: %v, out: %s", err, string(out))
+		return nil
+	}
+
+	allStarted := true
+	for _, unit := range startUnits {
+		start := exec.Command("chroot", hostRootPath, "systemctl", "start", unit)
+		if out, err := start.CombinedOutput(); err != nil {
+			log.Warningf("failed to start %s: %v, out: %s", unit, err, string(out))
+			allStarted = false
+		}
+	}
+	// Report success only when every unit actually started.
+	if allStarted {
+		log.Infof("Successfully setup NVIDIA systemd daemons")
+	}
+
+	return nil
+}