blob: 7b45f19d8f747a3ab05ad0fc9295a274b4935336 [file] [log] [blame]
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package provisioner
import (
"context"
"errors"
"fmt"
"log"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"text/template"
"time"
"cos.googlesource.com/cos/tools.git/src/pkg/tools/partutil"
"cos.googlesource.com/cos/tools.git/src/pkg/utils"
"golang.org/x/mod/semver"
)
// InstallGPUStep is a provisioner step that installs NVIDIA GPU drivers by
// invoking `cos-extensions install gpu` (see runInstaller). The same type is
// also used as the data object when rendering the GPU setup script template
// (see installScript).
type InstallGPUStep struct {
// NvidiaDriverVersion is the driver version passed to the installer as
// --version and rendered into the GPU setup script. Required.
NvidiaDriverVersion string
// GCSBucket is passed to the installer as --gcs-download-bucket and, for
// installer versions >= v2.3.0, also as --gcs-download-bucket-nvidia.
// Required.
GCSBucket string
// GCSPrefix is passed to the installer as --gcs-download-prefix and, for
// installer versions >= v2.3.0, also as --gcs-download-prefix-nvidia.
// Required.
GCSPrefix string
// VerifyInstall controls installer verification. When false, the installer
// is invoked with --no-verify and a placeholder --target-gpu device.
VerifyInstall bool
}
// validate checks that every required field of the step is populated.
//
// NvidiaDriverVersion, GCSBucket and GCSPrefix are checked in that order and
// the error names the first one found empty. VerifyInstall is optional and is
// not checked. It returns nil when all required fields are set.
func (s *InstallGPUStep) validate() error {
	switch {
	case s.NvidiaDriverVersion == "":
		return errors.New("invalid args: NvidiaDriverVersion is required in InstallGPU")
	case s.GCSBucket == "":
		return errors.New("invalid args: GCSBucket is required in InstallGPU")
	case s.GCSPrefix == "":
		return errors.New("invalid args: GCSPrefix is required in InstallGPU")
	}
	return nil
}
// installScript renders gpuSetupScriptTemplate into the file at path.
//
// The parent directory of path is created if needed (mode 0755), and the file
// itself is created or truncated (mode 0744 on creation). The template is
// executed against a fresh InstallGPUStep whose only populated field is
// NvidiaDriverVersion, set to the shell-quoted form of driverVersion; the
// receiver's own fields are not used.
//
// The result is named so the deferred utils.CheckClose call can report through
// it (presumably it records a close failure into err — confirm against utils).
func (s *InstallGPUStep) installScript(path, driverVersion string) (err error) {
	parent := filepath.Dir(path)
	if mkErr := os.MkdirAll(parent, 0755); mkErr != nil {
		return mkErr
	}

	script, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0744)
	if err != nil {
		return err
	}
	defer utils.CheckClose(script, fmt.Sprintf("error closing %q", path), &err)

	tmpl, err := template.New("gpu-script").Parse(gpuSetupScriptTemplate)
	if err != nil {
		return err
	}

	// Only the driver version is exposed to the template, already quoted for
	// safe use in a shell script.
	data := &InstallGPUStep{
		NvidiaDriverVersion: utils.QuoteForShell(driverVersion),
	}
	if execErr := tmpl.Execute(script, data); execErr != nil {
		return fmt.Errorf("error installing %q: %v", path, execErr)
	}
	return nil
}
// setKernelArgs makes sure the kernel command line is suitable for a custom
// GPU driver: module.sig_enforce=1 must be absent and ima_appraise=off must be
// present.
//
// It inspects /proc/cmdline. If the running kernel already satisfies both
// conditions it returns nil without touching anything. Otherwise it mounts the
// EFI partition, rewrites the GRUB config (replacing module.sig_enforce=1 with
// module.sig_enforce=0 and/or adding ima_appraise=off), unmounts the
// partition, and starts a reboot. After a successful reboot request the
// function never returns; it sleeps until the machine goes down.
//
// If reading or editing the GRUB config fails, the EFI partition is unmounted
// on a best-effort basis (an unmount failure is only logged) and the original
// error is returned.
func (s *InstallGPUStep) setKernelArgs() error {
	log.Println("Setting kernel parameters for custom GPU driver")

	raw, err := os.ReadFile("/proc/cmdline")
	if err != nil {
		return err
	}
	cmdline := string(raw)
	log.Printf("Found cmdline: %q", cmdline)

	sigEnforced := strings.Contains(cmdline, "module.sig_enforce=1")
	imaOff := strings.Contains(cmdline, "ima_appraise=off")
	if !sigEnforced && imaOff {
		log.Println("No cmdline changes needed")
		return nil
	}

	grubCfgPath, err := partutil.MountEFIPartition()
	if err != nil {
		return err
	}

	// abort unmounts the EFI partition (logging, not returning, any unmount
	// failure) and hands back the error that triggered the abort.
	abort := func(cause error) error {
		if umountErr := partutil.UnmountEFIPartition(); umountErr != nil {
			log.Printf("ERROR: failed to umount EFI partition: %v", umountErr)
		}
		return cause
	}

	grubCfg, err := os.ReadFile(grubCfgPath)
	if err != nil {
		return abort(err)
	}

	if sigEnforced {
		log.Println("Found module.sig_enforce=1 on cmdline, disabling")
		patched := strings.ReplaceAll(string(grubCfg), "module.sig_enforce=1", "module.sig_enforce=0")
		if writeErr := os.WriteFile(grubCfgPath, []byte(patched), 0644); writeErr != nil {
			return abort(writeErr)
		}
	}

	if !imaOff {
		log.Println("Did not find ima_appraise=off on cmdline, setting")
		if addErr := partutil.AddCmdToGRUB(grubCfgPath, "ima_appraise=off"); addErr != nil {
			return abort(addErr)
		}
	}

	if umountErr := partutil.UnmountEFIPartition(); umountErr != nil {
		return umountErr
	}

	log.Println("Rebooting for kernel command line changes")
	if rebootErr := exec.Command("reboot").Start(); rebootErr != nil {
		return rebootErr
	}

	// The reboot has been requested; block here so no later step runs while
	// the machine is going down. This loop deliberately never exits.
	for {
		time.Sleep(10 * time.Minute)
	}
}
// getInstallerVersion returns the tag of the cos-gpu-installer container image
// reported by `cos-extensions list gpu --gpu-installer`.
//
// The command output is expected to be a single image reference of the form
// "<image>:<tag>" containing exactly one colon; the part after the colon is
// returned. Any other shape (no colon, or more than one) is rejected with an
// error that quotes the offending text.
//
// If the command cannot be run or exits non-zero, the returned error wraps the
// underlying exec error (so errors.Is / errors.As work on it), and any stderr
// captured from the command is logged as text.
func (s *InstallGPUStep) getInstallerVersion() (string, error) {
	out, err := exec.Command("cos-extensions", "list", "gpu", "--gpu-installer").Output()
	if err != nil {
		// Output() stores the command's stderr on *exec.ExitError. It is a
		// []byte, so it must be formatted as text rather than with the default
		// verb, which would print a list of numbers.
		var exitErr *exec.ExitError
		if errors.As(err, &exitErr) && len(exitErr.Stderr) > 0 {
			log.Printf("cos-extensions stderr: %s", exitErr.Stderr)
		}
		return "", fmt.Errorf("could not get cos-gpu-installer version from cos-extensions: %w", err)
	}

	imageURI := strings.TrimSpace(string(out))
	parts := strings.Split(imageURI, ":")
	if len(parts) != 2 {
		return "", fmt.Errorf("unrecognized container image format: %q", imageURI)
	}
	return parts[1], nil
}
// runInstaller builds and runs the `cos-extensions install gpu` command for
// this step.
//
// The driver version, GCS bucket and GCS prefix are always passed. The
// installer version is looked up first; a lookup failure is only logged as a
// warning and the install proceeds without the version-gated flags. When
// VerifyInstall is false, verification is disabled and a placeholder target
// GPU is supplied. The command runs with the current process environment, and
// its error (if any) is returned unchanged.
func (s *InstallGPUStep) runInstaller() error {
	installerVersion, err := s.getInstallerVersion()
	if err != nil {
		log.Printf("WARNING: failed to get cos-gpu-installer version: %v", err)
	}

	args := []string{
		"cos-extensions", "install", "gpu", "--",
		"--version", s.NvidiaDriverVersion,
		"--gcs-download-bucket", s.GCSBucket,
		"--gcs-download-prefix", s.GCSPrefix,
	}

	// Custom nvidia runfile locations are only available starting in v2.3.0.
	// On older COS images, let the installer use whatever it finds.
	supportsNvidiaLocation := installerVersion != "" && semver.Compare(installerVersion, "v2.3.0") >= 0
	if supportsNvidiaLocation {
		args = append(args,
			"--gcs-download-bucket-nvidia", s.GCSBucket,
			"--gcs-download-prefix-nvidia", s.GCSPrefix,
		)
	}

	if !s.VerifyInstall {
		// The device is an arbitrary choice whose only purpose is to force the
		// kernelOpen code path in cos-gpu-installer.
		placeholderGPU := "NVIDIA_L4"
		if runtime.GOARCH == "arm64" {
			placeholderGPU = "NVIDIA_GB300"
		}
		args = append(args, "--no-verify", "--target-gpu", placeholderGPU)
	}

	if runErr := utils.RunCommand(args, "", os.Environ()); runErr != nil {
		return runErr
	}
	return nil
}
// run executes the GPU install step end to end.
//
// In order, it validates the step's fields, adjusts the kernel command line
// (setKernelArgs), writes the GPU setup script to
// /var/lib/nvidia/setup_gpu.sh, and runs the driver installer. The first
// failure aborts the step and is returned unchanged. ctx, runState and deps
// are accepted but not used by this step.
func (s *InstallGPUStep) run(ctx context.Context, runState *state, deps *stepDeps) error {
	const setupScriptPath = "/var/lib/nvidia/setup_gpu.sh"

	err := s.validate()
	if err != nil {
		return err
	}

	log.Println("Installing GPU drivers...")
	if err = s.setKernelArgs(); err != nil {
		return err
	}
	if err = s.installScript(setupScriptPath, s.NvidiaDriverVersion); err != nil {
		return err
	}
	if err = s.runInstaller(); err != nil {
		log.Println("Installing GPU drivers failed")
		return err
	}

	log.Println("Done installing GPU drivers")
	return nil
}