blob: 07d75412bb1f7a9841554f6cfe7b5d961450bafb [file] [log] [blame]
// Package commands implements subcommands of cos_gpu_installer.
package commands
import (
"context"
stderrors "errors"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"syscall"
"flag"
"cos.googlesource.com/cos/tools.git/src/cmd/cos_gpu_installer/internal/installer"
"cos.googlesource.com/cos/tools.git/src/cmd/cos_gpu_installer/internal/signing"
"cos.googlesource.com/cos/tools.git/src/pkg/cos"
"cos.googlesource.com/cos/tools.git/src/pkg/modules"
"cos.googlesource.com/cos/tools.git/src/pkg/utils"
log "github.com/golang/glog"
"github.com/google/subcommands"
"github.com/pkg/errors"
)
// grepFound is the exit status that isGpuConfigured treats as "a matching
// line was found" when it runs its `lspci | grep` pipeline.
const grepFound = 0

// Filesystem locations used by the installer.
const (
	// hostRootPath is the prefix under which host paths are addressed; it is
	// handed to cos.NewEnvReader and joined with the host install directory.
	hostRootPath = "/root"
	// kernelSrcDir is not referenced in this part of the file.
	// NOTE(review): presumably the kernel source location used elsewhere in the package - confirm.
	kernelSrcDir = "/build/usr/src/linux"
	// toolchainPkgDir is the directory that installDriver remounts and then
	// passes to the toolchain installation and the driver installer.
	toolchainPkgDir = "/build/cos-tools"
)
// InstallCommand is the subcommand to install GPU drivers.
//
// Each field is bound to a command-line flag by SetFlags.
type InstallCommand struct {
	// driverVersion holds -version: empty selects the default driver,
	// "latest" selects the latest one, anything else is used verbatim.
	driverVersion string
	// hostInstallDir holds -host-dir; when empty, Execute falls back to the
	// NVIDIA_INSTALL_DIR_HOST environment variable.
	hostInstallDir string
	// unsignedDriver holds -allow-unsigned-driver.
	unsignedDriver bool
	// gcsDownloadBucket holds -gcs-download-bucket.
	gcsDownloadBucket string
	// gcsDownloadPrefix holds -gcs-download-prefix.
	gcsDownloadPrefix string
	// nvidiaInstallerURL holds -nvidia-installer-url; mutually exclusive
	// with -version (enforced by validateFlags).
	nvidiaInstallerURL string
	// debug holds -debug; it raises log verbosity and makes logError print
	// errors with the %+v verb.
	debug bool
	// test holds -test; it relaxes the -allow-unsigned-driver requirement
	// that validateFlags places on -nvidia-installer-url.
	test bool
}

// Name implements subcommands.Command.Name.
func (*InstallCommand) Name() string { return "install" }

// Synopsis implements subcommands.Command.Synopsis.
func (*InstallCommand) Synopsis() string { return "Install GPU drivers." }

// Usage implements subcommands.Command.Usage.
//
// The flag shown here must be one that SetFlags actually registers; the
// install directory flag is -host-dir.
func (*InstallCommand) Usage() string { return "install [-host-dir <filepath>]\n" }

// SetFlags implements subcommands.Command.SetFlags.
//
// Help strings deliberately avoid back-quotes: the flag package interprets
// the first back-quoted word in a usage string as the placeholder name of
// the flag's argument and strips the quotes when printing defaults.
func (c *InstallCommand) SetFlags(f *flag.FlagSet) {
	f.StringVar(&c.driverVersion, "version", "",
		"The GPU driver version to install. "+
			"It will install the default GPU driver if the flag is not set explicitly. "+
			"Set the flag to 'latest' to install the latest GPU driver version.")
	f.StringVar(&c.hostInstallDir, "host-dir", "",
		"Host directory that GPU drivers should be installed to. "+
			"It tries to read from the env NVIDIA_INSTALL_DIR_HOST if the flag is not set explicitly.")
	f.BoolVar(&c.unsignedDriver, "allow-unsigned-driver", false,
		"Whether to allow loading unsigned GPU drivers. "+
			"If this flag is set to true, module signing security features must be disabled on the host for driver installation to succeed. "+
			"This flag is only for debugging and testing.")
	f.StringVar(&c.gcsDownloadBucket, "gcs-download-bucket", "",
		"The GCS bucket to download COS artifacts from. "+
			"The default bucket is one of 'cos-tools', 'cos-tools-asia' and 'cos-tools-eu' based on where the VM is running. "+
			"Those are the public COS artifacts buckets.")
	f.StringVar(&c.gcsDownloadPrefix, "gcs-download-prefix", "",
		"The GCS path prefix when downloading COS artifacts. "+
			"If not set then the COS version build number (e.g. 13310.1041.38) will be used.")
	f.StringVar(&c.nvidiaInstallerURL, "nvidia-installer-url", "",
		"A URL to an nvidia-installer to use for driver installation. This flag is mutually exclusive with '-version'. This flag must be used with '-allow-unsigned-driver'. This flag is only for debugging.")
	f.BoolVar(&c.debug, "debug", false,
		"Enable debug mode.")
	f.BoolVar(&c.test, "test", false,
		"Enable test mode.")
}
// validateFlags rejects flag combinations that cannot be honoured. Both rules
// only apply when -nvidia-installer-url is set: it may not be combined with
// -version, and it requires -allow-unsigned-driver unless -test is set.
func (c *InstallCommand) validateFlags() error {
	if c.nvidiaInstallerURL == "" {
		return nil
	}
	if c.driverVersion != "" {
		return stderrors.New("-nvidia-installer-url and -version are both set; these flags are mutually exclusive")
	}
	if !c.unsignedDriver && !c.test {
		return stderrors.New("-nvidia-installer-url is set, and -allow-unsigned-driver is not; -nvidia-installer-url must be used with -allow-unsigned-driver if not in test mode")
	}
	return nil
}
// Execute implements subcommands.Command.Execute.
//
// It validates the flags, refuses to run on dev-channel images or when
// isGpuConfigured reports no GPU, resolves the driver version (unless a
// custom installer URL was supplied), and then either reconfigures a cached
// installation or performs a fresh one through installDriver. Every failure
// is reported via c.logError and returned as subcommands.ExitFailure.
func (c *InstallCommand) Execute(ctx context.Context, _ *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus {
	if err := c.validateFlags(); err != nil {
		c.logError(err)
		return subcommands.ExitFailure
	}
	envReader, err := cos.NewEnvReader(hostRootPath)
	if err != nil {
		c.logError(errors.Wrapf(err, "failed to create envReader with host root path %s", hostRootPath))
		return subcommands.ExitFailure
	}
	if c.debug {
		// Debug mode raises the log verbosity; failing to do so is not fatal.
		if err := flag.Set("v", "2"); err != nil {
			log.Errorf("Unable to set debug logging: %v", err)
		}
	}
	log.V(2).Infof("Running on COS build id %s", envReader.BuildNumber())
	if releaseTrack := envReader.ReleaseTrack(); releaseTrack == "dev-channel" {
		c.logError(fmt.Errorf("GPU installation is not supported on dev images for now; Please use LTS image."))
		return subcommands.ExitFailure
	}
	gpuConfigured, err := c.isGpuConfigured()
	if err != nil {
		c.logError(errors.Wrapf(err, "failed to check if GPU is configured"))
		return subcommands.ExitFailure
	}
	if !gpuConfigured {
		c.logError(fmt.Errorf("Please have GPU device configured"))
		return subcommands.ExitFailure
	}
	downloader := cos.NewGCSDownloader(envReader, c.gcsDownloadBucket, c.gcsDownloadPrefix)
	if c.nvidiaInstallerURL == "" {
		// Remember what the user asked for: c.driverVersion is overwritten
		// below, and the original input selects the error message.
		versionInput := c.driverVersion
		milestone, err := strconv.Atoi(envReader.Milestone())
		if err != nil {
			c.logError(errors.Wrap(err, "failed to parse milestone number"))
			return subcommands.ExitFailure
		}
		c.driverVersion, err = getDriverVersion(downloader, c.driverVersion)
		if err != nil {
			if versionInput == "latest" && milestone < 93 {
				c.logError(errors.Wrap(err, "'--version=latest' is only supported on COS M93 and onwards, please unset this flag"))
			} else {
				c.logError(errors.Wrap(err, "failed to get default driver version"))
			}
			return subcommands.ExitFailure
		}
		log.Infof("Installing GPU driver version %s", c.driverVersion)
	} else {
		log.Infof("Installing GPU driver from %q", c.nvidiaInstallerURL)
	}
	if c.unsignedDriver {
		// Best-effort sanity check that only ever produces log output. When
		// the command line cannot be read there is nothing to inspect, so
		// the signing warning is skipped instead of being derived from an
		// empty string.
		kernelCmdline, err := ioutil.ReadFile("/proc/cmdline")
		if err != nil {
			c.logError(fmt.Errorf("failed to read kernel command line: %v", err))
		} else if !cos.CheckKernelModuleSigning(string(kernelCmdline)) {
			log.Warning("Current kernel command line does not support unsigned kernel modules. Not enforcing kernel module signing may cause installation fail.")
		}
	}
	// Read value from env NVIDIA_INSTALL_DIR_HOST if the flag is not set. This is to be compatible with old interface.
	if c.hostInstallDir == "" {
		c.hostInstallDir = os.Getenv("NVIDIA_INSTALL_DIR_HOST")
	}
	hostInstallDir := filepath.Join(hostRootPath, c.hostInstallDir)
	var cacher *installer.Cacher
	// We only want to cache drivers installed from official sources.
	if c.nvidiaInstallerURL == "" {
		cacher = installer.NewCacher(hostInstallDir, envReader.BuildNumber(), c.driverVersion)
		isCached, err := cacher.IsCached()
		if err != nil {
			// A failed cache lookup is not fatal: fall through to a fresh
			// installation, but keep the reason visible in verbose logs.
			log.V(2).Infof("Unable to check for a cached installation: %v", err)
		}
		if isCached && err == nil {
			log.V(2).Info("Found cached version, NOT building the drivers.")
			if err := installer.ConfigureCachedInstalltion(hostInstallDir, !c.unsignedDriver); err != nil {
				c.logError(errors.Wrap(err, "failed to configure cached installation"))
				return subcommands.ExitFailure
			}
			if err := installer.VerifyDriverInstallation(); err != nil {
				c.logError(errors.Wrap(err, "failed to verify GPU driver installation"))
				return subcommands.ExitFailure
			}
			if err := modules.UpdateHostLdCache(hostRootPath, filepath.Join(c.hostInstallDir, "lib64")); err != nil {
				c.logError(errors.Wrap(err, "failed to update host ld cache"))
				return subcommands.ExitFailure
			}
			return subcommands.ExitSuccess
		}
	}
	log.V(2).Info("Did not find cached version, installing the drivers...")
	if err := installDriver(c, cacher, envReader, downloader); err != nil {
		c.logError(err)
		return subcommands.ExitFailure
	}
	return subcommands.ExitSuccess
}
// getDriverVersion maps the -version argument to a concrete driver version:
// an empty argument resolves to the default version, "latest" resolves to the
// latest version (both looked up through the installer package using
// downloader), and any other value is returned unchanged.
func getDriverVersion(downloader *cos.GCSDownloader, argVersion string) (string, error) {
	switch argVersion {
	case "":
		return installer.GetDefaultGPUDriverVersion(downloader)
	case "latest":
		return installer.GetLatestGPUDriverVersion(downloader)
	default:
		// argVersion is an actual version, return it as-is.
		return argVersion, nil
	}
}
// remountExecutable creates dir if needed, bind-mounts it onto itself and
// then remounts it with the nosuid, nodev and relatime flags. MS_NOEXEC is
// not among the remount flags.
//
// Returned errors wrap the underlying OS error, so callers can inspect it
// with errors.Is / errors.As.
func remountExecutable(dir string) error {
	if err := os.MkdirAll(dir, 0755); err != nil {
		return fmt.Errorf("failed to create dir %q: %w", dir, err)
	}
	// Bind-mount the directory onto itself.
	if err := syscall.Mount(dir, dir, "", syscall.MS_BIND, ""); err != nil {
		return fmt.Errorf("failed to create bind mount at %q: %w", dir, err)
	}
	if err := syscall.Mount("", dir, "", syscall.MS_REMOUNT|syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_RELATIME, ""); err != nil {
		return fmt.Errorf("failed to remount %q: %w", dir, err)
	}
	return nil
}
// installDriver performs a fresh driver installation for c.
//
// In order, it: configures the installation directories, downloads the driver
// signatures (unless unsigned drivers are allowed), downloads the installer
// (the official one for c.driverVersion, or the one at c.nvidiaInstallerURL),
// sets up the compilation environment and cross toolchain under
// toolchainPkgDir, runs the installer (retrying once with legacy linking when
// the first attempt fails with installer.ErrDriverLoad), caches the result
// when cacher is non-nil, verifies the installation and updates the host ld
// cache. The first failing step aborts the sequence and its error is returned
// with context.
func installDriver(c *InstallCommand, cacher *installer.Cacher, envReader *cos.EnvReader, downloader *cos.GCSDownloader) error {
	callback, err := installer.ConfigureDriverInstallationDirs(filepath.Join(hostRootPath, c.hostInstallDir), envReader.KernelRelease())
	if err != nil {
		return errors.Wrap(err, "failed to configure GPU driver installation dirs")
	}
	// Signal the channel returned above on every exit path.
	// NOTE(review): presumably this triggers cleanup of the configured dirs - confirm.
	defer func() { callback <- 0 }()
	if !c.unsignedDriver {
		if err := signing.DownloadDriverSignatures(downloader, c.driverVersion); err != nil {
			if strings.Contains(err.Error(), "404 Not Found") {
				return fmt.Errorf("The GPU driver is not available for the COS version. Please wait for half a day and retry.")
			}
			return errors.Wrap(err, "failed to download driver signature")
		}
	}
	var installerFile string
	if c.nvidiaInstallerURL == "" {
		installerFile, err = installer.DownloadDriverInstaller(
			c.driverVersion, envReader.Milestone(), envReader.BuildNumber())
		if err != nil {
			return errors.Wrap(err, "failed to download GPU driver installer")
		}
	} else {
		installerFile, err = installer.DownloadToInstallDir(c.nvidiaInstallerURL, "Unofficial GPU driver installer")
		if err != nil {
			return err
		}
	}
	if err := cos.SetCompilationEnv(downloader); err != nil {
		return errors.Wrap(err, "failed to set compilation environment variables")
	}
	// Report the directory that was actually remounted, not its parent.
	if err := remountExecutable(toolchainPkgDir); err != nil {
		return errors.Wrapf(err, "failed to remount %q as executable", toolchainPkgDir)
	}
	if err := cos.InstallCrossToolchain(downloader, toolchainPkgDir); err != nil {
		return errors.Wrap(err, "failed to install toolchain")
	}
	if err := installer.RunDriverInstaller(toolchainPkgDir, installerFile, !c.unsignedDriver, false); err != nil {
		if !errors.Is(err, installer.ErrDriverLoad) {
			return errors.Wrap(err, "failed to run GPU driver installer")
		}
		// Drivers were linked, but couldn't load; try again with legacy linking
		log.Info("Retrying driver installation with legacy linking")
		if err := installer.RunDriverInstaller(toolchainPkgDir, installerFile, !c.unsignedDriver, true); err != nil {
			return errors.Wrap(err, "failed to run GPU driver installer")
		}
	}
	if cacher != nil {
		if err := cacher.Cache(); err != nil {
			return errors.Wrap(err, "failed to cache installation")
		}
	}
	if err := installer.VerifyDriverInstallation(); err != nil {
		return errors.Wrap(err, "failed to verify installation")
	}
	if err := modules.UpdateHostLdCache(hostRootPath, filepath.Join(c.hostInstallDir, "lib64")); err != nil {
		return errors.Wrap(err, "failed to update host ld cache")
	}
	log.Info("Finished installing the drivers.")
	return nil
}
// logError writes err to the error log. In debug mode the %+v verb is used
// instead of %v.
func (c *InstallCommand) logError(err error) {
	verb := "%v"
	if c.debug {
		verb = "%+v"
	}
	log.Errorf(verb, err)
}
// isGpuConfigured reports whether the output of lspci contains "nvidia"
// (case-insensitive). It runs the pipeline through /bin/bash and treats an
// exit status equal to grepFound as a match; a failure to run the command is
// returned as an error.
func (c *InstallCommand) isGpuConfigured() (bool, error) {
	const pipeline = "lspci | grep -i \"nvidia\""
	exitCode, runErr := utils.RunCommandWithExitCode([]string{"/bin/bash", "-c", pipeline}, "", nil)
	if runErr != nil {
		return false, runErr
	}
	return exitCode == grepFound, nil
}