Merge "cos-gpu-installer-v2: Add installer package"
diff --git a/src/cmd/cos_gpu_installer/internal/installer/cache.go b/src/cmd/cos_gpu_installer/internal/installer/cache.go
new file mode 100644
index 0000000..d391b16
--- /dev/null
+++ b/src/cmd/cos_gpu_installer/internal/installer/cache.go
@@ -0,0 +1,76 @@
+package installer
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+
+ log "github.com/golang/glog"
+ "github.com/pkg/errors"
+
+ "pkg/utils"
+)
+
+const (
+ cacheFile = ".cache" // name of the cache file, resolved relative to the GPU install dir
+ buildNumberKey = "BUILD_ID" // key under which the build number is stored in the cache file
+ driverVersionKey = "DRIVER_VERSION" // key under which the driver version is stored in the cache file
+)
+
+// Cacher records, and later checks, which GPU driver installation is cached on disk.
+// An installation is identified by a build number and a driver version.
+type Cacher struct {
+ gpuInstallDir string // directory that holds the cache file
+ buildNumber string // build number written by Cache and compared by IsCached
+ driverVersion string // driver version written by Cache and compared by IsCached
+}
+
+// NewCacher builds a Cacher for the given build number and driver version.
+// An empty gpuInstallDir selects the default directory gpuInstallDirContainer.
+func NewCacher(gpuInstallDir, buildNumber, driverVersion string) *Cacher {
+	installDir := gpuInstallDir
+	if installDir == "" {
+		installDir = gpuInstallDirContainer
+	}
+	return &Cacher{
+		gpuInstallDir: installDir,
+		buildNumber:   buildNumber,
+		driverVersion: driverVersion,
+	}
+}
+
+// Cache records on disk that the GPU driver described by c has been installed.
+//
+// It creates (or truncates) the cache file inside c.gpuInstallDir and writes
+// the build number and driver version as KEY=VALUE lines, always in the same
+// order. An error is returned if the file cannot be created, written or
+// closed.
+func (c *Cacher) Cache() error {
+	cachePath := filepath.Join(c.gpuInstallDir, cacheFile)
+	f, err := os.Create(cachePath)
+	if err != nil {
+		return errors.Wrapf(err, "Failed to create file %s", cachePath)
+	}
+
+	// A slice (not a map) keeps the line order of the cache file and of the
+	// log output deterministic.
+	entries := []struct{ key, value string }{
+		{buildNumberKey, c.buildNumber},
+		{driverVersionKey, c.driverVersion},
+	}
+
+	var cache string
+	for _, e := range entries {
+		cache += fmt.Sprintf("%s=%s\n", e.key, e.value)
+	}
+
+	if _, err = f.WriteString(cache); err != nil {
+		// Best effort: the write error is the one worth reporting.
+		f.Close()
+		return errors.Wrapf(err, "Failed to write to file %s", cachePath)
+	}
+	// Close explicitly so that a failure to flush the data is not lost.
+	if err = f.Close(); err != nil {
+		return errors.Wrapf(err, "Failed to write to file %s", cachePath)
+	}
+
+	log.Info("Updated cached version as")
+	for _, e := range entries {
+		log.Infof("%s=%s", e.key, e.value)
+	}
+	return nil
+}
+
+// IsCached reports whether the cache file in c.gpuInstallDir records the same
+// build number and driver version as c. A failure to load the cache file is
+// logged and returned to the caller.
+func (c *Cacher) IsCached() (bool, error) {
+	cached, loadErr := utils.LoadEnvFromFile(c.gpuInstallDir, cacheFile)
+	if loadErr != nil {
+		log.Infof("error: %v", loadErr)
+		return false, loadErr
+	}
+	log.Infof("%v", cached)
+
+	// Both recorded values have to match for the installation to count as cached.
+	if cached[buildNumberKey] != c.buildNumber {
+		return false, nil
+	}
+	return cached[driverVersionKey] == c.driverVersion, nil
+}
diff --git a/src/cmd/cos_gpu_installer/internal/installer/cache_test.go b/src/cmd/cos_gpu_installer/internal/installer/cache_test.go
new file mode 100644
index 0000000..24768dc
--- /dev/null
+++ b/src/cmd/cos_gpu_installer/internal/installer/cache_test.go
@@ -0,0 +1,42 @@
+package installer
+
+import (
+ "io/ioutil"
+ "os"
+ "testing"
+)
+
+// TestIsCached writes a cache entry into a temporary directory and checks that
+// IsCached is true only when both the build number and the driver version
+// match what was written.
+func TestIsCached(t *testing.T) {
+	testDir, err := ioutil.TempDir("", "testing")
+	if err != nil {
+		t.Fatalf("Failed to create tempdir: %v", err)
+	}
+	defer os.RemoveAll(testDir)
+
+	cacher := NewCacher(testDir, "12688.0.0", "418.67")
+	if err := cacher.Cache(); err != nil {
+		t.Fatalf("Failed to cache: %v", err)
+	}
+
+	for _, tc := range []struct {
+		testName      string
+		buildNumber   string
+		driverVersion string
+		expectOut     bool
+	}{
+		{"TestIsCachedTrue", "12688.0.0", "418.67", true},
+		{"TestIsCachedWrongBuild", "12670.0.0", "418.67", false},
+		{"TestIsCachedWrongDriver", "12688.0.0", "418.00", false},
+	} {
+		t.Run(tc.testName, func(t *testing.T) {
+			testCacher := NewCacher(testDir, tc.buildNumber, tc.driverVersion)
+			out, err := testCacher.IsCached()
+			if err != nil {
+				t.Fatalf("Failed to check cache result: %v", err)
+			}
+			if out != tc.expectOut {
+				t.Errorf("Unexpected cache result: want: %v, got: %v", tc.expectOut, out)
+			}
+		})
+	}
+}
diff --git a/src/cmd/cos_gpu_installer/internal/installer/installer.go b/src/cmd/cos_gpu_installer/internal/installer/installer.go
new file mode 100644
index 0000000..dd6db07
--- /dev/null
+++ b/src/cmd/cos_gpu_installer/internal/installer/installer.go
@@ -0,0 +1,303 @@
+// Package installer provides functionality to install GPU drivers.
+package installer
+
+import (
+ "fmt"
+ "io/ioutil"
+ "os"
+ "os/exec"
+ "path"
+ "path/filepath"
+ "strings"
+ "syscall"
+
+ "cmd/cos_gpu_installer/internal/signing"
+ "pkg/cos"
+ "pkg/modules"
+ "pkg/utils"
+
+ log "github.com/golang/glog"
+ "github.com/pkg/errors"
+)
+
+const (
+ gpuInstallDirContainer = "/usr/local/nvidia" // GPU driver install dir; also the target of the host dir bind mount
+ defaultGPUDriverFile = "default_version" // extension artifact read by GetDefaultGPUDriverVersion
+ precompiledInstallerURLFormat = "https://storage.googleapis.com/nvidia-drivers-%s-public/nvidia-cos-project/%s/tesla/%s_00/%s/NVIDIA-Linux-x86_64-%s_%s-%s.cos" // verbs are filled in by getPrecompiledInstallerURL
+ defaultFilePermission = 0755 // mode passed to os.MkdirAll for directories created in this file
+)
+
+// VerifyDriverInstallation runs some commands to verify the driver installation.
+//
+// It prepends the bin directory under gpuInstallDirContainer to this process's
+// PATH, runs nvidia-smi, and then runs nvidia-modprobe to create the unified
+// memory device file. The first failing step is returned as an error.
+func VerifyDriverInstallation() error {
+	log.Info("Verifying GPU driver installation")
+
+	newPathEnv := fmt.Sprintf("%s/bin", gpuInstallDirContainer)
+	// Append the previous PATH only when it is non-empty: a trailing ":" would
+	// add an empty entry, which is searched as the current directory.
+	if oldPath := os.Getenv("PATH"); oldPath != "" {
+		newPathEnv = fmt.Sprintf("%s:%s", newPathEnv, oldPath)
+	}
+	if err := os.Setenv("PATH", newPathEnv); err != nil {
+		return errors.Wrap(err, "failed to update PATH")
+	}
+
+	// Run nvidia-smi to check whether nvidia GPU driver is installed.
+	if err := utils.RunCommandAndLogOutput(exec.Command("nvidia-smi"), false); err != nil {
+		return errors.Wrap(err, "failed to verify GPU driver installation")
+	}
+
+	// Create unified memory device file.
+	if err := utils.RunCommandAndLogOutput(exec.Command("nvidia-modprobe", "-c0", "-u"), false); err != nil {
+		return errors.Wrap(err, "failed to create unified memory device file")
+	}
+
+	return nil
+}
+
+// ConfigureCachedInstalltion updates ldconfig and installs the cached GPU driver kernel modules.
+func ConfigureCachedInstalltion(gpuInstallDirHost string, needSigned bool) error {
+ log.Info("Configuring cached driver installation")
+
+ if err := createHostDirBindMount(gpuInstallDirHost, gpuInstallDirContainer); err != nil {
+ return errors.Wrap(err, "failed to create driver installation dir")
+ }
+ if err := updateContainerLdCache(); err != nil {
+ return errors.Wrap(err, "failed to configure cached driver installation")
+ }
+ if err := loadGPUDrivers(needSigned); err != nil {
+ return errors.Wrap(err, "failed to configure cached driver installation")
+ }
+
+ return nil
+}
+
+// DownloadDriverInstaller fetches the GPU driver installer for the given driver
+// version and COS milestone/build number into gpuInstallDirContainer.
+// On success it returns the file name (not the full path) of the downloaded
+// installer.
+func DownloadDriverInstaller(driverVersion, cosMilestone, cosBuildNumber string) (string, error) {
+	log.Infof("Downloading GPU driver installer version %s", driverVersion)
+
+	url, urlErr := getDriverInstallerDownloadURL(driverVersion, cosMilestone, cosBuildNumber)
+	if urlErr != nil {
+		return "", errors.Wrap(urlErr, "failed to get driver installer download URL")
+	}
+
+	// The local file keeps the last path element of the download URL as its name.
+	destination := filepath.Join(gpuInstallDirContainer, path.Base(url))
+	downloadErr := utils.DownloadContentFromURL(url, destination, "GPU driver installer")
+	if downloadErr != nil {
+		return "", errors.Wrapf(downloadErr, "failed to download GPU driver installer version %s", driverVersion)
+	}
+	return filepath.Base(destination), nil
+}
+
+// ConfigureDriverInstallationDirs configures GPU driver installation directories by creating mounts.
+//
+// It bind-mounts gpuInstallDirHost onto gpuInstallDirContainer, then overlays
+// /usr/bin, /usr/lib/x86_64-linux-gnu and /lib/modules/<kernelRelease>/video
+// with upper dirs located under gpuInstallDirContainer, and finally refreshes
+// the container's ld cache.
+//
+// On success it returns a channel; sending a value on it (or closing it)
+// makes a background goroutine unmount everything that was mounted here.
+// If a step fails, the mounts created up to that point are unmounted before
+// the error is returned.
+func ConfigureDriverInstallationDirs(gpuInstallDirHost string, kernelRelease string) (chan<- int, error) {
+	log.Info("Configuring driver installation directories")
+
+	if err := createHostDirBindMount(gpuInstallDirHost, gpuInstallDirContainer); err != nil {
+		return nil, errors.Wrap(err, "failed to create driver installation dir")
+	}
+
+	// Mount points created so far, in creation order.
+	mounted := []string{gpuInstallDirContainer}
+	// unmountAll releases the mounts in reverse creation order: the overlays
+	// use upper dirs inside the bind mount, so the bind mount goes last.
+	unmountAll := func() {
+		for i := len(mounted) - 1; i >= 0; i-- {
+			if err := syscall.Unmount(mounted[i], 0); err != nil {
+				log.Warningf("failed to unmount %s: %v", mounted[i], err)
+			}
+		}
+	}
+
+	modulePath := filepath.Join("/lib/modules", kernelRelease, "video")
+	overlays := []struct{ name, lowerDir string }{
+		{"bin", "/usr/bin"},
+		{"lib64", "/usr/lib/x86_64-linux-gnu"},
+		{"drivers", modulePath},
+	}
+	for _, o := range overlays {
+		upperDir := gpuInstallDirContainer + "/" + o.name
+		if err := createOverlayFS(o.lowerDir, upperDir, upperDir+"-workdir"); err != nil {
+			unmountAll()
+			return nil, errors.Wrapf(err, "failed to create %s overlay", o.name)
+		}
+		mounted = append(mounted, o.lowerDir)
+	}
+
+	if err := updateContainerLdCache(); err != nil {
+		unmountAll()
+		return nil, errors.Wrap(err, "failed to update container ld cache")
+	}
+
+	ch := make(chan int, 1)
+	go func() {
+		// cleans up mounts created above.
+		<-ch
+		unmountAll()
+	}()
+	return ch, nil
+}
+
+// RunDriverInstaller runs GPU driver installer.
+//
+// installerFilename is resolved relative to gpuInstallDirContainer. The
+// installer is first extracted to a fixed directory and nvidia-installer is
+// then run from there.
+//
+// When needSigned is false the installer is run once. When needSigned is true
+// the installer is run to build the kernel modules, each extracted *.ko gets
+// its signature appended and is placed under gpuInstallDirContainer/drivers,
+// the public key is copied next to them, the signed modules are loaded, and
+// the installer is run a second time with --no-kernel-module to install the
+// user space libraries only.
+func RunDriverInstaller(installerFilename string, needSigned bool) error {
+	log.Info("Running GPU driver installer")
+
+	// Extract files to a fixed path first to make sure md5sum of generated gpu drivers are consistent.
+	extractDir := "/tmp/extract"
+	cmd := exec.Command("sh", installerFilename, "-x", "--target", extractDir)
+	cmd.Dir = gpuInstallDirContainer
+	if err := cmd.Run(); err != nil {
+		return errors.Wrap(err, "failed to extract installer files")
+	}
+
+	cmd = exec.Command(filepath.Join(extractDir, "nvidia-installer"),
+		"--utility-prefix="+gpuInstallDirContainer,
+		"--opengl-prefix="+gpuInstallDirContainer,
+		"--no-install-compat32-libs",
+		"--log-file-name="+filepath.Join(gpuInstallDirContainer, "nvidia-installer.log"),
+		"--silent",
+		"--accept-license",
+	)
+
+	log.Infof("Installer arguments:\n%v", cmd.Args)
+
+	if !needSigned {
+		if err := utils.RunCommandAndLogOutput(cmd, false); err != nil {
+			return errors.Wrap(err, "failed to run GPU driver installer")
+		}
+		return nil
+	}
+
+	// Run installer to compile drivers. Expect the command to fail as the drivers are not signed yet.
+	if err := utils.RunCommandAndLogOutput(cmd, true); err != nil {
+		return errors.Wrap(err, "failed to run GPU driver installer")
+	}
+
+	// sign GPU drivers.
+	kernelDir := filepath.Join(extractDir, "kernel")
+	kernelFiles, err := ioutil.ReadDir(kernelDir)
+	if err != nil {
+		return errors.Wrapf(err, "failed to list files in directory %s", kernelDir)
+	}
+	for _, kernelFile := range kernelFiles {
+		module := kernelFile.Name()
+		if !strings.HasSuffix(module, ".ko") {
+			continue
+		}
+		signaturePath := signing.GetModuleSignature(module)
+		modulePath := filepath.Join(kernelDir, module)
+		signedModulePath := filepath.Join(gpuInstallDirContainer, "drivers", module)
+		if err := modules.AppendSignature(signedModulePath, modulePath, signaturePath); err != nil {
+			return errors.Wrapf(err, "failed to sign kernel module %s", module)
+		}
+	}
+
+	// Copy public key. The result must be bound to err here; otherwise the
+	// check would look at the (nil) error left over from ReadDir.
+	if err := utils.CopyFile(signing.GetPublicKeyDer(), filepath.Join(gpuInstallDirContainer, "pubkey.der")); err != nil {
+		return errors.Wrapf(err, "failed to copy file %s", signing.GetPublicKeyDer())
+	}
+
+	// Finally, load signed GPU drivers.
+	if err := loadGPUDrivers(needSigned); err != nil {
+		return errors.Wrap(err, "failed to load GPU drivers")
+	}
+
+	// Run installer again to only install user space libraries.
+	// An exec.Cmd cannot be reused once it has run, so build a fresh one
+	// from the same path and arguments.
+	cmd = exec.Command(cmd.Path, cmd.Args[1:]...)
+	cmd.Args = append(cmd.Args, "--no-kernel-module")
+	if err := utils.RunCommandAndLogOutput(cmd, true); err != nil {
+		return errors.Wrap(err, "failed to run GPU driver installer")
+	}
+
+	return nil
+}
+
+// GetDefaultGPUDriverVersion returns the default GPU driver version, read from
+// the defaultGPUDriverFile artifact of the GPU extension via downloader.
+// Leading and trailing newlines and spaces are stripped from the content.
+func GetDefaultGPUDriverVersion(downloader cos.ExtensionsDownloader) (string, error) {
+	log.Info("Getting the default GPU driver version")
+
+	raw, fetchErr := downloader.GetExtensionArtifact(cos.GpuExtension, defaultGPUDriverFile)
+	if fetchErr != nil {
+		return "", errors.Wrap(fetchErr, "failed to get default GPU driver version")
+	}
+
+	version := strings.Trim(string(raw), "\n ")
+	return version, nil
+}
+
+// updateContainerLdCache makes the dynamic linker aware of the GPU libraries:
+// it writes the lib64 directory under gpuInstallDirContainer to
+// /etc/ld.so.conf.d/nvidia.conf and then runs ldconfig.
+// Any failure to create, write or close the config file, or to run ldconfig,
+// is returned as an error.
+func updateContainerLdCache() error {
+	log.Info("Updating container's ld cache")
+
+	f, err := os.Create("/etc/ld.so.conf.d/nvidia.conf")
+	if err != nil {
+		// f is nil here, so there is nothing to close.
+		return errors.Wrap(err, "failed to update ld cache")
+	}
+	if _, err := f.WriteString(gpuInstallDirContainer + "/lib64"); err != nil {
+		// Best effort: the write error is the one worth reporting.
+		f.Close()
+		return errors.Wrap(err, "failed to update ld cache")
+	}
+	// Check Close as well so ldconfig never runs against a partial config.
+	if err := f.Close(); err != nil {
+		return errors.Wrap(err, "failed to update ld cache")
+	}
+
+	if err := exec.Command("ldconfig").Run(); err != nil {
+		return errors.Wrap(err, "failed to update ld cache")
+	}
+	return nil
+}
+
+// getDriverInstallerDownloadURL builds the download URL of the precompiled
+// driver installer. The download location is derived from the "zone" GCE
+// metadata value.
+func getDriverInstallerDownloadURL(driverVersion, cosMilestone, cosBuildNumber string) (string, error) {
+	zone, zoneErr := utils.GetGCEMetadata("zone")
+	if zoneErr != nil {
+		return "", errors.Wrap(zoneErr, "failed to get GCE metadata zone")
+	}
+
+	location := getInstallerDownloadLocation(zone)
+	url := getPrecompiledInstallerURL(driverVersion, cosMilestone, cosBuildNumber, location)
+	return url, nil
+}
+
+// getInstallerDownloadLocation maps a zone string such as
+// "projects/123456789/zones/europe-west1-b" to a download location.
+// Only the text after the last "/" is considered, and of that only the part
+// before the first "-": "us" -> "us", "asia" -> "asia", "europe" -> "eu".
+// Anything else falls back to "us".
+func getInstallerDownloadLocation(metadataZone string) string {
+	// LastIndex returns -1 when there is no "/", which keeps the whole string.
+	zone := metadataZone[strings.LastIndex(metadataZone, "/")+1:]
+
+	prefix := zone
+	if dash := strings.Index(zone, "-"); dash >= 0 {
+		prefix = zone[:dash]
+	}
+
+	switch prefix {
+	case "us":
+		return "us"
+	case "asia":
+		return "asia"
+	case "europe":
+		return "eu"
+	default:
+		return "us"
+	}
+}
+
+// getPrecompiledInstallerURL fills precompiledInstallerURLFormat with the
+// download location, COS milestone, driver major version, driver version and
+// dash-separated COS build number.
+func getPrecompiledInstallerURL(driverVersion, cosMilestone, cosBuildNumber, downloadLocation string) string {
+	// Major version is the text before the first dot, e.g. 418.67 -> 418.
+	major := driverVersion
+	if dot := strings.Index(driverVersion, "."); dot >= 0 {
+		major = driverVersion[:dot]
+	}
+
+	// Dots become dashes, e.g. 12371.284.0 -> 12371-284-0.
+	dashedBuild := strings.Join(strings.Split(cosBuildNumber, "."), "-")
+
+	return fmt.Sprintf(
+		precompiledInstallerURLFormat,
+		downloadLocation,
+		cosMilestone,
+		major,
+		driverVersion,
+		driverVersion,
+		cosMilestone,
+		dashedBuild,
+	)
+}
+
+func createHostDirBindMount(hostDir, bindMountPath string) error {
+ if err := os.MkdirAll(hostDir, defaultFilePermission); err != nil {
+ return errors.Wrapf(err, "failed to create dir %s", hostDir)
+ }
+ if err := os.MkdirAll(bindMountPath, defaultFilePermission); err != nil {
+ return errors.Wrapf(err, "failed to create dir %s", bindMountPath)
+ }
+ if err := syscall.Mount(hostDir, bindMountPath, "", syscall.MS_BIND, ""); err != nil {
+ return errors.Wrapf(err, "failed to create bind mount %s", bindMountPath)
+ }
+ // Remount to clear noexec flag.
+ if err := syscall.Mount("", bindMountPath, "",
+ syscall.MS_REMOUNT|syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_RELATIME, ""); err != nil {
+ return errors.Wrapf(err, "failed to remount %s", bindMountPath)
+ }
+ return nil
+}
+
+// createOverlayFS creates lowerDir, upperDir and workDir if needed and mounts
+// an overlay filesystem on top of lowerDir, using those three directories as
+// the overlay's lowerdir, upperdir and workdir options.
+func createOverlayFS(lowerDir, upperDir, workDir string) error {
+	for _, dir := range []string{lowerDir, upperDir, workDir} {
+		if err := os.MkdirAll(dir, defaultFilePermission); err != nil {
+			return errors.Wrapf(err, "failed to create dir %s", dir)
+		}
+	}
+
+	options := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerDir, upperDir, workDir)
+	// The overlay is mounted over lowerDir itself.
+	mountErr := syscall.Mount("none", lowerDir, "overlay", 0, options)
+	if mountErr != nil {
+		return errors.Wrapf(mountErr, "failed to create overlayfs (lowerdir=%s, upperdir=%s)", lowerDir, upperDir)
+	}
+	return nil
+}
+
+// loadGPUDrivers loads the GPU kernel modules found in the drivers directory
+// under gpuInstallDirContainer. When needSigned is true, the public key
+// pubkey.der from gpuInstallDirContainer is loaded first.
+func loadGPUDrivers(needSigned bool) error {
+	if needSigned {
+		if err := modules.LoadPublicKey("gpu-key", filepath.Join(gpuInstallDirContainer, "pubkey.der")); err != nil {
+			return errors.Wrap(err, "failed to load public key")
+		}
+	}
+	// Need to load modules in order due to module dependency. A slice is used
+	// because ranging over a map visits entries in an unspecified order.
+	gpuModules := []struct{ name, file string }{
+		{"nvidia", "nvidia.ko"},
+		{"nvidia_uvm", "nvidia-uvm.ko"},
+		{"nvidia_drm", "nvidia-drm.ko"},
+	}
+	for _, m := range gpuModules {
+		modulePath := filepath.Join(gpuInstallDirContainer, "drivers", m.file)
+		if err := modules.LoadModule(m.name, modulePath); err != nil {
+			return errors.Wrapf(err, "failed to load module %s", modulePath)
+		}
+	}
+	return nil
+}
diff --git a/src/cmd/cos_gpu_installer/internal/installer/installer_test.go b/src/cmd/cos_gpu_installer/internal/installer/installer_test.go
new file mode 100644
index 0000000..d283761
--- /dev/null
+++ b/src/cmd/cos_gpu_installer/internal/installer/installer_test.go
@@ -0,0 +1,47 @@
+package installer
+
+import (
+ "testing"
+)
+
+func TestGetInstallerDownloadLocation(t *testing.T) {
+ for _, tc := range []struct {
+ testName string
+ metadataZone string
+ expectedLocation string
+ }{
+ {
+ "us-west1-b",
+ "projects/123456789/zones/us-west1-b",
+ "us",
+ },
+ {
+ "asia-east1-a",
+ "projects/123456789/zones/asia-east1-a",
+ "asia",
+ },
+ {
+ "europe-west1-b",
+ "projects/123456789/zones/europe-west1-b",
+ "eu",
+ },
+ {
+ "australia-southeast1-a",
+ "projects/123456789/zones/australia-southeast1-a",
+ "us",
+ },
+ } {
+ location := getInstallerDownloadLocation(tc.metadataZone)
+ if location != tc.expectedLocation {
+ t.Errorf("%s: expect location: %s, got: %s", tc.testName, tc.expectedLocation, location)
+ }
+ }
+}
+
+func TestGetPrecompiledInstallerURL(t *testing.T) {
+ ret := getPrecompiledInstallerURL("418.116.00", "73", "11647.415.0", "us")
+ expectedRet := "https://storage.googleapis.com/nvidia-drivers-us-public/nvidia-cos-project/73/tesla/418_00/418.116.00/NVIDIA-Linux-x86_64-418.116.00_73-11647-415-0.cos"
+ if ret != expectedRet {
+ t.Errorf("Unexpected return, want: %s, got: %s", expectedRet, ret)
+ }
+}