Merge "cos-gpu-installer-v2: Add installer package"
diff --git a/src/cmd/cos_gpu_installer/internal/installer/cache.go b/src/cmd/cos_gpu_installer/internal/installer/cache.go
new file mode 100644
index 0000000..d391b16
--- /dev/null
+++ b/src/cmd/cos_gpu_installer/internal/installer/cache.go
@@ -0,0 +1,76 @@
+package installer
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+
+	log "github.com/golang/glog"
+	"github.com/pkg/errors"
+
+	"pkg/utils"
+)
+
+const (
+	cacheFile        = ".cache"         // name of the cache file inside the GPU install dir
+	buildNumberKey   = "BUILD_ID"       // key under which the build number is stored
+	driverVersionKey = "DRIVER_VERSION" // key under which the driver version is stored
+)
+
+// Cacher records in a cache file which GPU driver version was installed for which build.
+type Cacher struct {
+	gpuInstallDir string // directory that holds the cache file
+	buildNumber   string // value stored and compared under buildNumberKey
+	driverVersion string // value stored and compared under driverVersionKey
+}
+
+// NewCacher builds a Cacher; an empty gpuInstallDir selects gpuInstallDirContainer.
+func NewCacher(gpuInstallDir, buildNumber, driverVersion string) *Cacher {
+	installDir := gpuInstallDirContainer
+	if gpuInstallDir != "" {
+		installDir = gpuInstallDir
+	}
+	return &Cacher{gpuInstallDir: installDir, buildNumber: buildNumber, driverVersion: driverVersion}
+}
+
+// Cache writes the build number and driver version to the cache file in
+// c.gpuInstallDir, returning an error if the file cannot be created or written.
+func (c *Cacher) Cache() error {
+	cachePath := filepath.Join(c.gpuInstallDir, cacheFile)
+	f, err := os.Create(cachePath)
+	if err != nil {
+		return errors.Wrapf(err, "Failed to create file %s", cachePath)
+	}
+
+	// A slice (not a map) keeps the file content and log order deterministic.
+	entries := []string{
+		fmt.Sprintf("%s=%s", buildNumberKey, c.buildNumber),
+		fmt.Sprintf("%s=%s", driverVersionKey, c.driverVersion)}
+	_, err = f.WriteString(entries[0] + "\n" + entries[1] + "\n")
+	// Close unconditionally; a failed Close can mean the data never hit disk.
+	if closeErr := f.Close(); err == nil {
+		err = closeErr
+	}
+	if err != nil {
+		return errors.Wrapf(err, "Failed to write to file %s", cachePath)
+	}
+
+	log.Info("Updated cached version as")
+	for _, entry := range entries {
+		log.Info(entry)
+	}
+	return nil
+}
+
+// IsCached reports whether the cache file under c.gpuInstallDir records the same build
+// number and driver version as c. It returns the error from utils.LoadEnvFromFile, if any.
+func (c *Cacher) IsCached() (bool, error) {
+	cached, err := utils.LoadEnvFromFile(c.gpuInstallDir, cacheFile)
+	if err != nil {
+		log.Infof("error: %v", err)
+		return false, err
+	}
+	log.Infof("%v", cached)
+	sameBuild := cached[buildNumberKey] == c.buildNumber
+	return sameBuild && cached[driverVersionKey] == c.driverVersion, nil
+}
diff --git a/src/cmd/cos_gpu_installer/internal/installer/cache_test.go b/src/cmd/cos_gpu_installer/internal/installer/cache_test.go
new file mode 100644
index 0000000..24768dc
--- /dev/null
+++ b/src/cmd/cos_gpu_installer/internal/installer/cache_test.go
@@ -0,0 +1,42 @@
+package installer
+
+import (
+	"io/ioutil"
+	"os"
+	"testing"
+)
+
+func TestIsCached(t *testing.T) {
+	cacheDir, err := ioutil.TempDir("", "testing")
+	if err != nil {
+		t.Fatalf("Failed to create tempdir: %v", err)
+	}
+	defer os.RemoveAll(cacheDir)
+
+	// Seed the cache once; every case below is compared against this entry.
+	if err := NewCacher(cacheDir, "12688.0.0", "418.67").Cache(); err != nil {
+		t.Fatalf("Failed to cache: %v", err)
+	}
+
+	testCases := []struct {
+		testName      string
+		buildNumber   string
+		driverVersion string
+		expectOut     bool
+	}{
+		{"TestIsCachedTrue", "12688.0.0", "418.67", true},
+		{"TestIsCachedWrongBuild", "12670.0.0", "418.67", false},
+		{"TestIsCachedWrongDriver", "12688.0.0", "418.00", false},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.testName, func(t *testing.T) {
+			got, err := NewCacher(cacheDir, tc.buildNumber, tc.driverVersion).IsCached()
+			if err != nil {
+				t.Fatalf("Failed to check cache result: %v", err)
+			}
+			if got != tc.expectOut {
+				t.Errorf("Unexpected cache result: want :%v, got: %v", tc.expectOut, got)
+			}
+		})
+	}
+}
diff --git a/src/cmd/cos_gpu_installer/internal/installer/installer.go b/src/cmd/cos_gpu_installer/internal/installer/installer.go
new file mode 100644
index 0000000..dd6db07
--- /dev/null
+++ b/src/cmd/cos_gpu_installer/internal/installer/installer.go
@@ -0,0 +1,303 @@
+// Package installer provides functionality to install GPU drivers.
+package installer
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"path"
+	"path/filepath"
+	"strings"
+	"syscall"
+
+	"cmd/cos_gpu_installer/internal/signing"
+	"pkg/cos"
+	"pkg/modules"
+	"pkg/utils"
+
+	log "github.com/golang/glog"
+	"github.com/pkg/errors"
+)
+
+const (
+	gpuInstallDirContainer        = "/usr/local/nvidia" // container path the host install dir is bind-mounted at
+	defaultGPUDriverFile          = "default_version"   // extension artifact read for the default driver version
+	precompiledInstallerURLFormat = "https://storage.googleapis.com/nvidia-drivers-%s-public/nvidia-cos-project/%s/tesla/%s_00/%s/NVIDIA-Linux-x86_64-%s_%s-%s.cos"
+	defaultFilePermission         = 0755 // mode passed to os.MkdirAll for created directories
+)
+
+// VerifyDriverInstallation prepends the driver's bin directory to PATH, then runs
+// nvidia-smi and nvidia-modprobe to verify the driver installation.
+func VerifyDriverInstallation() error {
+	log.Info("Verifying GPU driver installation")
+
+	os.Setenv("PATH", fmt.Sprintf("%s/bin:%s", gpuInstallDirContainer, os.Getenv("PATH")))
+	// Run nvidia-smi to check whether nvidia GPU driver is installed.
+	smi := exec.Command("nvidia-smi")
+	if err := utils.RunCommandAndLogOutput(smi, false); err != nil {
+		return errors.Wrap(err, "failed to verify GPU driver installation")
+	}
+	// Create unified memory device file.
+	modprobe := exec.Command("nvidia-modprobe", "-c0", "-u")
+	if err := utils.RunCommandAndLogOutput(modprobe, false); err != nil {
+		return errors.Wrap(err, "failed to create unified memory device file")
+	}
+	return nil
+}
+
+// ConfigureCachedInstalltion prepares a previously cached driver installation:
+// it bind-mounts gpuInstallDirHost onto the container install dir, refreshes the
+// ld cache and loads the GPU kernel modules (needSigned is forwarded to the loader).
+func ConfigureCachedInstalltion(gpuInstallDirHost string, needSigned bool) error {
+	log.Info("Configuring cached driver installation")
+
+	if err := createHostDirBindMount(gpuInstallDirHost, gpuInstallDirContainer); err != nil {
+		return errors.Wrap(err, "failed to create driver installation dir")
+	}
+	// errors.Wrap returns nil for a nil error, so one wrap covers both steps.
+	err := updateContainerLdCache()
+	if err == nil {
+		err = loadGPUDrivers(needSigned)
+	}
+	return errors.Wrap(err, "failed to configure cached driver installation")
+}
+
+// DownloadDriverInstaller downloads the installer into the container install dir; returns its file name.
+func DownloadDriverInstaller(driverVersion, cosMilestone, cosBuildNumber string) (string, error) {
+	log.Infof("Downloading GPU driver installer version %s", driverVersion)
+	installerURL, err := getDriverInstallerDownloadURL(driverVersion, cosMilestone, cosBuildNumber)
+	if err != nil {
+		return "", errors.Wrap(err, "failed to get driver installer download URL")
+	}
+	dest := filepath.Join(gpuInstallDirContainer, path.Base(installerURL))
+	if err := utils.DownloadContentFromURL(installerURL, dest, "GPU driver installer"); err != nil {
+		return "", errors.Wrapf(err, "failed to download GPU driver installer version %s", driverVersion)
+	}
+	return filepath.Base(dest), nil
+}
+
+// ConfigureDriverInstallationDirs bind-mounts gpuInstallDirHost at the container install dir
+// and overlays /usr/bin, the x86_64 lib dir and the kernel's video module dir with dirs in it.
+// Sending a value on the returned channel unmounts everything mounted here.
+func ConfigureDriverInstallationDirs(gpuInstallDirHost string, kernelRelease string) (chan<- int, error) {
+	log.Info("Configuring driver installation directories")
+
+	if err := createHostDirBindMount(gpuInstallDirHost, gpuInstallDirContainer); err != nil {
+		return nil, errors.Wrap(err, "failed to create driver installation dir")
+	}
+	modulePath := filepath.Join("/lib/modules", kernelRelease, "video")
+	overlays := []struct{ lowerDir, name string }{
+		{"/usr/bin", "bin"},
+		{"/usr/lib/x86_64-linux-gnu", "lib64"},
+		{modulePath, "drivers"},
+	}
+	for _, o := range overlays {
+		upperDir := filepath.Join(gpuInstallDirContainer, o.name)
+		if err := createOverlayFS(o.lowerDir, upperDir, upperDir+"-workdir"); err != nil {
+			return nil, errors.Wrapf(err, "failed to create %s overlay", o.name)
+		}
+	}
+
+	if err := updateContainerLdCache(); err != nil {
+		return nil, errors.Wrap(err, "failed to update container ld cache")
+	}
+
+	ch := make(chan int, 1)
+	go func() {
+		// Wait for the caller's signal, then unmount the overlays before the bind mount.
+		<-ch
+		for _, o := range overlays {
+			syscall.Unmount(o.lowerDir, 0)
+		}
+		syscall.Unmount(gpuInstallDirContainer, 0)
+	}()
+	return ch, nil
+}
+
+// RunDriverInstaller extracts installerFilename into /tmp/extract and runs nvidia-installer.
+// With needSigned it also signs the built kernel modules, copies the public key, loads the
+// drivers and re-runs the installer with --no-kernel-module for the user space libraries.
+func RunDriverInstaller(installerFilename string, needSigned bool) error {
+	log.Info("Running GPU driver installer")
+
+	// Extract files to a fixed path first to make sure md5sum of generated gpu drivers are consistent.
+	extractDir := "/tmp/extract"
+	cmd := exec.Command("sh", installerFilename, "-x", "--target", extractDir)
+	cmd.Dir = gpuInstallDirContainer
+	if err := cmd.Run(); err != nil {
+		return errors.Wrap(err, "failed to extract installer files")
+	}
+
+	cmd = exec.Command(filepath.Join(extractDir, "nvidia-installer"),
+		"--utility-prefix="+gpuInstallDirContainer,
+		"--opengl-prefix="+gpuInstallDirContainer,
+		"--no-install-compat32-libs",
+		"--log-file-name="+filepath.Join(gpuInstallDirContainer, "nvidia-installer.log"),
+		"--silent",
+		"--accept-license",
+	)
+	log.Infof("Installer arguments:\n%v", cmd.Args)
+
+	if needSigned {
+		// Run installer to compile drivers. Expect the command to fail as the drivers are not signed yet.
+		if err := utils.RunCommandAndLogOutput(cmd, true); err != nil {
+			return errors.Wrap(err, "failed to run GPU driver installer")
+		}
+		// sign GPU drivers.
+		kernelFiles, err := ioutil.ReadDir(filepath.Join(extractDir, "kernel"))
+		if err != nil {
+			return errors.Wrapf(err, "failed to list files in directory %s", filepath.Join(extractDir, "kernel"))
+		}
+		for _, kernelFile := range kernelFiles {
+			if strings.HasSuffix(kernelFile.Name(), ".ko") {
+				module := kernelFile.Name()
+				signaturePath := signing.GetModuleSignature(module)
+				modulePath := filepath.Join(extractDir, "kernel", module)
+				signedModulePath := filepath.Join(gpuInstallDirContainer, "drivers", module)
+				if err := modules.AppendSignature(signedModulePath, modulePath, signaturePath); err != nil {
+					return errors.Wrapf(err, "failed to sign kernel module %s", module)
+				}
+			}
+		}
+		// Copy public key. The result must be bound to err here; a bare call would test a stale err.
+		if err := utils.CopyFile(signing.GetPublicKeyDer(), filepath.Join(gpuInstallDirContainer, "pubkey.der")); err != nil {
+			return errors.Wrapf(err, "failed to copy file %s", signing.GetPublicKeyDer())
+		}
+		// Finally, load signed GPU drivers.
+		if err := loadGPUDrivers(needSigned); err != nil {
+			return errors.Wrap(err, "failed to load GPU drivers")
+		}
+
+		// Run installer again to only install user space libraries.
+		cmd = exec.Command(cmd.Path, cmd.Args[1:]...)
+		cmd.Args = append(cmd.Args, "--no-kernel-module")
+		if err := utils.RunCommandAndLogOutput(cmd, true); err != nil {
+			return errors.Wrap(err, "failed to run GPU driver installer")
+		}
+	} else {
+		if err := utils.RunCommandAndLogOutput(cmd, false); err != nil {
+			return errors.Wrap(err, "failed to run GPU driver installer")
+		}
+	}
+	return nil
+}
+
+// GetDefaultGPUDriverVersion fetches the default GPU driver version through downloader.
+func GetDefaultGPUDriverVersion(downloader cos.ExtensionsDownloader) (string, error) {
+	log.Info("Getting the default GPU driver version")
+	raw, err := downloader.GetExtensionArtifact(cos.GpuExtension, defaultGPUDriverFile)
+	if err != nil {
+		return "", errors.Wrap(err, "failed to get default GPU driver version")
+	}
+	return strings.Trim(string(raw), "\n "), nil
+}
+
+// updateContainerLdCache writes the driver's lib64 dir to /etc/ld.so.conf.d/nvidia.conf
+// and runs ldconfig. It returns an error if the file cannot be written or ldconfig fails.
+func updateContainerLdCache() error {
+	log.Info("Updating container's ld cache")
+
+	// ioutil.WriteFile reports write and close failures, which must not be ignored:
+	// ldconfig would otherwise run against a missing or truncated config.
+	conf := []byte(gpuInstallDirContainer + "/lib64")
+	if err := ioutil.WriteFile("/etc/ld.so.conf.d/nvidia.conf", conf, 0644); err != nil {
+		return errors.Wrap(err, "failed to update ld cache")
+	}
+
+	if err := exec.Command("ldconfig").Run(); err != nil {
+		return errors.Wrap(err, "failed to update ld cache")
+	}
+	return nil
+}
+
+// getDriverInstallerDownloadURL builds the installer URL for the location derived from the GCE metadata zone.
+func getDriverInstallerDownloadURL(driverVersion, cosMilestone, cosBuildNumber string) (string, error) {
+	zone, err := utils.GetGCEMetadata("zone")
+	if err != nil {
+		return "", errors.Wrap(err, "failed to get GCE metadata zone")
+	}
+	location := getInstallerDownloadLocation(zone)
+	return getPrecompiledInstallerURL(driverVersion, cosMilestone, cosBuildNumber, location), nil
+}
+
+// getInstallerDownloadLocation maps a GCE zone path ("projects/<id>/zones/europe-west1-b")
+// to a download location: "us", "asia" or "eu", defaulting to "us" for other regions.
+func getInstallerDownloadLocation(metadataZone string) string {
+	zone := metadataZone[strings.LastIndex(metadataZone, "/")+1:]
+	// The text before the first "-" identifies the region group.
+	switch strings.SplitN(zone, "-", 2)[0] {
+	case "asia":
+		return "asia"
+	case "europe":
+		return "eu"
+	default:
+		return "us"
+	}
+}
+
+// getPrecompiledInstallerURL fills precompiledInstallerURLFormat for the given versions and location.
+func getPrecompiledInstallerURL(driverVersion, cosMilestone, cosBuildNumber, downloadLocation string) string {
+	// The major version is the text before the first dot: 418.67 -> 418.
+	major := strings.SplitN(driverVersion, ".", 2)[0]
+	// The build number is dash-separated in the URL: 12371.284.0 -> 12371-284-0.
+	build := strings.Replace(cosBuildNumber, ".", "-", -1)
+	return fmt.Sprintf(precompiledInstallerURLFormat,
+		downloadLocation, cosMilestone, major, driverVersion, driverVersion, cosMilestone, build)
+}
+
+// createHostDirBindMount creates both dirs as needed and bind-mounts hostDir at bindMountPath.
+func createHostDirBindMount(hostDir, bindMountPath string) error {
+	for _, dir := range []string{hostDir, bindMountPath} {
+		if err := os.MkdirAll(dir, defaultFilePermission); err != nil {
+			return errors.Wrapf(err, "failed to create dir %s", dir)
+		}
+	}
+	if err := syscall.Mount(hostDir, bindMountPath, "", syscall.MS_BIND, ""); err != nil {
+		return errors.Wrapf(err, "failed to create bind mount %s", bindMountPath)
+	}
+	// Remount to clear noexec flag.
+	const remountFlags = syscall.MS_REMOUNT | syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_RELATIME
+	if err := syscall.Mount("", bindMountPath, "", remountFlags, ""); err != nil {
+		return errors.Wrapf(err, "failed to remount %s", bindMountPath)
+	}
+	return nil
+}
+
+// createOverlayFS creates lowerDir, upperDir and workDir as needed, then mounts an overlay
+// filesystem on top of lowerDir that uses upperDir as its upper layer and workDir as its
+// work directory.
+func createOverlayFS(lowerDir, upperDir, workDir string) error {
+	for _, dir := range []string{lowerDir, upperDir, workDir} {
+		if err := os.MkdirAll(dir, defaultFilePermission); err != nil {
+			return errors.Wrapf(err, "failed to create dir %s", dir)
+		}
+	}
+
+	// The mount target is lowerDir itself, so the overlay is mounted over that directory.
+	options := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerDir, upperDir, workDir)
+	if err := syscall.Mount("none", lowerDir, "overlay", 0, options); err != nil {
+		return errors.Wrapf(err, "failed to create overlayfs (lowerdir=%s, upperdir=%s)", lowerDir, upperDir)
+	}
+	return nil
+}
+
+// loadGPUDrivers loads the GPU kernel modules, loading the public key first if needSigned.
+func loadGPUDrivers(needSigned bool) error {
+	if needSigned {
+		if err := modules.LoadPublicKey("gpu-key", filepath.Join(gpuInstallDirContainer, "pubkey.der")); err != nil {
+			return errors.Wrap(err, "failed to load public key")
+		}
+	}
+	// Modules must load in dependency order; a slice guarantees it, a map's range order is random.
+	gpuModules := []struct{ name, file string }{
+		{"nvidia", "nvidia.ko"}, {"nvidia_uvm", "nvidia-uvm.ko"}, {"nvidia_drm", "nvidia-drm.ko"},
+	}
+	for _, m := range gpuModules {
+		modulePath := filepath.Join(gpuInstallDirContainer, "drivers", m.file)
+		if err := modules.LoadModule(m.name, modulePath); err != nil {
+			return errors.Wrapf(err, "failed to load module %s", modulePath)
+		}
+	}
+	return nil
+}
diff --git a/src/cmd/cos_gpu_installer/internal/installer/installer_test.go b/src/cmd/cos_gpu_installer/internal/installer/installer_test.go
new file mode 100644
index 0000000..d283761
--- /dev/null
+++ b/src/cmd/cos_gpu_installer/internal/installer/installer_test.go
@@ -0,0 +1,47 @@
+package installer
+
+import (
+	"testing"
+)
+
+// TestGetInstallerDownloadLocation checks the zone-to-location mapping and its "us" fallback.
+func TestGetInstallerDownloadLocation(t *testing.T) {
+	for _, tc := range []struct {
+		testName         string
+		metadataZone     string
+		expectedLocation string
+	}{
+		{
+			testName:         "us-west1-b",
+			metadataZone:     "projects/123456789/zones/us-west1-b",
+			expectedLocation: "us",
+		},
+		{
+			testName:         "asia-east1-a",
+			metadataZone:     "projects/123456789/zones/asia-east1-a",
+			expectedLocation: "asia",
+		},
+		{
+			testName:         "europe-west1-b",
+			metadataZone:     "projects/123456789/zones/europe-west1-b",
+			expectedLocation: "eu",
+		},
+		{
+			testName:         "australia-southeast1-a",
+			metadataZone:     "projects/123456789/zones/australia-southeast1-a",
+			expectedLocation: "us",
+		},
+	} {
+		if got := getInstallerDownloadLocation(tc.metadataZone); got != tc.expectedLocation {
+			t.Errorf("%s: expect location: %s, got: %s", tc.testName, tc.expectedLocation, got)
+		}
+	}
+}
+
+// TestGetPrecompiledInstallerURL checks the URL built for one known driver/COS combination.
+func TestGetPrecompiledInstallerURL(t *testing.T) {
+	want := "https://storage.googleapis.com/nvidia-drivers-us-public/nvidia-cos-project/73/tesla/418_00/418.116.00/NVIDIA-Linux-x86_64-418.116.00_73-11647-415-0.cos"
+	if got := getPrecompiledInstallerURL("418.116.00", "73", "11647.415.0", "us"); got != want {
+		t.Errorf("Unexpected return, want: %s, got: %s", want, got)
+	}
+}