Link nvidia drivers explicitly.

It turns out that the nvidia-installer does not link the drivers
correctly, because linking the drivers needs the kernel's module.lds
linker script. Nvidia recommended to us that we link the modules
explicitly.

Doing this also requires the kernel headers to be installed (they
include module.lds). To simplify, we install the kernel headers in the
same directory structure as the standalone toolchain. This means that no
additional changes are needed for correct dependency caching to work.
Since we are installing the kernel headers as part of the toolchain
installation, we remove the existing function for installing the kernel
headers, which was dead code anyway (nothing called that function).
Furthermore, the kernel headers have no value without the toolchain (the
only use case for the kernel headers is to compile code that targets
COS).

One complication here is that this change introduces an incompatibility
between the cos-gpu-installer and the driver signatures. Signatures
produced by the new linking procedure cannot be consumed by old versions
of cos-gpu-installer. To mitigate this, we keep the old linking
behavior as a fallback. If GKE, for example, happens to
deploy a new cos-gpu-installer on an old COS version, nothing bad will
happen. The fallback behavior also allows the timing to be less strict
when we update images vs when we update the signer.

We also enable installation on beta images, since we expect that to
work now.

Since github.com/pkg/errors is deprecated, new code uses the standard
library errors package instead.

BUG=b/200957688
TEST=Manually ran the following test scenarios:
- Tested with live cos-85 and cos-89 versions, using production drivers
- Tested with live cos-93, using unsigned drivers
- Signed live driver with new cos-gpu-installer, installed on live
  cos-85 with new cos-gpu-installer
- Signed live driver with new cos-gpu-installer, installed on live
  cos-89 with new cos-gpu-installer
- Signed pre-release driver with new cos-gpu-installer, installed on
  live cos-93 with new cos-gpu-installer

Change-Id: I1451694fd3d41c9c04f08023291e1cdb6413fd8c
Reviewed-on: https://cos-review.googlesource.com/c/cos/tools/+/23430
Cloud-Build: GCB Service account <228075978874@cloudbuild.gserviceaccount.com>
Reviewed-by: Ke Wu <mikewu@google.com>
Tested-by: Robert Kolchmeyer <rkolchmeyer@google.com>
diff --git a/src/cmd/cos_gpu_installer/internal/commands/install.go b/src/cmd/cos_gpu_installer/internal/commands/install.go
index e6c56c6..bd2e9e8 100644
--- a/src/cmd/cos_gpu_installer/internal/commands/install.go
+++ b/src/cmd/cos_gpu_installer/internal/commands/install.go
@@ -8,6 +8,7 @@
 	"os"
 	"path/filepath"
 	"strings"
+	"syscall"
 
 	"flag"
 
@@ -26,7 +27,6 @@
 	grepFound       = 0
 	hostRootPath    = "/root"
 	kernelSrcDir    = "/build/usr/src/linux"
-	kernelHeaderDir = "/build/usr/src/linux-headers"
 	toolchainPkgDir = "/build/cos-tools"
 )
 
@@ -88,8 +88,8 @@
 
 	log.V(2).Infof("Running on COS build id %s", envReader.BuildNumber())
 
-	if releaseTrack := envReader.ReleaseTrack(); releaseTrack == "dev-channel" || releaseTrack == "beta-channel" {
-		c.logError(fmt.Errorf("GPU installation is not supported on dev & beta image for now; Please use LTS image."))
+	if releaseTrack := envReader.ReleaseTrack(); releaseTrack == "dev-channel" {
+		c.logError(fmt.Errorf("GPU installation is not supported on dev images for now; Please use LTS image."))
 		return subcommands.ExitFailure
 	}
 
@@ -164,6 +164,19 @@
 	return argVersion, nil
 }
 
+func remountExecutable(dir string) error {
+	if err := os.MkdirAll(dir, 0755); err != nil {
+		return fmt.Errorf("failed to create dir %q: %v", dir, err)
+	}
+	if err := syscall.Mount(dir, dir, "", syscall.MS_BIND, ""); err != nil {
+		return fmt.Errorf("failed to create bind mount at %q: %v", dir, err)
+	}
+	if err := syscall.Mount("", dir, "", syscall.MS_REMOUNT|syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_RELATIME, ""); err != nil {
+		return fmt.Errorf("failed to remount %q: %v", dir, err)
+	}
+	return nil
+}
+
 func installDriver(c *InstallCommand, cacher *installer.Cacher, envReader *cos.EnvReader, downloader *cos.GCSDownloader) error {
 	callback, err := installer.ConfigureDriverInstallationDirs(filepath.Join(hostRootPath, c.hostInstallDir), envReader.KernelRelease())
 	if err != nil {
@@ -189,12 +202,23 @@
 	if err := cos.SetCompilationEnv(downloader); err != nil {
 		return errors.Wrap(err, "failed to set compilation environment variables")
 	}
+	if err := remountExecutable(toolchainPkgDir); err != nil {
+		return fmt.Errorf("failed to remount %q as executable: %v", filepath.Dir(toolchainPkgDir), err)
+	}
 	if err := cos.InstallCrossToolchain(downloader, toolchainPkgDir); err != nil {
 		return errors.Wrap(err, "failed to install toolchain")
 	}
 
-	if err := installer.RunDriverInstaller(installerFile, !c.unsignedDriver); err != nil {
-		return errors.Wrap(err, "failed to run GPU driver installer")
+	if err := installer.RunDriverInstaller(toolchainPkgDir, installerFile, !c.unsignedDriver, false); err != nil {
+		if errors.Is(err, installer.ErrDriverLoad) {
+			// Drivers were linked, but couldn't load; try again with legacy linking
+			log.Info("Retrying driver installation with legacy linking")
+			if err := installer.RunDriverInstaller(toolchainPkgDir, installerFile, !c.unsignedDriver, true); err != nil {
+				return fmt.Errorf("failed to run GPU driver installer: %v", err)
+			}
+		} else {
+			return errors.Wrap(err, "failed to run GPU driver installer")
+		}
 	}
 	if err := cacher.Cache(); err != nil {
 		return errors.Wrap(err, "failed to cache installation")
diff --git a/src/cmd/cos_gpu_installer/internal/installer/installer.go b/src/cmd/cos_gpu_installer/internal/installer/installer.go
index c567c26..02a50b3 100644
--- a/src/cmd/cos_gpu_installer/internal/installer/installer.go
+++ b/src/cmd/cos_gpu_installer/internal/installer/installer.go
@@ -2,12 +2,14 @@
 package installer
 
 import (
+	stderrors "errors"
 	"fmt"
 	"io/ioutil"
 	"os"
 	"os/exec"
 	"path"
 	"path/filepath"
+	"sort"
 	"strings"
 	"syscall"
 
@@ -18,6 +20,7 @@
 
 	log "github.com/golang/glog"
 	"github.com/pkg/errors"
+	"golang.org/x/sys/unix"
 )
 
 const (
@@ -28,6 +31,12 @@
 	defaultFilePermission         = 0755
 )
 
+var (
+	// ErrDriverLoad indicates that installed GPU drivers could not be loaded into
+	// the kernel.
+	ErrDriverLoad = stderrors.New("failed to load GPU drivers")
+)
+
 // VerifyDriverInstallation runs some commands to verify the driver installation.
 func VerifyDriverInstallation() error {
 	log.Info("Verifying GPU driver installation")
@@ -116,19 +125,94 @@
 	return ch, nil
 }
 
-// RunDriverInstaller runs GPU driver installer.
-func RunDriverInstaller(installerFilename string, needSigned bool) error {
-	log.Info("Running GPU driver installer")
-
-	// Extract files to a fixed path first to make sure md5sum of generated gpu drivers are consistent.
-	extractDir := "/tmp/extract"
-	cmd := exec.Command("sh", installerFilename, "-x", "--target", extractDir)
-	cmd.Dir = gpuInstallDirContainer
-	if err := cmd.Run(); err != nil {
-		return errors.Wrap(err, "failed to extract installer files")
+func extractPrecompiled(nvidiaDir string) error {
+	log.Info("Extracting precompiled artifacts...")
+	precompiledDir := filepath.Join(nvidiaDir, "kernel", "precompiled")
+	files, err := os.ReadDir(precompiledDir)
+	if err != nil {
+		return fmt.Errorf("failed to read %q: %v", precompiledDir, err)
 	}
+	var precompiledArchive string
+	if len(files) == 0 {
+		return stderrors.New("failed to find precompiled artifacts in this nvidia installer")
+	}
+	if len(files) == 1 {
+		precompiledArchive = filepath.Join(precompiledDir, files[0].Name())
+	}
+	if len(files) > 1 {
+		var fileNames []string
+		for _, f := range files {
+			fileNames = append(fileNames, f.Name())
+		}
+		sort.Strings(fileNames)
+		log.Warningf("Found multiple precompiled archives in this nvidia installer: %q", strings.Join(fileNames, ","))
+		log.Warningf("Using precompiled archive named %q", fileNames[len(fileNames)-1])
+		precompiledArchive = filepath.Join(precompiledDir, fileNames[len(fileNames)-1])
+	}
+	cmd := exec.Command(filepath.Join(nvidiaDir, "mkprecompiled"), "--unpack", precompiledArchive, "-o", precompiledDir)
+	if err := utils.RunCommandAndLogOutput(cmd, false); err != nil {
+		return fmt.Errorf("failed to unpack precompiled artifacts: %v", err)
+	}
+	log.Info("Done extracting precompiled artifacts")
+	return nil
+}
 
-	cmd = exec.Command(filepath.Join(extractDir, "nvidia-installer"),
+func linkDrivers(toolchainDir, nvidiaDir string) error {
+	log.Info("Linking drivers...")
+	var kernelInfo unix.Utsname
+	if err := unix.Uname(&kernelInfo); err != nil {
+		return fmt.Errorf("failed to find kernel release info using uname: %v", err)
+	}
+	kernelRelease := strings.Trim(string(kernelInfo.Release[:]), "\x00")
+	// COS 85+ kernels use lld as their linker
+	linker := filepath.Join(toolchainDir, "bin", "ld.lld")
+	linkerScript := filepath.Join(toolchainDir, "usr", "src", "linux-headers-"+kernelRelease, "scripts", "module.lds")
+	if _, err := os.Stat(linkerScript); os.IsNotExist(err) {
+		// Fallback to module-common.lds, which is used in the 5.4 kernel
+		linkerScript = filepath.Join(toolchainDir, "usr", "src", "linux-headers-"+kernelRelease, "scripts", "module-common.lds")
+	}
+	nvidiaKernelDir := filepath.Join(nvidiaDir, "kernel")
+	// Link nvidia.ko
+	nvidiaObjs := []string{
+		filepath.Join(nvidiaKernelDir, "precompiled", "nv-linux.o"),
+		filepath.Join(nvidiaKernelDir, "nvidia", "nv-kernel.o_binary"),
+	}
+	args := append([]string{"-T", linkerScript, "-r", "-o", filepath.Join(nvidiaKernelDir, "nvidia.ko")}, nvidiaObjs...)
+	cmd := exec.Command(linker, args...)
+	log.Infof("Running link command: %v", cmd.Args)
+	if err := utils.RunCommandAndLogOutput(cmd, false); err != nil {
+		return fmt.Errorf("failed to link nvidia.ko: %v", err)
+	}
+	// Link nvidia-modeset.ko
+	modesetObjs := []string{
+		filepath.Join(nvidiaKernelDir, "precompiled", "nv-modeset-linux.o"),
+		filepath.Join(nvidiaKernelDir, "nvidia-modeset", "nv-modeset-kernel.o_binary"),
+	}
+	args = append([]string{"-T", linkerScript, "-r", "-o", filepath.Join(nvidiaKernelDir, "nvidia-modeset.ko")}, modesetObjs...)
+	cmd = exec.Command(linker, args...)
+	log.Infof("Running link command: %v", cmd.Args)
+	if err := utils.RunCommandAndLogOutput(cmd, false); err != nil {
+		return fmt.Errorf("failed to link nvidia-modeset.ko: %v", err)
+	}
+	// nvidia-uvm.ko is pre-linked; move to kernel dir
+	oldPath := filepath.Join(nvidiaKernelDir, "precompiled", "nvidia-uvm.ko")
+	newPath := filepath.Join(nvidiaKernelDir, "nvidia-uvm.ko")
+	if err := unix.Rename(oldPath, newPath); err != nil {
+		return fmt.Errorf("failed to move %q to %q: %v", oldPath, newPath, err)
+	}
+	// nvidia-drm.ko is pre-linked; move to kernel dir
+	oldPath = filepath.Join(nvidiaKernelDir, "precompiled", "nvidia-drm.ko")
+	newPath = filepath.Join(nvidiaKernelDir, "nvidia-drm.ko")
+	if err := unix.Rename(oldPath, newPath); err != nil {
+		return fmt.Errorf("failed to move %q to %q: %v", oldPath, newPath, err)
+	}
+	log.Info("Done linking drivers")
+	return nil
+}
+
+func linkDriversLegacy(nvidiaDir string, needSigned bool) error {
+	log.Info("Linking drivers using legacy method...")
+	cmd := exec.Command(filepath.Join(nvidiaDir, "nvidia-installer"),
 		"--utility-prefix="+gpuInstallDirContainer,
 		"--opengl-prefix="+gpuInstallDirContainer,
 		"--x-prefix="+gpuInstallDirContainer,
@@ -138,18 +222,73 @@
 		"--silent",
 		"--accept-license",
 	)
-
 	log.Infof("Installer arguments:\n%v", cmd.Args)
+	if err := utils.RunCommandAndLogOutput(cmd, needSigned); err != nil {
+		return fmt.Errorf("failed to run GPU driver installer: %v", err)
+	}
+	log.Info("Done linking drivers")
+	return nil
+}
 
-	if needSigned {
-		// Run installer to compile drivers. Expect the command to fail as the drivers are not signed yet.
-		utils.RunCommandAndLogOutput(cmd, true)
+func installUserLibs(nvidiaDir string) error {
+	log.Info("Installing userspace libraries...")
+	cmd := exec.Command(filepath.Join(nvidiaDir, "nvidia-installer"),
+		"--utility-prefix="+gpuInstallDirContainer,
+		"--opengl-prefix="+gpuInstallDirContainer,
+		"--x-prefix="+gpuInstallDirContainer,
+		"--install-libglvnd",
+		"--no-install-compat32-libs",
+		"--log-file-name="+filepath.Join(gpuInstallDirContainer, "nvidia-installer.log"),
+		"--silent",
+		"--accept-license",
+		"--no-kernel-module",
+	)
+	log.Infof("Installer arguments:\n%v", cmd.Args)
+	if err := utils.RunCommandAndLogOutput(cmd, false); err != nil {
+		return fmt.Errorf("failed to run GPU driver installer: %v", err)
+	}
+	log.Info("Done installing userspace libraries")
+	return nil
+}
 
-		// sign GPU drivers.
-		kernelFiles, err := ioutil.ReadDir(filepath.Join(extractDir, "kernel"))
-		if err != nil {
-			return errors.Wrapf(err, "failed to list files in directory %s", filepath.Join(extractDir, "kernel"))
+// RunDriverInstaller runs GPU driver installer. Only works if the provided
+// installer includes precompiled drivers.
+func RunDriverInstaller(toolchainDir, installerFilename string, needSigned, legacyLink bool) error {
+	log.Info("Running GPU driver installer")
+
+	// Extract files to a fixed path first to make sure md5sum of generated gpu drivers are consistent.
+	extractDir := "/tmp/extract"
+	if err := os.RemoveAll(extractDir); err != nil {
+		return fmt.Errorf("failed to clean %q: %v", extractDir, err)
+	}
+	cmd := exec.Command("sh", installerFilename, "-x", "--target", extractDir)
+	cmd.Dir = gpuInstallDirContainer
+	if err := cmd.Run(); err != nil {
+		return errors.Wrap(err, "failed to extract installer files")
+	}
+
+	// Extract precompiled artifacts.
+	if err := extractPrecompiled(extractDir); err != nil {
+		return fmt.Errorf("failed to extract precompiled artifacts: %v", err)
+	}
+
+	// Link drivers.
+	if legacyLink {
+		if err := linkDriversLegacy(extractDir, needSigned); err != nil {
+			return fmt.Errorf("failed to link drivers: %v", err)
 		}
+	} else {
+		if err := linkDrivers(toolchainDir, extractDir); err != nil {
+			return fmt.Errorf("failed to link drivers: %v", err)
+		}
+	}
+
+	kernelFiles, err := ioutil.ReadDir(filepath.Join(extractDir, "kernel"))
+	if err != nil {
+		return errors.Wrapf(err, "failed to list files in directory %s", filepath.Join(extractDir, "kernel"))
+	}
+	if needSigned {
+		// sign GPU drivers.
 		for _, kernelFile := range kernelFiles {
 			if strings.HasSuffix(kernelFile.Name(), ".ko") {
 				module := kernelFile.Name()
@@ -162,23 +301,41 @@
 			}
 		}
 		// Copy public key.
-		if utils.CopyFile(signing.GetPublicKeyDer(), filepath.Join(gpuInstallDirContainer, "pubkey.der")); err != nil {
+		if err := utils.CopyFile(signing.GetPublicKeyDer(), filepath.Join(gpuInstallDirContainer, "pubkey.der")); err != nil {
 			return errors.Wrapf(err, "failed to copy file %s", signing.GetPublicKeyDer())
 		}
-		// Finally, load signed GPU drivers.
-		if err := loadGPUDrivers(needSigned); err != nil {
-			return errors.Wrap(err, "failed to load GPU drivers")
+	} else if !legacyLink {
+		// Copy drivers to the desired end directory. This is done as part of
+		// `modules.AppendSignature` in the above signing block, but we need to do
+		// it for unsigned modules as well. Legacy linking already does this copy
+		// in the unsigned case; we skip this block in the legacy link case to avoid
+		// redundancy.
+		for _, kernelFile := range kernelFiles {
+			if strings.HasSuffix(kernelFile.Name(), ".ko") {
+				module := kernelFile.Name()
+				src := filepath.Join(extractDir, "kernel", module)
+				dst := filepath.Join(gpuInstallDirContainer, "drivers", module)
+				if err := utils.CopyFile(src, dst); err != nil {
+					return fmt.Errorf("failed to copy kernel module %q: %v", module, err)
+				}
+			}
 		}
+	}
 
-		// Run installer again to only install user space libraries.
-		cmd = exec.Command(cmd.Path, cmd.Args[1:]...)
-		cmd.Args = append(cmd.Args, "--no-kernel-module")
-		if err := utils.RunCommandAndLogOutput(cmd, true); err != nil {
-			return errors.Wrap(err, "failed to run GPU driver installer")
+	// Load GPU drivers.
+	// The legacy linking method already does this in the unsigned case.
+	if needSigned || !legacyLink {
+		if err := loadGPUDrivers(needSigned); err != nil {
+			return fmt.Errorf("%w: %v", ErrDriverLoad, err)
 		}
-	} else {
-		if err := utils.RunCommandAndLogOutput(cmd, false); err != nil {
-			return errors.Wrap(err, "failed to run GPU driver installer")
+	}
+
+	// Install libs.
+	// The legacy linking method already installs these libs in the unsigned
+	// case. This step is redundant in that case.
+	if needSigned || !legacyLink {
+		if err := installUserLibs(extractDir); err != nil {
+			return fmt.Errorf("failed to install userspace libraries: %v", err)
 		}
 	}
 
diff --git a/src/pkg/cos/cos.go b/src/pkg/cos/cos.go
index a81786b..e751e05 100644
--- a/src/pkg/cos/cos.go
+++ b/src/pkg/cos/cos.go
@@ -66,7 +66,7 @@
 	return nil
 }
 
-// InstallCrossToolchain installs COS toolchain to destination directory.
+// InstallCrossToolchain installs COS toolchain and kernel headers to destination directory.
 func InstallCrossToolchain(downloader ArtifactsDownloader, destDir string) error {
 	log.Info("Installing the toolchain")
 
@@ -79,10 +79,20 @@
 		if err := downloader.DownloadToolchain(destDir); err != nil {
 			return errors.Wrap(err, "failed to download toolchain")
 		}
+		if err := downloader.DownloadKernelHeaders(destDir); err != nil {
+			return fmt.Errorf("failed to download kernel headers: %v", err)
+		}
 
+		log.Info("Unpacking toolchain...")
 		if err := exec.Command("tar", "xf", filepath.Join(destDir, toolchainArchive), "-C", destDir).Run(); err != nil {
 			return errors.Wrap(err, "failed to extract toolchain archive tarball")
 		}
+		log.Info("Done unpacking toolchain")
+		log.Info("Unpacking kernel headers...")
+		if err := exec.Command("tar", "xf", filepath.Join(destDir, kernelHeaders), "-C", destDir).Run(); err != nil {
+			return fmt.Errorf("failed to extract kernel headers: %v", err)
+		}
+		log.Info("Done unpacking kernel headers")
 	}
 
 	log.V(2).Info("Configuring environment variables for cross-compilation")
@@ -110,28 +120,6 @@
 	return nil
 }
 
-// InstallKernelHeaderPkg installs kernel header package to destination directory.
-func InstallKernelHeaderPkg(downloader ArtifactsDownloader, destDir string) error {
-	log.Info("Installing the kernel header package")
-
-	if err := os.MkdirAll(destDir, 0755); err != nil {
-		return errors.Wrapf(err, "failed to create dir %s", destDir)
-	}
-	if empty, _ := utils.IsDirEmpty(destDir); !empty {
-		return nil
-	}
-
-	log.Info("Kernel headers not found locally, downloading")
-	if err := downloader.DownloadKernelHeaders(destDir); err != nil {
-		return errors.Wrap(err, "failed to download kernel headers")
-	}
-	if err := exec.Command("tar", "xf", filepath.Join(destDir, kernelHeaders), "-C", destDir).Run(); err != nil {
-		return errors.Wrap(err, "failed to extract kernel header tarball")
-	}
-
-	return nil
-}
-
 // ConfigureModuleSymvers copys Module.symvers file from kernel header dir to kernel source dir.
 func ConfigureModuleSymvers(kernelHeaderDir, kernelSrcDir string) error {
 	log.Info("Configuring Module.symvers file")
diff --git a/src/pkg/cos/cos_test.go b/src/pkg/cos/cos_test.go
index 101ba9a..213c1d2 100644
--- a/src/pkg/cos/cos_test.go
+++ b/src/pkg/cos/cos_test.go
@@ -115,23 +115,6 @@
 	}
 }
 
-func TestInstallKernelHeaderPkg(t *testing.T) {
-	tmpDir, err := ioutil.TempDir("", "testing")
-	if err != nil {
-		t.Fatalf("Failed to create temp dir: %v", err)
-	}
-	defer os.RemoveAll(tmpDir)
-
-	downloder := fakeDownloader{}
-	if err := InstallKernelHeaderPkg(&downloder, tmpDir); err != nil {
-		t.Fatalf("Failed to run InstallKernelHeaderPkg: %v", err)
-	}
-
-	if _, err := os.Stat(filepath.Join(tmpDir, "kernel-header")); err != nil {
-		t.Errorf("Failed to get kernel headers file: %v", err)
-	}
-}
-
 func TestSetCompilationEnv(t *testing.T) {
 	origEnvs := os.Environ()
 	defer func() {