Make sure not to run the installer twice if the first run succeeds

The following sequence of events is invalid:
- installer runs and succeeds
- we sign the drivers
- installer runs again

This is because, during the second run, the installer will detect that
the drivers have been modified (because of the signatures), and will
consider the whole install invalid. It will then try to uninstall the
drivers. (aside: the error message this produces is a little confusing
because of how our code uses overlayfs, but the signed drivers get
uninstalled no matter what)

We solve this in the code by only calling the "install libs" function
when either we aren't doing legacy linking (the non-legacy path never
runs the installer), or we are doing legacy linking and the installer
failed.

Typically, the installer fails when doing legacy linking, because
module signature enforcement is enabled, so it never succeeds on the
first run. But when module signature enforcement is disabled (like in
GKE), the first run will succeed.

TEST=Run on cos-85 with module signature enforcement disabled. Run on
cos-85 with module signature enforcement enabled. Run on cos-93 with
module signature enforcement enabled. Run on cos-93 with module
signature enforcement disabled.

Change-Id: Ideeb71377404632d645d89bb417c60b73d41c3b5
Reviewed-on: https://cos-review.googlesource.com/c/cos/tools/+/23533
Tested-by: Robert Kolchmeyer <rkolchmeyer@google.com>
Cloud-Build: GCB Service account <228075978874@cloudbuild.gserviceaccount.com>
Reviewed-by: Arnav Kansal <rnv@google.com>
diff --git a/src/cmd/cos_gpu_installer/internal/installer/installer.go b/src/cmd/cos_gpu_installer/internal/installer/installer.go
index c4c12a3..fb05ddd 100644
--- a/src/cmd/cos_gpu_installer/internal/installer/installer.go
+++ b/src/cmd/cos_gpu_installer/internal/installer/installer.go
@@ -35,6 +35,8 @@
 	// ErrDriverLoad indicates that installed GPU drivers could not be loaded into
 	// the kernel.
 	ErrDriverLoad = stderrors.New("failed to load GPU drivers")
+
+	errInstallerFailed = stderrors.New("failed to run GPU driver installer")
 )
 
 // VerifyDriverInstallation runs some commands to verify the driver installation.
@@ -217,7 +219,7 @@
 	return nil
 }
 
-func linkDriversLegacy(toolchainDir, nvidiaDir string, needSigned bool) error {
+func linkDriversLegacy(toolchainDir, nvidiaDir string) error {
 	log.Info("Linking drivers using legacy method...")
 	// The legacy linking method needs to use "/usr/bin/ld" as the linker to
 	// maintain bit-for-bit compatibility with driver signatures. The legacy
@@ -250,10 +252,11 @@
 		"--accept-license",
 	)
 	log.Infof("Installer arguments:\n%v", cmd.Args)
-	if err := utils.RunCommandAndLogOutput(cmd, needSigned); err != nil {
-		return fmt.Errorf("failed to run GPU driver installer: %v", err)
-	}
+	err := utils.RunCommandAndLogOutput(cmd, false)
 	log.Info("Done linking drivers")
+	if err != nil {
+		return fmt.Errorf("%w: %v", errInstallerFailed, err)
+	}
 	return nil
 }
 
@@ -300,9 +303,20 @@
 	}
 
 	// Link drivers.
+	var legacyInstallerFailed bool
 	if legacyLink {
-		if err := linkDriversLegacy(toolchainDir, extractDir, needSigned); err != nil {
-			return fmt.Errorf("failed to link drivers: %v", err)
+		if err := linkDriversLegacy(toolchainDir, extractDir); err != nil {
+			if stderrors.Is(err, errInstallerFailed) {
+				// This case is expected when module signature enforcement is enabled.
+				// Since the installer terminated early, we need to re-run it after
+				// signing modules.
+				//
+				// If we don't sign modules (i.e. needSigned is false), then we'll see
+				// an error when we load the modules, and that will be fatal.
+				legacyInstallerFailed = true
+			} else {
+				return fmt.Errorf("failed to link drivers: %v", err)
+			}
 		}
 	} else {
 		if err := linkDrivers(toolchainDir, extractDir); err != nil {
@@ -335,7 +349,8 @@
 		// Copy drivers to the desired end directory. This is done as part of
 		// `modules.AppendSignature` in the above signing block, but we need to do
 		// it for unsigned modules as well. Legacy linking already does this copy
-		// in the unsigned case; we skip this block in the legacy link case to avoid
+		// in the unsigned case (we expect that legacy linking also does this when
+		// the installer fails); we skip this block in the legacy link case to avoid
 		// redundancy.
 		for _, kernelFile := range kernelFiles {
 			if strings.HasSuffix(kernelFile.Name(), ".ko") {
@@ -350,17 +365,18 @@
 	}
 
 	// Load GPU drivers.
-	// The legacy linking method already does this in the unsigned case.
-	if needSigned || !legacyLink {
+	// The legacy linking method does this when the installer doesn't fail (i.e.
+	// module signature verification isn't enforced).
+	if (legacyLink && legacyInstallerFailed) || !legacyLink {
 		if err := loadGPUDrivers(needSigned); err != nil {
 			return fmt.Errorf("%w: %v", ErrDriverLoad, err)
 		}
 	}
 
 	// Install libs.
-	// The legacy linking method already installs these libs in the unsigned
-	// case. This step is redundant in that case.
-	if needSigned || !legacyLink {
+	// The legacy linking method does this when the installer doesn't fail (i.e.
+	// module signature verification isn't enforced).
+	if (legacyLink && legacyInstallerFailed) || !legacyLink {
 		if err := installUserLibs(extractDir); err != nil {
 			return fmt.Errorf("failed to install userspace libraries: %v", err)
 		}