cos-gpu-installer: add flag --no-verify

To allow users to preload the GPU driver without an attached GPU,
we need to add a flag --no-verify to skip kernel module loading
and installation verification.

BUG=b/283274391
TEST=Run cos-extensions install gpu -- --no-verify on a VM
with no GPU
RELEASE_NOTE=None

Change-Id: Ic39e366ec509b32da3a14a56d1a017b634f5249b
Reviewed-on: https://cos-review.googlesource.com/c/cos/tools/+/49551
Cloud-Build: GCB Service account <228075978874@cloudbuild.gserviceaccount.com>
Tested-by: He Gao <hegao@google.com>
Reviewed-by: Arnav Kansal <rnv@google.com>
diff --git a/src/cmd/cos_gpu_installer/internal/commands/install.go b/src/cmd/cos_gpu_installer/internal/commands/install.go
index a3a059d..6f19b66 100644
--- a/src/cmd/cos_gpu_installer/internal/commands/install.go
+++ b/src/cmd/cos_gpu_installer/internal/commands/install.go
@@ -115,6 +115,7 @@
 	test               bool
 	prepareBuildTools  bool
 	kernelOpen         bool
+	noVerify           bool
 }
 
 // Name implements subcommands.Command.Name.
@@ -160,6 +161,7 @@
 		"Enable test mode. "+
 			"In test mode, `-nvidia-installer-url` can be used without `-allow-unsigned-driver`.")
 	f.BoolVar(&c.prepareBuildTools, "prepare-build-tools", false, "Whether to populate the build tools cache, i.e. to download and install the toolchain and the kernel headers. Drivers are NOT installed when this flag is set and running with this flag does not require GPU attached to the instance.")
+	f.BoolVar(&c.noVerify, "no-verify", false, "Skip kernel module loading and installation verification. Useful for preloading drivers without attached GPU.")
 
 }
 
@@ -206,15 +208,12 @@
 	var gpuType GPUType = NO_GPU
 
 	if !c.prepareBuildTools {
-		var isGpuConfigured bool
-		if isGpuConfigured, gpuType, err = c.getGPUTypeInfo(); err != nil {
-			c.logError(errors.Wrapf(err, "failed to get GPU type information"))
-			return subcommands.ExitFailure
-		}
-
-		if !isGpuConfigured {
-			c.logError(fmt.Errorf("Please have GPU device configured"))
-			return subcommands.ExitFailure
+		if gpuType, err = c.getGPUTypeInfo(); err != nil {
+			if !c.noVerify {
+				c.logError(errors.Wrapf(err, "failed to get GPU type information"))
+				return subcommands.ExitFailure
+			}
+			log.Infof("No GPU device configured, continue driver preoloading without verification.")
 		}
 	}
 
@@ -267,11 +266,11 @@
 		cacher = installer.NewCacher(hostInstallDir, envReader.BuildNumber(), c.driverVersion)
 		if isCached, isOpen, err := cacher.IsCached(); isCached && err == nil {
 			log.V(2).Info("Found cached version, NOT building the drivers.")
-			if err := installer.ConfigureCachedInstalltion(hostInstallDir, !c.unsignedDriver, c.test, isOpen); err != nil {
+			if err := installer.ConfigureCachedInstalltion(hostInstallDir, !c.unsignedDriver, c.test, isOpen, c.noVerify); err != nil {
 				c.logError(errors.Wrap(err, "failed to configure cached installation"))
 				return subcommands.ExitFailure
 			}
-			if err := installer.VerifyDriverInstallation(); err != nil {
+			if err := installer.VerifyDriverInstallation(c.noVerify); err != nil {
 				c.logError(errors.Wrap(err, "failed to verify GPU driver installation"))
 				return subcommands.ExitFailure
 			}
@@ -387,11 +386,11 @@
 		}
 	}
 
-	if err := installer.RunDriverInstaller(toolchainPkgDir, installerFile, c.driverVersion, !c.unsignedDriver, c.test, false); err != nil {
+	if err := installer.RunDriverInstaller(toolchainPkgDir, installerFile, c.driverVersion, !c.unsignedDriver, c.test, false, c.noVerify); err != nil {
 		if errors.Is(err, installer.ErrDriverLoad) {
 			// Drivers were linked, but couldn't load; try again with legacy linking
 			log.Infof("Failed to load kernel module, err: %v. Retrying driver installation with legacy linking", err)
-			if err := installer.RunDriverInstaller(toolchainPkgDir, installerFile, c.driverVersion, !c.unsignedDriver, c.test, true); err != nil {
+			if err := installer.RunDriverInstaller(toolchainPkgDir, installerFile, c.driverVersion, !c.unsignedDriver, c.test, true, c.noVerify); err != nil {
 				return fmt.Errorf("failed to run GPU driver installer: %v", err)
 			}
 		} else {
@@ -403,7 +402,7 @@
 			return errors.Wrap(err, "failed to cache installation")
 		}
 	}
-	if err := installer.VerifyDriverInstallation(); err != nil {
+	if err := installer.VerifyDriverInstallation(c.noVerify); err != nil {
 		return errors.Wrap(err, "failed to verify installation")
 	}
 	if err := modules.UpdateHostLdCache(hostRootPath, filepath.Join(c.hostInstallDir, "lib64")); err != nil {
@@ -426,7 +425,7 @@
 		return err
 	}
 
-	if err := installer.RunDriverInstallerPrebuiltModules(downloader, installerFile, c.driverVersion); err != nil {
+	if err := installer.RunDriverInstallerPrebuiltModules(downloader, installerFile, c.driverVersion, c.noVerify); err != nil {
 		return err
 	}
 
@@ -435,7 +434,7 @@
 			return errors.Wrap(err, "failed to cache installation")
 		}
 	}
-	if err := installer.VerifyDriverInstallation(); err != nil {
+	if err := installer.VerifyDriverInstallation(c.noVerify); err != nil {
 		return errors.Wrap(err, "failed to verify installation")
 	}
 	if err := modules.UpdateHostLdCache(hostRootPath, filepath.Join(c.hostInstallDir, "lib64")); err != nil {
@@ -453,28 +452,28 @@
 	}
 }
 
-func (c *InstallCommand) getGPUTypeInfo() (bool, GPUType, error) {
+func (c *InstallCommand) getGPUTypeInfo() (GPUType, error) {
 	cmd := "lspci | grep -i \"nvidia\""
 	outBytes, err := exec.Command("/bin/bash", "-c", cmd).Output()
 	if err != nil {
-		return false, NO_GPU, err
+		return NO_GPU, err
 	}
 	out := string(outBytes)
 	switch {
 	case strings.Contains(out, "[Tesla K80]"):
-		return true, K80, nil
+		return K80, nil
 	case strings.Contains(out, "NVIDIA Corporation Device 15f8"), strings.Contains(out, "NVIDIA Corporation GP100GL"), strings.Contains(out, "[Tesla P100"):
-		return true, P100, nil
+		return P100, nil
 	case strings.Contains(out, "NVIDIA Corporation Device 1db1"), strings.Contains(out, "NVIDIA Corporation GV100GL"), strings.Contains(out, "[Tesla V100"):
-		return true, V100, nil
+		return V100, nil
 	case strings.Contains(out, "NVIDIA Corporation Device 1bb3"), strings.Contains(out, "NVIDIA Corporation GP104GL"), strings.Contains(out, "[Tesla P4"):
-		return true, P4, nil
+		return P4, nil
 	case strings.Contains(out, "NVIDIA Corporation Device 27b8"), strings.Contains(out, "NVIDIA Corporation AD104GL [L4]"):
-		return true, L4, nil
+		return L4, nil
 	case strings.Contains(out, "NVIDIA Corporation Device 2330"), strings.Contains(out, "NVIDIA Corporation GH100[H100"):
-		return true, H100, nil
+		return H100, nil
 	default:
-		return true, Others, nil
+		return Others, nil
 	}
 }
 
diff --git a/src/cmd/cos_gpu_installer/internal/installer/installer.go b/src/cmd/cos_gpu_installer/internal/installer/installer.go
index 066f44f..f847822 100644
--- a/src/cmd/cos_gpu_installer/internal/installer/installer.go
+++ b/src/cmd/cos_gpu_installer/internal/installer/installer.go
@@ -46,7 +46,11 @@
 )
 
 // VerifyDriverInstallation runs some commands to verify the driver installation.
-func VerifyDriverInstallation() error {
+func VerifyDriverInstallation(noVerify bool) error {
+	if noVerify {
+		log.Infof("Flag --no-verify is set, skip driver installation verification.")
+		return nil
+	}
 	log.Info("Verifying GPU driver installation")
 
 	newPathEnv := fmt.Sprintf("%s/bin:%s", gpuInstallDirContainer, os.Getenv("PATH"))
@@ -69,7 +73,7 @@
 }
 
 // ConfigureCachedInstalltion updates ldconfig and installs the cached GPU driver kernel modules.
-func ConfigureCachedInstalltion(gpuInstallDirHost string, needSigned, test, kernelOpen bool) error {
+func ConfigureCachedInstalltion(gpuInstallDirHost string, needSigned, test, kernelOpen, noVerify bool) error {
 	log.V(2).Info("Configuring cached driver installation")
 
 	if err := createHostDirBindMount(gpuInstallDirHost, gpuInstallDirContainer); err != nil {
@@ -78,7 +82,7 @@
 	if err := updateContainerLdCache(); err != nil {
 		return errors.Wrap(err, "failed to configure cached driver installation")
 	}
-	if err := loadGPUDrivers(needSigned, test, kernelOpen); err != nil {
+	if err := loadGPUDrivers(needSigned, test, kernelOpen, noVerify); err != nil {
 		return errors.Wrap(err, "failed to configure cached driver installation")
 	}
 
@@ -304,7 +308,7 @@
 
 // RunDriverInstaller runs GPU driver installer. Only works if the provided
 // installer includes precompiled drivers.
-func RunDriverInstaller(toolchainDir, installerFilename, driverVersion string, needSigned, test, legacyLink bool) error {
+func RunDriverInstaller(toolchainDir, installerFilename, driverVersion string, needSigned, test, legacyLink, noVerify bool) error {
 	log.Info("Running GPU driver installer")
 
 	// Extract files to a fixed path first to make sure md5sum of generated gpu drivers are consistent.
@@ -389,7 +393,7 @@
 	// The legacy linking method does this when the installer doesn't fail (i.e.
 	// module signature verification isn't enforced).
 	if (legacyLink && legacyInstallerFailed) || !legacyLink {
-		if err := loadGPUDrivers(needSigned, test, false); err != nil {
+		if err := loadGPUDrivers(needSigned, test, false, noVerify); err != nil {
 			return fmt.Errorf("%w: %v", ErrDriverLoad, err)
 		}
 	}
@@ -518,7 +522,7 @@
 	return nil
 }
 
-func loadGPUDrivers(needSigned, test, kernelOpen bool) error {
+func loadGPUDrivers(needSigned, test, kernelOpen, noVerify bool) error {
 	// Don't need to load public key in test mode. Platform key is used.
 	if needSigned && !test && !kernelOpen {
 		if err := modules.LoadPublicKey("gpu-key", filepath.Join(gpuInstallDirContainer, "pubkey.der"), modules.SecondaryKeyring); err != nil {
@@ -529,6 +533,10 @@
 			log.Infof("Falied to load public key to IMA keyring, err: %v", err)
 		}
 	}
+	if noVerify {
+		log.Infof("Flag --no-verify is set, skip kernel module loading.")
+		return nil
+	}
 	kernelModulePath := filepath.Join(gpuInstallDirContainer, "drivers")
 	gpuModules := map[string]string{
 		"nvidia":         filepath.Join(kernelModulePath, "nvidia.ko"),
@@ -621,7 +629,7 @@
 	return driverVersion
 }
 
-func RunDriverInstallerPrebuiltModules(downloader *cos.GCSDownloader, installerFilename, driverVersion string) error {
+func RunDriverInstallerPrebuiltModules(downloader *cos.GCSDownloader, installerFilename, driverVersion string, noVerify bool) error {
 	// fetch the prebuilt modules
 	if err := downloader.DownloadArtifact(gpuInstallDirContainer, fmt.Sprintf(prebuiltModuleTemplate, driverVersion)); err != nil {
 		return fmt.Errorf("failed to download prebuilt modules: %v", err)
@@ -634,7 +642,7 @@
 	}
 
 	// load the prebuilt kernel modules
-	if err := loadGPUDrivers(false, false, true); err != nil {
+	if err := loadGPUDrivers(false, false, true, noVerify); err != nil {
 		return fmt.Errorf("%w: %v", ErrDriverLoad, err)
 	}