cos-gpu-installer: Switch precompiled driver and signature location to COS build artifacts for M109
Precompiled drivers for M105 and below are fetched from NVIDIA
locations such as gs://nvidia-drivers-us-public/nvidia-cos-project/...
Example:
gs://nvidia-drivers-us-public/nvidia-cos-project/105/tesla/470_00/470.182.03/NVIDIA-Linux-x86_64-470.182.03_105-17412.1.75.cos
Signatures for those drivers are available at gs://cos-tools/build_id/extensions/gpu/
With the COS precompiled drivers, the location of both the drivers and
the signatures can be changed to gs://cos-tools/build_id/.
BUG=b/292160336
TEST=manual
RELEASE_NOTE=Switch precompiled driver and signature location to COS
build artifacts for M109.
Change-Id: Icd1a96ac45e9a356fb60477b99834b9e0ee5e66c
Reviewed-on: https://cos-review.googlesource.com/c/cos/tools/+/52452
Cloud-Build: GCB Service account <228075978874@cloudbuild.gserviceaccount.com>
Tested-by: Arnav Kansal <rnv@google.com>
Reviewed-by: Oleksandr Tymoshenko <ovt@google.com>
diff --git a/src/cmd/cos_gpu_installer/internal/commands/install.go b/src/cmd/cos_gpu_installer/internal/commands/install.go
index a8f639f..fc9e3b4 100644
--- a/src/cmd/cos_gpu_installer/internal/commands/install.go
+++ b/src/cmd/cos_gpu_installer/internal/commands/install.go
@@ -116,6 +116,7 @@
kernelOpen bool
noVerify bool
kernelModuleParams modules.ModuleParameters
+ selfPrecompiled bool
}
// Name implements subcommands.Command.Name.
@@ -207,7 +208,6 @@
}
var gpuType GPUType = NO_GPU
-
if !c.prepareBuildTools {
if gpuType, err = c.getGPUTypeInfo(); err != nil {
if !c.noVerify {
@@ -218,23 +218,17 @@
}
}
+ if milestone := envReader.Milestone(); selfPrecompiledCandidate(milestone) {
+ c.selfPrecompiled = true
+ }
+
downloader := cos.NewGCSDownloader(envReader, c.gcsDownloadBucket, c.gcsDownloadPrefix)
if c.nvidiaInstallerURL == "" {
versionInput := c.driverVersion
- milestone, err := strconv.Atoi(envReader.Milestone())
- if err != nil {
- c.logError(errors.Wrap(err, "failed to parse milestone number"))
- return subcommands.ExitFailure
- }
c.driverVersion, err = getDriverVersion(downloader, c.driverVersion)
if err != nil {
- if versionInput == "latest" && milestone < 93 {
- c.logError(errors.Wrap(err, "'--version=latest' is only supported on COS M93 and onwards, please unset this flag"))
- return subcommands.ExitFailure
- } else {
- c.logError(errors.Wrap(err, "failed to get default driver version"))
- return subcommands.ExitFailure
- }
+ c.logError(errors.Wrap(err, fmt.Sprintf("failed to get %s driver version", versionInput)))
+ return subcommands.ExitFailure
}
if err := c.checkDriverCompatibility(downloader, gpuType); err != nil {
c.logError(errors.Wrap(err, "failed to check driver compatibility"))
@@ -267,7 +261,7 @@
cacher = installer.NewCacher(hostInstallDir, envReader.BuildNumber(), c.driverVersion)
if isCached, isOpen, err := cacher.IsCached(); isCached && err == nil {
log.V(2).Info("Found cached version, NOT building the drivers.")
- if err := installer.ConfigureCachedInstalltion(hostInstallDir, !c.unsignedDriver, c.test, isOpen, c.noVerify, c.kernelModuleParams); err != nil {
+ if err := installer.ConfigureCachedInstallation(hostInstallDir, !c.unsignedDriver, c.test, isOpen, c.noVerify, c.selfPrecompiled, c.kernelModuleParams); err != nil {
c.logError(errors.Wrap(err, "failed to configure cached installation"))
return subcommands.ExitFailure
}
@@ -362,8 +356,12 @@
var installerFile string
if c.nvidiaInstallerURL == "" {
- installerFile, err = installer.DownloadDriverInstaller(
- c.driverVersion, envReader.Milestone(), envReader.BuildNumber())
+ if c.selfPrecompiled {
+ installerFile, err = installer.DownloadDriverInstallerV2(downloader, c.driverVersion)
+ } else {
+ installerFile, err = installer.DownloadDriverInstaller(
+ c.driverVersion, envReader.Milestone(), envReader.BuildNumber())
+ }
if err != nil {
return errors.Wrap(err, "failed to download GPU driver installer")
}
@@ -379,19 +377,27 @@
if err := signing.DownloadDriverSignaturesFromURL(c.signatureURL); err != nil {
return errors.Wrap(err, "failed to download driver signature")
}
- } else if err := signing.DownloadDriverSignatures(downloader, c.driverVersion); err != nil {
- if strings.Contains(err.Error(), "404 Not Found") {
- return fmt.Errorf("The GPU driver is not available for the COS version. Please wait for half a day and retry.")
+ } else {
+ if c.selfPrecompiled {
+ if err = signing.DownloadDriverSignaturesV2(downloader, c.driverVersion); err != nil {
+ return errors.Wrap(err, "failed to download driver signature")
+ }
+ } else {
+ if err := signing.DownloadDriverSignatures(downloader, c.driverVersion); err != nil {
+ if strings.Contains(err.Error(), "404 Not Found") {
+ return fmt.Errorf("The GPU driver is not available for the COS version. Please wait for half a day and retry.")
+ }
+ return errors.Wrap(err, "failed to download driver signature")
+ }
}
- return errors.Wrap(err, "failed to download driver signature")
}
}
- if err := installer.RunDriverInstaller(toolchainPkgDir, installerFile, c.driverVersion, !c.unsignedDriver, c.test, false, c.noVerify, c.kernelModuleParams); err != nil {
+ if err := installer.RunDriverInstaller(toolchainPkgDir, installerFile, c.driverVersion, !c.unsignedDriver, c.test, false, c.noVerify, c.selfPrecompiled, c.kernelModuleParams); err != nil {
if errors.Is(err, installer.ErrDriverLoad) {
// Drivers were linked, but couldn't load; try again with legacy linking
log.Infof("Failed to load kernel module, err: %v. Retrying driver installation with legacy linking", err)
- if err := installer.RunDriverInstaller(toolchainPkgDir, installerFile, c.driverVersion, !c.unsignedDriver, c.test, true, c.noVerify, c.kernelModuleParams); err != nil {
+ if err := installer.RunDriverInstaller(toolchainPkgDir, installerFile, c.driverVersion, !c.unsignedDriver, c.test, true, c.noVerify, c.selfPrecompiled, c.kernelModuleParams); err != nil {
return fmt.Errorf("failed to run GPU driver installer: %v", err)
}
} else {
@@ -495,3 +501,12 @@
}
return nil
}
+
+func selfPrecompiledCandidate(milestone string) bool {
+ for _, v := range []string{"93", "97", "101", "105"} {
+ if v == milestone {
+ return false
+ }
+ }
+ return true
+}
diff --git a/src/cmd/cos_gpu_installer/internal/installer/installer.go b/src/cmd/cos_gpu_installer/internal/installer/installer.go
index ef2b8b8..497cfd0 100644
--- a/src/cmd/cos_gpu_installer/internal/installer/installer.go
+++ b/src/cmd/cos_gpu_installer/internal/installer/installer.go
@@ -29,6 +29,7 @@
gpuFirmwareDirContainer = "/usr/local/nvidia/firmware/nvidia"
templateGPUDriverFile = "gpu_%s_version"
precompiledInstallerURLFormat = "https://storage.googleapis.com/nvidia-drivers-%s-public/nvidia-cos-project/%s/tesla/%s_00/%s/NVIDIA-Linux-x86_64-%s_%s-%s.cos"
+ precompiledDriverTemplate = "NVIDIA-Linux-x86_64-%s-custom.run"
defaultFilePermission = 0755
signedURLKey = "Expires"
prebuiltModuleTemplate = "nvidia-drivers-%s.tgz"
@@ -73,8 +74,8 @@
return nil
}
-// ConfigureCachedInstalltion updates ldconfig and installs the cached GPU driver kernel modules.
-func ConfigureCachedInstalltion(gpuInstallDirHost string, needSigned, test, kernelOpen, noVerify bool, moduleParameters modules.ModuleParameters) error {
+// ConfigureCachedInstallation updates ldconfig and installs the cached GPU driver kernel modules.
+func ConfigureCachedInstallation(gpuInstallDirHost string, needSigned, test, kernelOpen, noVerify, selfPrecompiled bool, moduleParameters modules.ModuleParameters) error {
log.V(2).Info("Configuring cached driver installation")
if err := createHostDirBindMount(gpuInstallDirHost, gpuInstallDirContainer); err != nil {
@@ -83,7 +84,7 @@
if err := updateContainerLdCache(); err != nil {
return errors.Wrap(err, "failed to configure cached driver installation")
}
- if err := loadGPUDrivers(moduleParameters, needSigned, test, kernelOpen, noVerify); err != nil {
+ if err := loadGPUDrivers(moduleParameters, needSigned, test, kernelOpen, noVerify, selfPrecompiled); err != nil {
return errors.Wrap(err, "failed to configure cached driver installation")
}
@@ -111,6 +112,17 @@
return DownloadToInstallDir(downloadURL, "GPU driver installer")
}
+// DownloadDriverInstallerV2 downloads GPU driver installer given driver version from COS build artifacts.
+func DownloadDriverInstallerV2(downloader *cos.GCSDownloader, driverVersion string) (string, error) {
+ log.Infof("Downloading GPU driver installer version %s", driverVersion)
+ installerFilename := fmt.Sprintf(precompiledDriverTemplate, driverVersion)
+ err := downloader.DownloadArtifact(gpuInstallDirContainer, installerFilename)
+ if err != nil {
+ return "", errors.Wrap(err, "failed to download installer")
+ }
+ return installerFilename, nil
+}
+
// ConfigureDriverInstallationDirs configures GPU driver installation directories by creating mounts.
func ConfigureDriverInstallationDirs(gpuInstallDirHost string, kernelRelease string) (chan<- int, error) {
log.Info("Configuring driver installation directories")
@@ -309,7 +321,7 @@
// RunDriverInstaller runs GPU driver installer. Only works if the provided
// installer includes precompiled drivers.
-func RunDriverInstaller(toolchainDir, installerFilename, driverVersion string, needSigned, test, legacyLink, noVerify bool, moduleParameters modules.ModuleParameters) error {
+func RunDriverInstaller(toolchainDir, installerFilename, driverVersion string, needSigned, test, legacyLink, noVerify, selfPrecompiled bool, moduleParameters modules.ModuleParameters) error {
log.Info("Running GPU driver installer")
// Extract files to a fixed path first to make sure md5sum of generated gpu drivers are consistent.
@@ -368,8 +380,10 @@
}
}
// Copy public key.
- if err := utils.CopyFile(signing.GetPublicKeyDer(), filepath.Join(gpuInstallDirContainer, "pubkey.der")); err != nil {
- return errors.Wrapf(err, "failed to copy file %s", signing.GetPublicKeyDer())
+ if !selfPrecompiled {
+ if err := utils.CopyFile(signing.GetPublicKeyDer(), filepath.Join(gpuInstallDirContainer, "pubkey.der")); err != nil {
+ return errors.Wrapf(err, "failed to copy file %s", signing.GetPublicKeyDer())
+ }
}
} else if !legacyLink {
// Copy drivers to the desired end directory. This is done as part of
@@ -394,7 +408,7 @@
// The legacy linking method does this when the installer doesn't fail (i.e.
// module signature verification isn't enforced).
if (legacyLink && legacyInstallerFailed) || !legacyLink {
- if err := loadGPUDrivers(moduleParameters, needSigned, test, false, noVerify); err != nil {
+ if err := loadGPUDrivers(moduleParameters, needSigned, test, false, noVerify, selfPrecompiled); err != nil {
return fmt.Errorf("%w: %v", ErrDriverLoad, err)
}
}
@@ -523,9 +537,9 @@
return nil
}
-func loadGPUDrivers(moduleParams modules.ModuleParameters, needSigned, test, kernelOpen, noVerify bool) error {
+func loadGPUDrivers(moduleParams modules.ModuleParameters, needSigned, test, kernelOpen, noVerify, selfPrecompiled bool) error {
// Don't need to load public key in test mode. Platform key is used.
- if needSigned && !test && !kernelOpen {
+ if needSigned && !test && !kernelOpen && !selfPrecompiled {
if err := modules.LoadPublicKey("gpu-key", filepath.Join(gpuInstallDirContainer, "pubkey.der"), modules.SecondaryKeyring); err != nil {
return errors.Wrap(err, "failed to load public key")
}
@@ -643,7 +657,7 @@
}
// load the prebuilt kernel modules
- if err := loadGPUDrivers(moduleParameters, false, false, true, noVerify); err != nil {
+ if err := loadGPUDrivers(moduleParameters, false, false, true, noVerify, true); err != nil {
return fmt.Errorf("%w: %v", ErrDriverLoad, err)
}
diff --git a/src/cmd/cos_gpu_installer/internal/signing/signature.go b/src/cmd/cos_gpu_installer/internal/signing/signature.go
index 68126eb..e67570b 100644
--- a/src/cmd/cos_gpu_installer/internal/signing/signature.go
+++ b/src/cmd/cos_gpu_installer/internal/signing/signature.go
@@ -2,6 +2,7 @@
package signing
import (
+ "fmt"
"os"
"os/exec"
"path/filepath"
@@ -16,12 +17,31 @@
gpuDriverPubKeyPem = "gpu-driver-cert.pem"
gpuDriverPubKeyDer = "gpu-driver-cert.der"
gpuDriverDummyKey = "dummy-key"
+ signatureTemplate = "nvidia-drivers-%s-signature.tar.gz"
)
var (
gpuDriverSigningDir = "/build/sign-gpu-driver"
)
+// DownloadDriverSignaturesV2 downloads GPU driver signatures from COS build artifacts.
+func DownloadDriverSignaturesV2(downloader *cos.GCSDownloader, driverVersion string) error {
+ if err := os.MkdirAll(gpuDriverSigningDir, 0755); err != nil {
+ return errors.Wrapf(err, "failed to create signing dir %s", gpuDriverSigningDir)
+ }
+ log.Infof("Downloading driver signature for version %s", driverVersion)
+ signatureName := fmt.Sprintf(signatureTemplate, driverVersion)
+ if err := downloader.DownloadArtifact(gpuDriverSigningDir, signatureName); err != nil {
+ return errors.Wrapf(err, "failed to download driver signature for version %s", driverVersion)
+ }
+
+ if err := decompressSignature(signatureName); err != nil {
+ return errors.Wrapf(err, "failed to decompress driver signature for version %s.", driverVersion)
+ }
+
+ return nil
+}
+
// DownloadDriverSignatures downloads GPU driver signatures.
func DownloadDriverSignatures(downloader cos.ExtensionsDownloader, driverVersion string) error {
if err := os.MkdirAll(gpuDriverSigningDir, 0755); err != nil {