blob: 3d0d3cdd6b957bbf373871eebaadae5bdd23a5e1 [file] [log] [blame]
// Package installer provides functionality to install GPU drivers.
package installer
import (
stderrors "errors"
log ""
const (
// gpuInstallDirContainer is the in-container directory that the host's GPU
// install directory is bind-mounted onto and that holds drivers, binaries and
// libraries.
gpuInstallDirContainer = "/usr/local/nvidia"
// defaultGPUDriverFile is the artifact name fetched to learn the default GPU
// driver version.
defaultGPUDriverFile = "gpu_default_version"
// latestGPUDriverFile is the artifact name fetched to learn the latest GPU
// driver version.
latestGPUDriverFile = "gpu_latest_version"
// precompiledInstallerURLFormat is the format for precompiled installer URLs.
// NOTE(review): the value is empty in this view; it presumably should be a
// format string with one verb per argument supplied by
// getPrecompiledInstallerURL — confirm against the upstream source.
precompiledInstallerURLFormat = ""
// defaultFilePermission is the mode passed to os.MkdirAll for directories
// created by this package.
defaultFilePermission = 0755
// signedURLKey is the query parameter name used to cut the query string off
// the base name of a download URL.
signedURLKey = "Expires"
var (
// ErrDriverLoad indicates that installed GPU drivers could not be loaded into
// the kernel.
ErrDriverLoad = stderrors.New("failed to load GPU drivers")
// errInstallerFailed is the sentinel wrapped by linkDriversLegacy when the
// nvidia-installer command itself fails; RunDriverInstaller checks for it with
// errors.Is to tell that case apart from other linking failures.
errInstallerFailed = stderrors.New("failed to run GPU driver installer")
// VerifyDriverInstallation runs some commands to verify the driver installation.
func VerifyDriverInstallation() error {
log.Info("Verifying GPU driver installation")
newPathEnv := fmt.Sprintf("%s/bin:%s", gpuInstallDirContainer, os.Getenv("PATH"))
os.Setenv("PATH", newPathEnv)
// Run nvidia-smi to check whether nvidia GPU driver is installed.
if err := utils.RunCommandAndLogOutput(exec.Command("nvidia-smi"), false); err != nil {
return errors.Wrap(err, "failed to verify GPU driver installation")
// Create unified memory device file.
if err := utils.RunCommandAndLogOutput(exec.Command("nvidia-modprobe", "-c0", "-u", "-m"), false); err != nil {
return errors.Wrap(err, "failed to create unified memory device file")
return nil
// ConfigureCachedInstalltion updates ldconfig and installs the cached GPU driver kernel modules.
func ConfigureCachedInstalltion(gpuInstallDirHost string, needSigned, test bool) error {
log.V(2).Info("Configuring cached driver installation")
if err := createHostDirBindMount(gpuInstallDirHost, gpuInstallDirContainer); err != nil {
return errors.Wrap(err, "failed to create driver installation dir")
if err := updateContainerLdCache(); err != nil {
return errors.Wrap(err, "failed to configure cached driver installation")
if err := loadGPUDrivers(needSigned, test); err != nil {
return errors.Wrap(err, "failed to configure cached driver installation")
return nil
// DownloadToInstallDir downloads data from the provided URL to the GPU
// installation directory. It returns the basename of the locally written file.
func DownloadToInstallDir(url, infoStr string) (string, error) {
outputPath := filepath.Join(gpuInstallDirContainer, strings.Split(path.Base(url), "?"+signedURLKey+"=")[0])
if err := utils.DownloadContentFromURL(url, outputPath, infoStr); err != nil {
return "", fmt.Errorf("failed to download file with description %q from %q and install into %q: %v", infoStr, url, gpuInstallDirContainer, err)
return filepath.Base(outputPath), nil
// DownloadDriverInstaller downloads GPU driver installer given driver version and COS version.
func DownloadDriverInstaller(driverVersion, cosMilestone, cosBuildNumber string) (string, error) {
log.Infof("Downloading GPU driver installer version %s", driverVersion)
downloadURL, err := getDriverInstallerDownloadURL(driverVersion, cosMilestone, cosBuildNumber)
if err != nil {
return "", errors.Wrap(err, "failed to get driver installer download URL")
return DownloadToInstallDir(downloadURL, "GPU driver installer")
// ConfigureDriverInstallationDirs configures GPU driver installation directories by creating mounts.
func ConfigureDriverInstallationDirs(gpuInstallDirHost string, kernelRelease string) (chan<- int, error) {
log.Info("Configuring driver installation directories")
if err := createHostDirBindMount(gpuInstallDirHost, gpuInstallDirContainer); err != nil {
return nil, errors.Wrap(err, "failed to create dirver installation dir")
if err := createOverlayFS(
"/usr/bin", gpuInstallDirContainer+"/bin", gpuInstallDirContainer+"/bin-workdir"); err != nil {
return nil, errors.Wrap(err, "failed to create bin overlay")
if err := createOverlayFS(
"/usr/lib/x86_64-linux-gnu", gpuInstallDirContainer+"/lib64", gpuInstallDirContainer+"/lib64-workdir"); err != nil {
return nil, errors.Wrap(err, "failed to create lib64 overlay")
modulePath := filepath.Join("/lib/modules", kernelRelease, "video")
if err := createOverlayFS(
modulePath, gpuInstallDirContainer+"/drivers", gpuInstallDirContainer+"/drivers-workdir"); err != nil {
return nil, errors.Wrap(err, "failed to create drivers overlay")
if err := updateContainerLdCache(); err != nil {
return nil, errors.Wrap(err, "failed to update container ld cache")
ch := make(chan int, 1)
go func() {
// cleans up mounts created above.
syscall.Unmount("/usr/bin", 0)
syscall.Unmount("/usr/lib/x86_64-linux-gnu", 0)
syscall.Unmount(modulePath, 0)
syscall.Unmount(gpuInstallDirContainer, 0)
return ch, nil
func extractPrecompiled(nvidiaDir string) error {
log.Info("Extracting precompiled artifacts...")
precompiledDir := filepath.Join(nvidiaDir, "kernel", "precompiled")
files, err := os.ReadDir(precompiledDir)
if err != nil {
return fmt.Errorf("failed to read %q: %v", precompiledDir, err)
var precompiledArchive string
if len(files) == 0 {
return stderrors.New("failed to find precompiled artifacts in this nvidia installer")
if len(files) == 1 {
precompiledArchive = filepath.Join(precompiledDir, files[0].Name())
if len(files) > 1 {
var fileNames []string
for _, f := range files {
fileNames = append(fileNames, f.Name())
log.Warningf("Found multiple precompiled archives in this nvidia installer: %q", strings.Join(fileNames, ","))
log.Warningf("Using precompiled archive named %q", fileNames[len(fileNames)-1])
precompiledArchive = filepath.Join(precompiledDir, fileNames[len(fileNames)-1])
cmd := exec.Command(filepath.Join(nvidiaDir, "mkprecompiled"), "--unpack", precompiledArchive, "-o", precompiledDir)
if err := utils.RunCommandAndLogOutput(cmd, false); err != nil {
return fmt.Errorf("failed to unpack precompiled artifacts: %v", err)
log.Info("Done extracting precompiled artifacts")
return nil
// linkDrivers produces the GPU kernel modules inside <nvidiaDir>/kernel using
// the ld.lld linker from toolchainDir. nvidia.ko and nvidia-modeset.ko are
// linked from the precompiled interface objects and the shipped kernel
// binaries; nvidia-uvm.ko and nvidia-drm.ko are moved out of the precompiled
// directory as they are. It returns an error if uname, either link command or
// either rename fails.
func linkDrivers(toolchainDir, nvidiaDir string) error {
log.Info("Linking drivers...")
// The kernel release of the running kernel selects the linux-headers
// directory that holds the linker script.
var kernelInfo unix.Utsname
if err := unix.Uname(&kernelInfo); err != nil {
return fmt.Errorf("failed to find kernel release info using uname: %v", err)
// Release is an array; strip the NUL bytes around the release string.
kernelRelease := strings.Trim(string(kernelInfo.Release[:]), "\x00")
// COS 85+ kernels use lld as their linker
linker := filepath.Join(toolchainDir, "bin", "ld.lld")
// NOTE(review): the final path element is an empty string here and in the
// fallback below, so both resolve to the scripts directory itself. The linker
// script file names appear to be missing — confirm against the upstream source.
linkerScript := filepath.Join(toolchainDir, "usr", "src", "linux-headers-"+kernelRelease, "scripts", "")
if _, err := os.Stat(linkerScript); os.IsNotExist(err) {
// Fall back to the alternate linker script, which is used in the 5.4 kernel
linkerScript = filepath.Join(toolchainDir, "usr", "src", "linux-headers-"+kernelRelease, "scripts", "")
nvidiaKernelDir := filepath.Join(nvidiaDir, "kernel")
// Link nvidia.ko
nvidiaObjs := []string{
filepath.Join(nvidiaKernelDir, "precompiled", "nv-linux.o"),
filepath.Join(nvidiaKernelDir, "nvidia", "nv-kernel.o_binary"),
// -T selects the linker script and -r requests a relocatable output file.
args := append([]string{"-T", linkerScript, "-r", "-o", filepath.Join(nvidiaKernelDir, "nvidia.ko")}, nvidiaObjs...)
cmd := exec.Command(linker, args...)
log.Infof("Running link command: %v", cmd.Args)
if err := utils.RunCommandAndLogOutput(cmd, false); err != nil {
return fmt.Errorf("failed to link nvidia.ko: %v", err)
// Link nvidia-modeset.ko
modesetObjs := []string{
filepath.Join(nvidiaKernelDir, "precompiled", "nv-modeset-linux.o"),
filepath.Join(nvidiaKernelDir, "nvidia-modeset", "nv-modeset-kernel.o_binary"),
args = append([]string{"-T", linkerScript, "-r", "-o", filepath.Join(nvidiaKernelDir, "nvidia-modeset.ko")}, modesetObjs...)
cmd = exec.Command(linker, args...)
log.Infof("Running link command: %v", cmd.Args)
if err := utils.RunCommandAndLogOutput(cmd, false); err != nil {
return fmt.Errorf("failed to link nvidia-modeset.ko: %v", err)
// nvidia-uvm.ko is pre-linked; move to kernel dir
oldPath := filepath.Join(nvidiaKernelDir, "precompiled", "nvidia-uvm.ko")
newPath := filepath.Join(nvidiaKernelDir, "nvidia-uvm.ko")
if err := unix.Rename(oldPath, newPath); err != nil {
return fmt.Errorf("failed to move %q to %q: %v", oldPath, newPath, err)
// nvidia-drm.ko is pre-linked; move to kernel dir
oldPath = filepath.Join(nvidiaKernelDir, "precompiled", "nvidia-drm.ko")
newPath = filepath.Join(nvidiaKernelDir, "nvidia-drm.ko")
if err := unix.Rename(oldPath, newPath); err != nil {
return fmt.Errorf("failed to move %q to %q: %v", oldPath, newPath, err)
log.Info("Done linking drivers")
return nil
// linkDriversLegacy links the drivers by running the nvidia-installer binary
// found in nvidiaDir. While the installer runs, bin/ld in toolchainDir (when
// present) is renamed to bin/ld.orig and restored afterwards. If the installer
// command fails, the returned error wraps errInstallerFailed so callers can
// detect that case with errors.Is.
func linkDriversLegacy(toolchainDir, nvidiaDir string) error {
log.Info("Linking drivers using legacy method...")
// The legacy linking method needs to use "/usr/bin/ld" as the linker to
// maintain bit-for-bit compatibility with driver signatures. The legacy
// linking method also finds the linker by searching the PATH for "ld". If
// bin/ld is present in the toolchain, rename it temporarily so the legacy
// linking method doesn't use it.
ld := filepath.Join(toolchainDir, "bin", "ld")
// Lstat inspects the path itself rather than following a symlink. Any result
// other than "does not exist" takes the rename path.
if _, err := os.Lstat(ld); !os.IsNotExist(err) {
dst := filepath.Join(toolchainDir, "bin", "ld.orig")
if err := unix.Rename(ld, dst); err != nil {
return fmt.Errorf("failed to rename %q to %q: %v", ld, dst, err)
// Put bin/ld back once the installer has run.
defer func() {
if err := unix.Rename(dst, ld); err != nil {
// At this point, this error is non-fatal. It will become fatal when
// something tries to use bin/ld in the toolchain. At time of writing,
// nothing uses bin/ld after this point.
log.Warningf("Could not restore %q", ld)
// NOTE(review): only the --log-file-name argument is visible here; the
// installer's argument list appears to be truncated in this view — confirm
// against the upstream source.
cmd := exec.Command(filepath.Join(nvidiaDir, "nvidia-installer"),
"--log-file-name="+filepath.Join(gpuInstallDirContainer, "nvidia-installer.log"),
log.Infof("Installer arguments:\n%v", cmd.Args)
err := utils.RunCommandAndLogOutput(cmd, false)
// The completion message is logged whether or not the installer succeeded.
log.Info("Done linking drivers")
if err != nil {
return fmt.Errorf("%w: %v", errInstallerFailed, err)
return nil
// installUserLibs runs the nvidia-installer binary found in nvidiaDir to
// install the userspace libraries, writing the installer log under the GPU
// install directory. It returns an error if the installer command fails.
func installUserLibs(nvidiaDir string) error {
log.Info("Installing userspace libraries...")
// NOTE(review): only the --log-file-name argument is visible here; the
// installer's argument list appears to be truncated in this view — confirm
// against the upstream source.
cmd := exec.Command(filepath.Join(nvidiaDir, "nvidia-installer"),
"--log-file-name="+filepath.Join(gpuInstallDirContainer, "nvidia-installer.log"),
log.Infof("Installer arguments:\n%v", cmd.Args)
if err := utils.RunCommandAndLogOutput(cmd, false); err != nil {
return fmt.Errorf("failed to run GPU driver installer: %v", err)
log.Info("Done installing userspace libraries")
return nil
// RunDriverInstaller runs GPU driver installer. Only works if the provided
// installer includes precompiled drivers.
//
// installerFilename is executed with sh from the GPU install directory and
// extracted to /tmp/extract. The precompiled artifacts are then unpacked and
// the drivers linked, using linkDriversLegacy when legacyLink is true and
// linkDrivers otherwise; toolchainDir is forwarded to whichever is used. When
// needSigned is true each .ko file is signed into the drivers directory and
// the public key is copied next to it; otherwise, for non-legacy linking, the
// .ko files are copied unsigned. Finally the modules are loaded and the
// userspace libraries installed, unless the legacy installer already
// succeeded. test is forwarded to loadGPUDrivers.
//
// A module load failure is returned wrapped in ErrDriverLoad.
func RunDriverInstaller(toolchainDir, installerFilename string, needSigned, test, legacyLink bool) error {
log.Info("Running GPU driver installer")
// Extract files to a fixed path first to make sure md5sum of generated gpu drivers are consistent.
extractDir := "/tmp/extract"
// Start from a clean slate so leftovers from a previous run cannot interfere.
if err := os.RemoveAll(extractDir); err != nil {
return fmt.Errorf("failed to clean %q: %v", extractDir, err)
cmd := exec.Command("sh", installerFilename, "-x", "--target", extractDir)
// installerFilename is resolved relative to the GPU install directory.
cmd.Dir = gpuInstallDirContainer
if err := cmd.Run(); err != nil {
return errors.Wrap(err, "failed to extract installer files")
// Extract precompiled artifacts.
if err := extractPrecompiled(extractDir); err != nil {
return fmt.Errorf("failed to extract precompiled artifacts: %v", err)
// Link drivers.
var legacyInstallerFailed bool
if legacyLink {
if err := linkDriversLegacy(toolchainDir, extractDir); err != nil {
if stderrors.Is(err, errInstallerFailed) {
// This case is expected when module signature enforcement is enabled.
// Since the installer terminated early, we need to re-run it after
// signing modules.
// If we don't sign modules (i.e. needSigned is false), then we'll see
// an error when we load the modules, and that will be fatal.
legacyInstallerFailed = true
} else {
return fmt.Errorf("failed to link drivers: %v", err)
} else {
if err := linkDrivers(toolchainDir, extractDir); err != nil {
return fmt.Errorf("failed to link drivers: %v", err)
// List the linked modules; only entries ending in ".ko" are processed below.
kernelFiles, err := ioutil.ReadDir(filepath.Join(extractDir, "kernel"))
if err != nil {
return errors.Wrapf(err, "failed to list files in directory %s", filepath.Join(extractDir, "kernel"))
if needSigned {
// sign GPU drivers.
for _, kernelFile := range kernelFiles {
if strings.HasSuffix(kernelFile.Name(), ".ko") {
module := kernelFile.Name()
signaturePath := signing.GetModuleSignature(module)
modulePath := filepath.Join(extractDir, "kernel", module)
signedModulePath := filepath.Join(gpuInstallDirContainer, "drivers", module)
if err := modules.AppendSignature(signedModulePath, modulePath, signaturePath); err != nil {
return errors.Wrapf(err, "failed to sign kernel module %s", module)
// Copy public key.
if err := utils.CopyFile(signing.GetPublicKeyDer(), filepath.Join(gpuInstallDirContainer, "pubkey.der")); err != nil {
return errors.Wrapf(err, "failed to copy file %s", signing.GetPublicKeyDer())
} else if !legacyLink {
// Copy drivers to the desired end directory. This is done as part of
// `modules.AppendSignature` in the above signing block, but we need to do
// it for unsigned modules as well. Legacy linking already does this copy
// in the unsigned case (we expect that legacy linking also does this when
// the installer fails); we skip this block in the legacy link case to avoid
// redundancy.
for _, kernelFile := range kernelFiles {
if strings.HasSuffix(kernelFile.Name(), ".ko") {
module := kernelFile.Name()
src := filepath.Join(extractDir, "kernel", module)
dst := filepath.Join(gpuInstallDirContainer, "drivers", module)
if err := utils.CopyFile(src, dst); err != nil {
return fmt.Errorf("failed to copy kernel module %q: %v", module, err)
// Load GPU drivers.
// The legacy linking method does this when the installer doesn't fail (i.e.
// module signature verification isn't enforced).
if (legacyLink && legacyInstallerFailed) || !legacyLink {
if err := loadGPUDrivers(needSigned, test); err != nil {
return fmt.Errorf("%w: %v", ErrDriverLoad, err)
// Install libs.
// The legacy linking method does this when the installer doesn't fail (i.e.
// module signature verification isn't enforced).
if (legacyLink && legacyInstallerFailed) || !legacyLink {
if err := installUserLibs(extractDir); err != nil {
return fmt.Errorf("failed to install userspace libraries: %v", err)
return nil
// GetDefaultGPUDriverVersion gets the default GPU driver version.
func GetDefaultGPUDriverVersion(downloader cos.ArtifactsDownloader) (string, error) {
log.Info("Getting the default GPU driver version")
content, err := downloader.GetArtifact(defaultGPUDriverFile)
if err != nil {
return "", errors.Wrapf(err, "failed to get default GPU driver version")
return strings.Trim(string(content), "\n "), nil
// GetLatestGPUDriverVersion gets the latest GPU driver version.
func GetLatestGPUDriverVersion(downloader cos.ArtifactsDownloader) (string, error) {
log.Info("Getting the latest GPU driver version")
content, err := downloader.GetArtifact(latestGPUDriverFile)
if err != nil {
return "", errors.Wrapf(err, "failed to get latest GPU driver version")
return strings.Trim(string(content), "\n "), nil
func updateContainerLdCache() error {
log.V(2).Info("Updating container's ld cache")
f, err := os.Create("/etc/")
if err != nil {
return errors.Wrap(err, "failed to update ld cache")
f.WriteString(gpuInstallDirContainer + "/lib64")
err = exec.Command("ldconfig").Run()
if err != nil {
return errors.Wrap(err, "failed to update ld cache")
return nil
func getDriverInstallerDownloadURL(driverVersion, cosMilestone, cosBuildNumber string) (string, error) {
metadataZone, err := utils.GetGCEMetadata("zone")
if err != nil {
return "", errors.Wrap(err, "failed to get GCE metadata zone")
downloadLocation := getInstallerDownloadLocation(metadataZone)
return getPrecompiledInstallerURL(driverVersion, cosMilestone, cosBuildNumber, downloadLocation), nil
// getInstallerDownloadLocation maps a GCE metadata zone value to a download
// location. Only the text after the last "/" is considered, and of that only
// the part before the first "-": "us" and "asia" map to themselves, "europe"
// maps to "eu", and anything else falls back to "us".
func getInstallerDownloadLocation(metadataZone string) string {
	// LastIndex yields -1 when there is no "/", which keeps the whole string.
	zone := metadataZone[strings.LastIndex(metadataZone, "/")+1:]

	continent := zone
	if dash := strings.Index(zone, "-"); dash >= 0 {
		continent = zone[:dash]
	}

	switch continent {
	case "us":
		return "us"
	case "asia":
		return "asia"
	case "europe":
		return "eu"
	default:
		return "us"
	}
}
func getPrecompiledInstallerURL(driverVersion, cosMilestone, cosBuildNumber, downloadLocation string) string {
// 418.67 -> 418
majorVersion := strings.Split(driverVersion, ".")[0]
// 12371.284.0 -> 12371-284-0
cosBuildNumber = strings.Replace(cosBuildNumber, ".", "-", -1)
return fmt.Sprintf(
downloadLocation, cosMilestone, majorVersion, driverVersion, driverVersion, cosMilestone, cosBuildNumber)
func createHostDirBindMount(hostDir, bindMountPath string) error {
if err := os.MkdirAll(hostDir, defaultFilePermission); err != nil {
return errors.Wrapf(err, "failed to create dir %s", hostDir)
if err := os.MkdirAll(bindMountPath, defaultFilePermission); err != nil {
return errors.Wrapf(err, "failed to create dir %s", bindMountPath)
if err := syscall.Mount(hostDir, bindMountPath, "", syscall.MS_BIND, ""); err != nil {
return errors.Wrapf(err, "failed to create bind mount %s", bindMountPath)
// Remount to clear noexec flag.
if err := syscall.Mount("", bindMountPath, "",
syscall.MS_REMOUNT|syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_RELATIME, ""); err != nil {
return errors.Wrapf(err, "failed to remount %s", bindMountPath)
return nil
func createOverlayFS(lowerDir, upperDir, workDir string) error {
if err := os.MkdirAll(lowerDir, defaultFilePermission); err != nil {
return errors.Wrapf(err, "failed to create dir %s", lowerDir)
if err := os.MkdirAll(upperDir, defaultFilePermission); err != nil {
return errors.Wrapf(err, "failed to create dir %s", upperDir)
if err := os.MkdirAll(workDir, defaultFilePermission); err != nil {
return errors.Wrapf(err, "failed to create dir %s", workDir)
if err := syscall.Mount("none", lowerDir, "overlay", 0,
fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerDir, upperDir, workDir)); err != nil {
return errors.Wrapf(err, "failed to create overlayfs (lowerdir=%s, upperdir=%s)", lowerDir, upperDir)
return nil
func loadGPUDrivers(needSigned, test bool) error {
// Don't need to load public key in test mode. Platform key is used.
if needSigned && !test {
if err := modules.LoadPublicKey("gpu-key", filepath.Join(gpuInstallDirContainer, "pubkey.der")); err != nil {
return errors.Wrap(err, "failed to load public key")
gpuModules := map[string]string{
"nvidia": filepath.Join(gpuInstallDirContainer, "drivers", "nvidia.ko"),
"nvidia_uvm": filepath.Join(gpuInstallDirContainer, "drivers", "nvidia-uvm.ko"),
"nvidia_drm": filepath.Join(gpuInstallDirContainer, "drivers", "nvidia-drm.ko"),
"nvidia_modeset": filepath.Join(gpuInstallDirContainer, "drivers", "nvidia-modeset.ko"),
// Need to load modules in order due to module dependency.
moduleNames := []string{"nvidia", "nvidia_uvm", "nvidia_drm", "nvidia_modeset"}
for _, moduleName := range moduleNames {
modulePath := gpuModules[moduleName]
if err := modules.LoadModule(moduleName, modulePath); err != nil {
return errors.Wrapf(err, "failed to load module %s", modulePath)
return nil