| // Package installer provides functionality to install GPU drivers. |
| package installer |
| |
| import ( |
| "context" |
| stderrors "errors" |
| "fmt" |
| "io/fs" |
| "io/ioutil" |
| "os" |
| "os/exec" |
| "path" |
| "path/filepath" |
| "regexp" |
| "sort" |
| "strconv" |
| "strings" |
| "syscall" |
| |
| cosfs "cos.googlesource.com/cos/tools.git/src/pkg/fs" |
| |
| "cos.googlesource.com/cos/tools.git/src/cmd/cos_gpu_installer/internal/signing" |
| "cos.googlesource.com/cos/tools.git/src/pkg/cos" |
| "cos.googlesource.com/cos/tools.git/src/pkg/modules" |
| "cos.googlesource.com/cos/tools.git/src/pkg/utils" |
| |
| log "github.com/golang/glog" |
| "github.com/pkg/errors" |
| "golang.org/x/sys/unix" |
| ) |
| |
const (
	// Locations inside the installer container.
	gpuInstallDirContainer  = "/usr/local/nvidia"
	gpuFirmwareDirContainer = "/usr/local/nvidia/firmware/nvidia"

	// Artifact names and name templates. The templates take a driver version
	// (or a version alias) as their single %s argument.
	gpuDriverProtoBin         = "gpu_driver_versions.bin"
	templateGPUDriverFile     = "gpu_%s_version"
	precompiledDriverTemplate = "NVIDIA-Linux-x86_64-%s-custom.run"
	prebuiltModuleTemplate    = "nvidia-drivers-%s.tgz"

	// Permission bits used when creating directories and files.
	defaultFilePermission              = 0755
	imexConfigCfgDefaultFilePermission = 0644

	// signedURLKey is the query key that marks the start of the query string
	// in a signed download URL.
	signedURLKey = "Expires"

	// DefaultVersion is the alias for the default GPU driver version.
	DefaultVersion = "default"
	// LatestVersion is the alias for the latest GPU driver version.
	LatestVersion = "latest"

	// MajorGPUDriverArtifactPrefix is the prefix shared by the per-alias GPU
	// driver version artifacts (e.g. "gpu_R535_version").
	MajorGPUDriverArtifactPrefix = "gpu_"
	// MajorGPUDriverArtifactSuffix is the suffix shared by the per-alias GPU
	// driver version artifacts.
	MajorGPUDriverArtifactSuffix = "_version"

	// gdrdrv device node, module name, and the procfs file listing devices.
	gdrdrvDevicePath = "/dev/gdrdrv"
	gdrdrvModuleName = "gdrdrv"
	procDevicesPath  = "/proc/devices"
)
| |
var (
	// gspFileNames lists the GSP firmware file names that are looked up in an
	// extracted installer.
	gspFileNames = []string{
		"gsp.bin",
		"gsp_tu10x.bin",
		"gsp_ad10x.bin",
		"gsp_ga10x.bin",
	}

	// ErrDriverLoad indicates that installed GPU drivers could not be loaded
	// into the kernel.
	ErrDriverLoad = stderrors.New("failed to load GPU drivers")

	// errInstallerFailed marks a failure of the nvidia-installer binary itself,
	// so callers can tell it apart from other linking errors.
	errInstallerFailed = stderrors.New("failed to run GPU driver installer")
)
| |
| // VerifyDriverInstallation runs some commands to verify the driver installation. |
| func VerifyDriverInstallation(noVerify, debug, skipNvidiaSmi bool) error { |
| if noVerify { |
| log.Infof("Flag --no-verify is set, skip driver installation verification.") |
| return nil |
| } |
| log.Info("Verifying GPU driver installation") |
| |
| newPathEnv := fmt.Sprintf("%s/bin:%s", gpuInstallDirContainer, os.Getenv("PATH")) |
| os.Setenv("PATH", newPathEnv) |
| // Run nvidia-smi to check whether nvidia GPU driver is installed. |
| if !skipNvidiaSmi { |
| if err := utils.RunCommandAndLogOutput(exec.Command("nvidia-smi"), false); err != nil { |
| return errors.Wrap(err, "failed to verify GPU driver installation") |
| } |
| } |
| |
| // Create unified memory device file. |
| if err := utils.RunCommandAndLogOutput(exec.Command("nvidia-modprobe", "-c0", "-u", "-m"), false); err != nil { |
| return errors.Wrap(err, "failed to create unified memory device file") |
| } |
| |
| // Create symlinks in /dev/char for all possible NVIDIA device nodes |
| var err error |
| if debug { |
| err = utils.RunCommandAndLogOutput(exec.Command("nvidia-ctk", "system", "create-dev-char-symlinks", "--create-all"), false) |
| } else { |
| err = utils.RunCommandAndLogOutput(exec.Command("nvidia-ctk", "--quiet", "system", "create-dev-char-symlinks", "--create-all"), false) |
| } |
| if err != nil { |
| return errors.Wrap(err, "failed to create symlinks") |
| } |
| return nil |
| } |
| |
| // ConfigureCachedInstallation updates ldconfig and installs the cached GPU driver kernel modules. |
| func ConfigureCachedInstallation(gpuInstallDirHost, kernelDriversPath string, test, kernelOpen, noVerify bool, moduleParameters modules.ModuleParameters) error { |
| log.V(2).Info("Configuring cached driver installation") |
| |
| if err := createHostDirBindMount(gpuInstallDirHost, gpuInstallDirContainer); err != nil { |
| return errors.Wrap(err, "failed to create driver installation dir") |
| } |
| if err := updateContainerLdCache(); err != nil { |
| return errors.Wrap(err, "failed to configure cached driver installation") |
| } |
| if err := loadGPUDrivers(moduleParameters, test, kernelOpen, noVerify, kernelDriversPath); err != nil { |
| return errors.Wrap(err, "failed to configure cached driver installation") |
| } |
| |
| return nil |
| } |
| |
| // DownloadToInstallDir downloads data from the provided URL to the GPU |
| // installation directory. It returns the basename of the locally written file. |
| func DownloadToInstallDir(url, infoStr string) (string, error) { |
| outputPath := filepath.Join(gpuInstallDirContainer, strings.Split(path.Base(url), "?"+signedURLKey+"=")[0]) |
| if err := utils.DownloadContentFromURL(url, outputPath, infoStr); err != nil { |
| return "", fmt.Errorf("failed to download file with description %q from %q and install into %q: %v", infoStr, url, gpuInstallDirContainer, err) |
| } |
| return filepath.Base(outputPath), nil |
| |
| } |
| |
| // DownloadDriverInstallerV2 downloads GPU driver installer given driver version from COS build artifacts. |
| func DownloadDriverInstallerV2(ctx context.Context, downloader cos.ExtensionsDownloader, driverVersion string) (string, error) { |
| log.Infof("Downloading GPU driver installer version %s", driverVersion) |
| installerFilename := fmt.Sprintf(precompiledDriverTemplate, driverVersion) |
| err := downloader.DownloadExtensionArtifact(ctx, gpuInstallDirContainer, cos.GPUExtension, installerFilename) |
| if err != nil { |
| return "", errors.Wrap(err, "failed to download installer") |
| } |
| return installerFilename, nil |
| } |
| |
| // ConfigureDriverInstallationDirs configures GPU driver installation directories by creating mounts. |
| func ConfigureDriverInstallationDirs(gpuInstallDirHost, kernelRelease, arch string) (func(), error) { |
| log.Info("Configuring driver installation directories") |
| |
| if err := createHostDirBindMount(gpuInstallDirHost, gpuInstallDirContainer); err != nil { |
| return nil, errors.Wrap(err, "failed to create dirver installation dir") |
| } |
| |
| if err := createOverlayFS( |
| "/usr/bin", gpuInstallDirContainer+"/bin", gpuInstallDirContainer+"/bin-workdir"); err != nil { |
| return nil, errors.Wrap(err, "failed to create bin overlay") |
| } |
| |
| ldLibraryPath := "/usr/lib/" + arch + "-linux-gnu" |
| if err := createOverlayFS( |
| ldLibraryPath, gpuInstallDirContainer+"/lib64", gpuInstallDirContainer+"/lib64-workdir"); err != nil { |
| return nil, errors.Wrap(err, "failed to create lib64 overlay") |
| } |
| modulePath := filepath.Join("/lib/modules", kernelRelease, "video") |
| if err := createOverlayFS( |
| modulePath, gpuInstallDirContainer+"/drivers", gpuInstallDirContainer+"/drivers-workdir"); err != nil { |
| return nil, errors.Wrap(err, "failed to create drivers overlay") |
| } |
| |
| if err := updateContainerLdCache(); err != nil { |
| return nil, errors.Wrap(err, "failed to update container ld cache") |
| } |
| return cleanupMounts( |
| "/usr/bin", |
| ldLibraryPath, |
| modulePath, |
| gpuInstallDirContainer), nil |
| } |
| |
| func cleanupMounts(paths ...string) func() { |
| return func() { |
| log.Info("Start to clean up mounts...") |
| for _, path := range paths { |
| if err := syscall.Unmount(path, 0); err != nil { |
| log.Errorf("Failed to unmount %s: %v", path, err) |
| } |
| } |
| log.Info("Cleanup finished!") |
| } |
| } |
| |
| func extractPrecompiled(nvidiaDir string) error { |
| log.Info("Extracting precompiled artifacts...") |
| precompiledDir := filepath.Join(nvidiaDir, "kernel", "precompiled") |
| files, err := os.ReadDir(precompiledDir) |
| if err != nil { |
| return fmt.Errorf("failed to read %q: %v", precompiledDir, err) |
| } |
| var precompiledArchive string |
| if len(files) == 0 { |
| return stderrors.New("failed to find precompiled artifacts in this nvidia installer") |
| } |
| if len(files) == 1 { |
| precompiledArchive = filepath.Join(precompiledDir, files[0].Name()) |
| } |
| if len(files) > 1 { |
| var fileNames []string |
| for _, f := range files { |
| fileNames = append(fileNames, f.Name()) |
| } |
| sort.Strings(fileNames) |
| log.Warningf("Found multiple precompiled archives in this nvidia installer: %q", strings.Join(fileNames, ",")) |
| log.Warningf("Using precompiled archive named %q", fileNames[len(fileNames)-1]) |
| precompiledArchive = filepath.Join(precompiledDir, fileNames[len(fileNames)-1]) |
| } |
| cmd := exec.Command(filepath.Join(nvidiaDir, "mkprecompiled"), "--unpack", precompiledArchive, "-o", precompiledDir) |
| if err := utils.RunCommandAndLogOutput(cmd, false); err != nil { |
| return fmt.Errorf("failed to unpack precompiled artifacts: %v", err) |
| } |
| log.Info("Done extracting precompiled artifacts") |
| return nil |
| } |
| |
| func linkDrivers(toolchainDir, nvidiaDir string) error { |
| log.Info("Linking drivers...") |
| var kernelInfo unix.Utsname |
| if err := unix.Uname(&kernelInfo); err != nil { |
| return fmt.Errorf("failed to find kernel release info using uname: %v", err) |
| } |
| kernelRelease := strings.Trim(string(kernelInfo.Release[:]), "\x00") |
| // COS 85+ kernels use lld as their linker |
| linker := filepath.Join(toolchainDir, "bin", "ld.lld") |
| headersDir := filepath.Join(toolchainDir, "usr", "src", "linux-headers-"+kernelRelease) |
| linkerScript := filepath.Join(headersDir, "scripts", "module.lds") |
| linkerScriptExists, err := utils.CheckFileExists(linkerScript) |
| if err != nil { |
| return fmt.Errorf("failed to check if %s exists, err: %v", linkerScript, err) |
| } |
| if !linkerScriptExists { |
| // Fallback to module-common.lds, which is used in the 5.4 kernel |
| linkerScript = filepath.Join(headersDir, "scripts", "module-common.lds") |
| } |
| |
| // For kernel 6.12, we also need to link .module-common.o (compiled from |
| // scripts/module-common.c), which will insert the appropriate vermagic |
| // string into the modules. |
| moduleCommonObj := filepath.Join(headersDir, ".module-common.o") |
| moduleCommonObjExists, err := utils.CheckFileExists(moduleCommonObj) |
| if err != nil { |
| return fmt.Errorf("failed to check if %s exists, err: %v", moduleCommonObj, err) |
| } |
| |
| nvidiaKernelDir := filepath.Join(nvidiaDir, "kernel") |
| // Link nvidia.ko |
| nvidiaObjs := []string{ |
| filepath.Join(nvidiaKernelDir, "precompiled", "nv-linux.o"), |
| filepath.Join(nvidiaKernelDir, "nvidia", "nv-kernel.o_binary"), |
| } |
| if moduleCommonObjExists { |
| nvidiaObjs = append(nvidiaObjs, moduleCommonObj) |
| } |
| args := append([]string{"-T", linkerScript, "-r", "-o", filepath.Join(nvidiaKernelDir, "precompiled", "nvidia.ko")}, nvidiaObjs...) |
| cmd := exec.Command(linker, args...) |
| log.Infof("Running link command: %v", cmd.Args) |
| if err := utils.RunCommandAndLogOutput(cmd, false); err != nil { |
| return fmt.Errorf("failed to link nvidia.ko: %v", err) |
| } |
| // Link nvidia-modeset.ko |
| modesetObjs := []string{ |
| filepath.Join(nvidiaKernelDir, "precompiled", "nv-modeset-linux.o"), |
| filepath.Join(nvidiaKernelDir, "nvidia-modeset", "nv-modeset-kernel.o_binary"), |
| } |
| if moduleCommonObjExists { |
| modesetObjs = append(modesetObjs, moduleCommonObj) |
| } |
| args = append([]string{"-T", linkerScript, "-r", "-o", filepath.Join(nvidiaKernelDir, "precompiled", "nvidia-modeset.ko")}, modesetObjs...) |
| cmd = exec.Command(linker, args...) |
| log.Infof("Running link command: %v", cmd.Args) |
| if err := utils.RunCommandAndLogOutput(cmd, false); err != nil { |
| return fmt.Errorf("failed to link nvidia-modeset.ko: %v", err) |
| } |
| // Move all modules to kernel dir (includes some pre-linked modules, in |
| // addition to the above linked ones) |
| if err := filepath.WalkDir(filepath.Join(nvidiaKernelDir, "precompiled"), func(path string, d fs.DirEntry, err error) error { |
| if err != nil { |
| return err |
| } |
| if d.IsDir() { |
| return nil |
| } |
| if filepath.Ext(path) == ".ko" { |
| newPath := filepath.Join(nvidiaKernelDir, filepath.Base(path)) |
| if err := unix.Rename(path, newPath); err != nil { |
| return fmt.Errorf("failed to move %q to %q: %v", path, newPath, err) |
| } |
| } |
| return nil |
| }); err != nil { |
| return fmt.Errorf("failed to copy kernel modules: %v", err) |
| } |
| log.Info("Done linking drivers") |
| return nil |
| } |
| |
| func linkDriversLegacy(toolchainDir, nvidiaDir string) error { |
| log.Info("Linking drivers using legacy method...") |
| // The legacy linking method needs to use "/usr/bin/ld" as the linker to |
| // maintain bit-for-bit compatibility with driver signatures. The legacy |
| // linking method also finds the linker by searching the PATH for "ld". If |
| // bin/ld is present in the toolchain, rename it temporarily so the legacy |
| // linking method doesn't use it. |
| ld := filepath.Join(toolchainDir, "bin", "ld") |
| if _, err := os.Lstat(ld); !os.IsNotExist(err) { |
| dst := filepath.Join(toolchainDir, "bin", "ld.orig") |
| if err := unix.Rename(ld, dst); err != nil { |
| return fmt.Errorf("failed to rename %q to %q: %v", ld, dst, err) |
| } |
| defer func() { |
| if err := unix.Rename(dst, ld); err != nil { |
| // At this point, this error is non-fatal. It will become fatal when |
| // something tries to use bin/ld in the toolchain. At time of writing, |
| // nothing uses bin/ld after this point. |
| log.Warningf("Could not restore %q", ld) |
| } |
| }() |
| } |
| cmd := exec.Command(filepath.Join(nvidiaDir, "nvidia-installer"), |
| "--utility-prefix="+gpuInstallDirContainer, |
| "--opengl-prefix="+gpuInstallDirContainer, |
| "--x-prefix="+gpuInstallDirContainer, |
| "--install-libglvnd", |
| "--no-install-compat32-libs", |
| "--log-file-name="+filepath.Join(gpuInstallDirContainer, "nvidia-installer.log"), |
| "--silent", |
| "--accept-license", |
| ) |
| log.Infof("Installer arguments:\n%v", cmd.Args) |
| err := utils.RunCommandAndLogOutput(cmd, false) |
| log.Info("Done linking drivers") |
| if err != nil { |
| return fmt.Errorf("%w: %v", errInstallerFailed, err) |
| } |
| return nil |
| } |
| |
| func installUserLibs(nvidiaDir, arch string) error { |
| log.Info("Installing userspace libraries...") |
| cmdArgs := []string{ |
| "--utility-prefix=" + gpuInstallDirContainer, |
| "--opengl-prefix=" + gpuInstallDirContainer, |
| "--x-prefix=" + gpuInstallDirContainer, |
| "--install-libglvnd", |
| "--log-file-name=" + filepath.Join(gpuInstallDirContainer, "nvidia-installer.log"), |
| "--silent", |
| "--accept-license", |
| "--no-kernel-module", |
| } |
| // Arm64 installer does not recognize the 32 bit compatibility libraries command line option. |
| if arch == "x86_64" { |
| cmdArgs = append(cmdArgs, "--no-install-compat32-libs") |
| } |
| cmd := exec.Command(filepath.Join(nvidiaDir, "nvidia-installer"), cmdArgs...) |
| log.Infof("Installer arguments:\n%v", cmd.Args) |
| if err := utils.RunCommandAndLogOutput(cmd, false); err != nil { |
| return fmt.Errorf("failed to run GPU driver installer: %v", err) |
| } |
| log.Info("Done installing userspace libraries") |
| return nil |
| } |
| |
| // RunDriverInstaller runs GPU driver installer. Only works if the provided |
| // installer includes precompiled drivers. |
| func RunDriverInstaller(toolchainDir, installerFilename, driverVersion, arch, kernelDriversPath string, needSigned, test, legacyLink, noVerify bool, moduleParameters modules.ModuleParameters) error { |
| log.Info("Running GPU driver installer") |
| |
| // Extract files to a fixed path first to make sure md5sum of generated gpu drivers are consistent. |
| extractDir := "/tmp/extract" |
| if err := os.RemoveAll(extractDir); err != nil { |
| return fmt.Errorf("failed to clean %q: %v", extractDir, err) |
| } |
| cmd := exec.Command("sh", installerFilename, "-x", "--target", extractDir) |
| cmd.Dir = gpuInstallDirContainer |
| if err := cmd.Run(); err != nil { |
| return errors.Wrap(err, "failed to extract installer files") |
| } |
| |
| // Extract precompiled artifacts. |
| if err := extractPrecompiled(extractDir); err != nil { |
| return fmt.Errorf("failed to extract precompiled artifacts: %v", err) |
| } |
| |
| // Link drivers. |
| var legacyInstallerFailed bool |
| if legacyLink { |
| if err := linkDriversLegacy(toolchainDir, extractDir); err != nil { |
| if stderrors.Is(err, errInstallerFailed) { |
| // This case is expected when module signature enforcement is enabled. |
| // Since the installer terminated early, we need to re-run it after |
| // signing modules. |
| // |
| // If we don't sign modules (i.e. needSigned is false), then we'll see |
| // an error when we load the modules, and that will be fatal. |
| legacyInstallerFailed = true |
| } else { |
| return fmt.Errorf("failed to link drivers: %v", err) |
| } |
| } |
| } else { |
| if err := linkDrivers(toolchainDir, extractDir); err != nil { |
| return fmt.Errorf("failed to link drivers: %v", err) |
| } |
| } |
| |
| kernelFiles, err := ioutil.ReadDir(filepath.Join(extractDir, "kernel")) |
| if err != nil { |
| return errors.Wrapf(err, "failed to list files in directory %s", filepath.Join(extractDir, "kernel")) |
| } |
| if needSigned { |
| // sign GPU drivers. |
| for _, kernelFile := range kernelFiles { |
| if strings.HasSuffix(kernelFile.Name(), ".ko") { |
| module := kernelFile.Name() |
| signaturePath := signing.GetModuleSignature(module) |
| modulePath := filepath.Join(extractDir, "kernel", module) |
| signedModulePath := filepath.Join(gpuInstallDirContainer, "drivers", module) |
| if err := modules.AppendSignature(signedModulePath, modulePath, signaturePath); err != nil { |
| return errors.Wrapf(err, "failed to sign kernel module %s", module) |
| } |
| } |
| } |
| } else if !legacyLink { |
| // Copy drivers to the desired end directory. This is done as part of |
| // `modules.AppendSignature` in the above signing block, but we need to do |
| // it for unsigned modules as well. Legacy linking already does this copy |
| // in the unsigned case (we expect that legacy linking also does this when |
| // the installer fails); we skip this block in the legacy link case to avoid |
| // redundancy. |
| for _, kernelFile := range kernelFiles { |
| if strings.HasSuffix(kernelFile.Name(), ".ko") { |
| module := kernelFile.Name() |
| src := filepath.Join(extractDir, "kernel", module) |
| dst := filepath.Join(gpuInstallDirContainer, "drivers", module) |
| if err := utils.CopyFile(src, dst); err != nil { |
| return fmt.Errorf("failed to copy kernel module %q: %v", module, err) |
| } |
| } |
| } |
| } |
| |
| // Load GPU drivers. |
| // The legacy linking method does this when the installer doesn't fail (i.e. |
| // module signature verification isn't enforced). |
| if (legacyLink && legacyInstallerFailed) || !legacyLink { |
| if err := loadGPUDrivers(moduleParameters, test, false, noVerify, kernelDriversPath); err != nil { |
| return fmt.Errorf("%w: %v", ErrDriverLoad, err) |
| } |
| } |
| |
| // Install libs. |
| // The legacy linking method does this when the installer doesn't fail (i.e. |
| // module signature verification isn't enforced). |
| if (legacyLink && legacyInstallerFailed) || !legacyLink { |
| if err := installUserLibs(extractDir, arch); err != nil { |
| return fmt.Errorf("failed to install userspace libraries: %v", err) |
| } |
| |
| // Driver version may be empty if custom nvidia-installer-url is used |
| // read from manifest file |
| if driverVersion == "" { |
| |
| driverVersion = findDriverVersionManifestFile(filepath.Join(extractDir, ".manifest")) |
| log.Info("found driver version from nvidia-installer pkg ", driverVersion) |
| } |
| |
| if err := prepareGSPFirmware(extractDir, driverVersion, needSigned); err != nil { |
| return fmt.Errorf("failed to prepare GSP firmware, err: %v", err) |
| } |
| } |
| |
| return nil |
| } |
| |
| // DownloadGPUDriverVersionsProto will download gpuDriverProtoBin from GCS bucket to /var/lib/nvidia if it does not exist. |
| func DownloadGPUDriverVersionsProto(ctx context.Context, downloader cos.ArtifactsDownloader, gpuInstallDir string) ([]byte, error) { |
| destFullPath := filepath.Join(gpuInstallDir, gpuDriverProtoBin) |
| _, err := os.Stat(destFullPath) |
| if err == nil { |
| log.Infof("Found GPU driver version proto: %s, skip downloading", destFullPath) |
| } else if os.IsNotExist(err) { |
| if err := os.MkdirAll(gpuInstallDir, defaultFilePermission); err != nil { |
| return nil, fmt.Errorf("failed to create %s with error: %v", gpuInstallDir, err) |
| } |
| if err := downloader.DownloadArtifact(ctx, gpuInstallDir, gpuDriverProtoBin); err != nil { |
| return nil, fmt.Errorf("failed to download %s from GCS bucket with error: %v", gpuDriverProtoBin, err) |
| } |
| log.Infof("Succesfully download %s from GCS bucket.", gpuDriverProtoBin) |
| } else { |
| return nil, fmt.Errorf("error checking %s : %v", destFullPath, err) |
| } |
| return os.ReadFile(destFullPath) |
| } |
| |
| // GeGGPUDriverVersion gets the supplied GPU driver version. |
| // Supports "default", "latest", "R470", "R525" aliases |
| func GetGPUDriverVersion(ctx context.Context, downloader cos.ArtifactsDownloader, alias string) (string, error) { |
| log.Infof("Getting the %s GPU driver version", alias) |
| content, err := downloader.GetArtifact(ctx, fmt.Sprintf(templateGPUDriverFile, alias)) |
| if err != nil { |
| return "", errors.Wrapf(err, "failed to get %s GPU driver version", alias) |
| } |
| return strings.Trim(string(content), "\n "), nil |
| } |
| |
| // DownloadGPUDriverVersionArtifacts fetch all the gpu_xx_version files and the key is the file name and the value is the content. |
| // E.g. |
| // gpu_default_version -> 535.129.03, |
| // gpu_R470_version -> 470.223.02, |
| // gpu_R535_version -> 535.129.03, |
| func DownloadGPUDriverVersionArtifacts(ctx context.Context, downloader cos.ArtifactsDownloader) (map[string]string, error) { |
| gpuArtifacts, err := downloader.ListArtifacts(ctx, MajorGPUDriverArtifactPrefix) |
| if err != nil { |
| return nil, fmt.Errorf("error happens when listing artifacts with prefix: %s: %w", MajorGPUDriverArtifactPrefix, err) |
| } |
| var GPUDriverMajorVersionArtifactsContentMap = map[string]string{} |
| for _, gpuArtifact := range gpuArtifacts { |
| if strings.HasSuffix(gpuArtifact, MajorGPUDriverArtifactSuffix) { |
| gpuArtifactName := path.Base(gpuArtifact) |
| content, err := downloader.GetArtifact(ctx, gpuArtifactName) |
| if err != nil { |
| return nil, errors.Wrapf(err, "failed to download artifact: %s", gpuArtifact) |
| } |
| GPUDriverMajorVersionArtifactsContentMap[gpuArtifactName] = strings.Trim(string(content), "\n ") |
| } |
| } |
| return GPUDriverMajorVersionArtifactsContentMap, nil |
| } |
| |
| func updateContainerLdCache() error { |
| log.V(2).Info("Updating container's ld cache") |
| |
| f, err := os.Create("/etc/ld.so.conf.d/nvidia.conf") |
| if err != nil { |
| f.Close() |
| return errors.Wrap(err, "failed to update ld cache") |
| } |
| f.WriteString(gpuInstallDirContainer + "/lib64") |
| f.Close() |
| |
| err = exec.Command("ldconfig").Run() |
| if err != nil { |
| return errors.Wrap(err, "failed to update ld cache") |
| } |
| return nil |
| } |
| |
| func createHostDirBindMount(hostDir, bindMountPath string) error { |
| if err := os.MkdirAll(hostDir, defaultFilePermission); err != nil { |
| return errors.Wrapf(err, "failed to create dir %s", hostDir) |
| } |
| if err := os.MkdirAll(bindMountPath, defaultFilePermission); err != nil { |
| return errors.Wrapf(err, "failed to create dir %s", bindMountPath) |
| } |
| if err := syscall.Mount(hostDir, bindMountPath, "", syscall.MS_BIND, ""); err != nil { |
| return errors.Wrapf(err, "failed to create bind mount %s", bindMountPath) |
| } |
| // Remount to clear noexec flag. |
| if err := syscall.Mount("", bindMountPath, "", |
| syscall.MS_REMOUNT|syscall.MS_NOSUID|syscall.MS_NODEV|syscall.MS_RELATIME, ""); err != nil { |
| return errors.Wrapf(err, "failed to remount %s", bindMountPath) |
| } |
| return nil |
| } |
| |
| func createOverlayFS(lowerDir, upperDir, workDir string) error { |
| if err := os.MkdirAll(lowerDir, defaultFilePermission); err != nil { |
| return errors.Wrapf(err, "failed to create dir %s", lowerDir) |
| } |
| if err := os.MkdirAll(upperDir, defaultFilePermission); err != nil { |
| return errors.Wrapf(err, "failed to create dir %s", upperDir) |
| } |
| if err := os.MkdirAll(workDir, defaultFilePermission); err != nil { |
| return errors.Wrapf(err, "failed to create dir %s", workDir) |
| } |
| |
| if err := syscall.Mount("none", lowerDir, "overlay", 0, |
| fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerDir, upperDir, workDir)); err != nil { |
| return errors.Wrapf(err, "failed to create overlayfs (lowerdir=%s, upperdir=%s)", lowerDir, upperDir) |
| } |
| return nil |
| } |
| |
| func loadGPUDrivers(moduleParams modules.ModuleParameters, test, kernelOpen, noVerify bool, kernelDriversPath string) error { |
| if noVerify { |
| log.Infof("Flag --no-verify is set, skip kernel module loading.") |
| return nil |
| } |
| kernelModulePath := filepath.Join(gpuInstallDirContainer, "drivers") |
| drmPanel := &modules.Module{ |
| Name: "drm_panel_orientation_quirks", |
| Path: filepath.Join(kernelDriversPath, "gpu/drm/drm_panel_orientation_quirks.ko"), |
| SkipNotFound: true, |
| } |
| i2c := &modules.Module{ |
| Name: "i2c_core", |
| Path: filepath.Join(kernelDriversPath, "i2c/i2c-core.ko"), |
| Deps: []*modules.Module{drmPanel}, |
| SkipNotFound: true, |
| } |
| drm := &modules.Module{ |
| Name: "drm", |
| Path: filepath.Join(kernelDriversPath, "gpu/drm/drm.ko"), |
| Deps: []*modules.Module{i2c, drmPanel}, |
| SkipNotFound: true, |
| } |
| drmKms := &modules.Module{ |
| Name: "drm_kms_helper", |
| Path: filepath.Join(kernelDriversPath, "gpu/drm/drm_kms_helper.ko"), |
| Deps: []*modules.Module{drm, i2c}, |
| SkipNotFound: true, |
| } |
| nvidia := &modules.Module{ |
| Name: "nvidia", |
| Path: filepath.Join(kernelModulePath, "nvidia.ko"), |
| Deps: []*modules.Module{drm, i2c}, |
| } |
| nvidiaUvm := &modules.Module{ |
| Name: "nvidia_uvm", |
| Path: filepath.Join(kernelModulePath, "nvidia-uvm.ko"), |
| Deps: []*modules.Module{nvidia}, |
| } |
| nvidiaModeset := &modules.Module{ |
| Name: "nvidia_modeset", |
| Path: filepath.Join(kernelModulePath, "nvidia-modeset.ko"), |
| Deps: []*modules.Module{nvidia}, |
| } |
| nvidiaDrm := &modules.Module{ |
| Name: "nvidia_drm", |
| Path: filepath.Join(kernelModulePath, "nvidia-drm.ko"), |
| Deps: []*modules.Module{nvidiaModeset, drmKms}, |
| } |
| |
| // Need to load modules in order due to module dependency. |
| gpuModules := []*modules.Module{nvidia, nvidiaUvm, nvidiaModeset, nvidiaDrm} |
| for _, module := range gpuModules { |
| if err := modules.LoadModule(module, moduleParams); err != nil { |
| return errors.Wrapf(err, "failed to load module %s", module.Path) |
| } |
| } |
| return nil |
| } |
| |
| func GetLoadedNVIDIAKernelModuleVersion(versionFilePath string) string { |
| log.V(2).Infof("Attempting to read version from: %s", versionFilePath) |
| content, err := os.ReadFile(versionFilePath) |
| if err != nil { |
| log.V(2).Infof("Failed to read version file: %v", err) |
| return "" |
| } |
| contentStr := string(content) |
| kernelModuleVersionPattern := regexp.MustCompile(`\d+\.\d+\.\d+`) |
| kernelModuleVersion := kernelModuleVersionPattern.FindString(contentStr) |
| log.V(2).Infof("NVIDIA kernel module version: %s", kernelModuleVersion) |
| return kernelModuleVersion |
| } |
| |
| func prepareGSPFirmware(extractDir, driverVersion string, needSigned bool) error { |
| for _, gspFileName := range gspFileNames { |
| signaturePath := signing.GetModuleSignature(gspFileName) |
| installerGSPPath := filepath.Join(extractDir, "firmware", gspFileName) |
| containerGSPPath := filepath.Join(gpuFirmwareDirContainer, driverVersion, gspFileName) |
| haveSignature, err := utils.CheckFileExists(signaturePath) |
| if err != nil { |
| return fmt.Errorf("failed to check if %s exists, err: %v", signaturePath, err) |
| } |
| haveFirmware, err := utils.CheckFileExists(installerGSPPath) |
| if err != nil { |
| return fmt.Errorf("failed to check if %s exists, err: %v", installerGSPPath, err) |
| } |
| switch { |
| case haveSignature && !haveFirmware: |
| return fmt.Errorf("firmware doesn't exist but its signature does.") |
| case !haveFirmware: |
| log.Infof("GSP firmware for %s doesn't exist. Skipping firmware preparation for %s.", gspFileName, gspFileName) |
| case !needSigned: |
| // No signature needed, copy firmware only. |
| if err := copyFirmware(installerGSPPath, containerGSPPath, driverVersion); err != nil { |
| return fmt.Errorf("failed to copy firmware, err: %v.", err) |
| } |
| case !haveSignature: |
| log.Infof("GSP firmware signature for %s doesn't exist. Skipping firmware preparation for %s.", gspFileName, gspFileName) |
| default: |
| // Both firmware and signature exist. |
| if err := copyFirmware(installerGSPPath, containerGSPPath, driverVersion); err != nil { |
| return fmt.Errorf("failed to copy firmware, err: %v.", err) |
| } |
| if err := setIMAXattr(signaturePath, containerGSPPath); err != nil { |
| return err |
| } |
| } |
| } |
| return nil |
| } |
| |
| func copyFirmware(installerGSPPath, containerGSPPath, gspFileName string) error { |
| if err := os.MkdirAll(filepath.Dir(containerGSPPath), defaultFilePermission); err != nil { |
| return fmt.Errorf("Falied to create firmware directory, err: %v", err) |
| } |
| if err := utils.CopyFile(installerGSPPath, containerGSPPath); err != nil { |
| return fmt.Errorf("Falied to copy %s, err: %v", gspFileName, err) |
| } |
| return nil |
| } |
| |
// setIMAXattr stores the raw contents of the file at signaturePath in the
// "security.ima" extended attribute of containerGSPPath.
func setIMAXattr(signaturePath, containerGSPPath string) error {
	sig, readErr := os.ReadFile(signaturePath)
	if readErr != nil {
		return fmt.Errorf("failed to read signature err: %v", readErr)
	}

	setErr := syscall.Setxattr(containerGSPPath, "security.ima", sig, 0)
	if setErr == nil {
		return nil
	}
	return fmt.Errorf("failed to set xattr for security.ima, err: %v", setErr)
}
| |
// findDriverVersionManifestFile returns the second line of the manifest file
// at manifestFilePath, with surrounding whitespace removed. It returns the
// empty string when the file cannot be read or has fewer than two lines.
func findDriverVersionManifestFile(manifestFilePath string) string {
	raw, err := os.ReadFile(manifestFilePath)
	if err != nil {
		return ""
	}
	// Only the first two lines matter; anything after them stays unsplit in
	// the third piece.
	head := strings.SplitN(string(raw), "\n", 3)
	if len(head) >= 2 {
		// driver version present in the second line of the file
		return strings.TrimSpace(head[1])
	}
	return ""
}
| |
| func RunDriverInstallerPrebuiltModules(ctx context.Context, downloader *cos.GCSDownloader, installerFilename, driverVersion, arch, kernelDriversPath string, noVerify bool, moduleParameters modules.ModuleParameters) error { |
| // fetch the prebuilt modules |
| if err := downloader.DownloadArtifact(ctx, gpuInstallDirContainer, fmt.Sprintf(prebuiltModuleTemplate, driverVersion)); err != nil { |
| return fmt.Errorf("failed to download prebuilt modules: %v", err) |
| } |
| |
| tarballPath := filepath.Join(gpuInstallDirContainer, fmt.Sprintf(prebuiltModuleTemplate, driverVersion)) |
| // extract the prebuilt modules and firmware to the installation dirs |
| if err := exec.Command("tar", "--overwrite", "--xattrs", "--xattrs-include=*", "-xf", tarballPath, "-C", gpuInstallDirContainer).Run(); err != nil { |
| return fmt.Errorf("failed to extract prebuilt modules: %v", err) |
| } |
| if err := os.Chmod(gpuInstallDirContainer, defaultFilePermission); err != nil { |
| return fmt.Errorf("failed to change permission of install dir: %v", err) |
| } |
| |
| // load the prebuilt kernel modules |
| if err := loadGPUDrivers(moduleParameters, false, true, noVerify, kernelDriversPath); err != nil { |
| return fmt.Errorf("%w: %v", ErrDriverLoad, err) |
| } |
| |
| // Extract files to a fixed path first to make sure md5sum of generated gpu drivers are consistent. |
| extractDir := "/tmp/extract" |
| if err := os.RemoveAll(extractDir); err != nil { |
| return fmt.Errorf("failed to clean %q: %v", extractDir, err) |
| } |
| cmd := exec.Command("sh", installerFilename, "-x", "--target", extractDir) |
| cmd.Dir = gpuInstallDirContainer |
| if err := cmd.Run(); err != nil { |
| return errors.Wrap(err, "failed to extract installer files") |
| } |
| if err := installUserLibs(extractDir, arch); err != nil { |
| return fmt.Errorf("failed to install userspace libraries: %v", err) |
| } |
| |
| return nil |
| } |
| |
| func PrebuiltModulesAvailable(ctx context.Context, downloader *cos.GCSDownloader, driverVersion string, kernelOpen bool) (bool, error) { |
| if !kernelOpen { |
| return false, nil |
| } |
| |
| prebuiltModulesArtifactPath := fmt.Sprintf(prebuiltModuleTemplate, driverVersion) |
| return downloader.ArtifactExists(ctx, prebuiltModulesArtifactPath) |
| } |
| |
| // DownloadGenericDriverInstaller downloads the generic GPU driver installer given driver version. |
| func DownloadGenericDriverInstaller(ctx context.Context, downloader *cos.GCSDownloader, driverVersion string) (string, error) { |
| log.Infof("Downloading GPU driver installer version %s", driverVersion) |
| return downloader.DownloadGenericNvidiaDriver(ctx, gpuInstallDirContainer, driverVersion) |
| } |
| |
| // DownloadImexDriver downloads the IMEX driver using the GCSDownloader. |
| func DownloadImexDriver(ctx context.Context, downloader *cos.GCSDownloader, driverVersion string) (string, error) { |
| log.Infof("Downloading IMEX driver version: %s", driverVersion) |
| return downloader.DownloadImexDriver(ctx, gpuInstallDirContainer, driverVersion) |
| } |
| |
| // InstallImexDriverDefault installs IMEX driver binaries to the default container location (const: gpuInstallDirContainer) |
| // This is a wrapper around InstallImexDriver for production usage, using the default constant gpuInstallDirContainer as the target install path. |
| func InstallImexDriverDefault(imexArtifactPath string, driverVersion string, installDirHost string) error { |
| return InstallImexDriver(imexArtifactPath, driverVersion, installDirHost, gpuInstallDirContainer) |
| } |
| |
| // InstallImexDriver installs IMEX driver binaries from a given tarball into the specified installDirContainer. |
| // It performs the following steps: |
| // 1. Extracts the IMEX driver tarball (imexArtifactPath) to a temporary working directory. |
| // 2. Installs the IMEX binaries ("nvidia-imex", "nvidia-imex-ctl") into <installDirContainer>/bin with default permissions(0755). |
| // 3. Copies the original config.cfg from the <temp-working-dir>/etc/nvidia-imex/config.cfg and installs it into <installDirContainer>/imex-config/config.cfg |
| // 4. Patches the copied config.cfg so that the default IMEX_NODE_CONFIG_FILE points to the default nodes_config.cfg to the `<installDirHost>/imex-config/config.cfg` with 0644 permission. |
| // 5. Create an empty nodes_config.cfg file at <installDirContainer>/imex-config/nodes_config.cfg |
| // This function is testable and allows overriding the install directory path. |
| func InstallImexDriver(imexArtifactPath string, driverVersion string, installDirHost string, installDirContainer string) error { |
| log.Infof("Starting to install IMEX driver components.") |
| imexScratchDir, err := os.MkdirTemp("/tmp", "imex-") |
| if err != nil { |
| return fmt.Errorf("failed to create temporary directory: %v", err) |
| } |
| defer os.RemoveAll(imexScratchDir) |
| //Extract the IMEX tarball |
| log.V(2).Infof("Extracting IMEX tarball from: %s to %s.", imexArtifactPath, imexScratchDir) |
| extractCmd := exec.Command("tar", "-xf", imexArtifactPath, "-C", imexScratchDir) |
| if output, err := extractCmd.CombinedOutput(); err != nil { |
| return fmt.Errorf("failed to extract IMEX tarball: %v, output: %s", err, output) |
| } |
| archiveRoot := filepath.Join(imexScratchDir, fmt.Sprintf("nvidia-imex-linux-sbsa-%s-archive", driverVersion)) |
| |
| // Copy IMEX binaries |
| imexBinaries := []string{"nvidia-imex", "nvidia-imex-ctl"} |
| for _, binary := range imexBinaries { |
| srcBinPath := filepath.Join(archiveRoot, "usr/bin", binary) |
| dstBinPath := filepath.Join(installDirContainer, "bin", binary) |
| log.V(2).Infof("Copying IMEX binary from %s to %s", srcBinPath, dstBinPath) |
| if err := cosfs.CopyFile(srcBinPath, dstBinPath, defaultFilePermission); err != nil { |
| return errors.Wrapf(err, "failed to copy IMEX binary %s", binary) |
| } |
| } |
| |
| // Create the imex-config directory |
| imexConfigDir := filepath.Join(installDirContainer, "imex-config") |
| imexConfigHostDir := filepath.Join(installDirHost, "imex-config") |
| if err := os.MkdirAll(imexConfigDir, defaultFilePermission); err != nil { |
| return fmt.Errorf("failed to create imex-config directory: %v", err) |
| } |
| |
| // Copy config.cfg |
| srcConfigPath := filepath.Join(archiveRoot, "etc/nvidia-imex/config.cfg") |
| dstConfigPath := filepath.Join(imexConfigDir, "config.cfg") |
| log.V(2).Infof("Copying IMEX config.cfg from %s to %s", srcConfigPath, dstConfigPath) |
| if err := cosfs.CopyFile(srcConfigPath, dstConfigPath, imexConfigCfgDefaultFilePermission); err != nil { |
| return fmt.Errorf("failed to copy IMEX config.cfg: %v", err) |
| } |
| |
| // Patch the config.cfg |
| configBytes, err := os.ReadFile(dstConfigPath) |
| if err != nil { |
| return fmt.Errorf("failed to read %s: %v", dstConfigPath, err) |
| } |
| updatedConfig := strings.ReplaceAll( |
| string(configBytes), |
| "IMEX_NODE_CONFIG_FILE=/etc/nvidia-imex/nodes_config.cfg", |
| fmt.Sprintf("IMEX_NODE_CONFIG_FILE=%s", filepath.Join(imexConfigHostDir, "nodes_config.cfg")), |
| ) |
| log.V(2).Infof("Patching the nodes_config.cfg location specified in config.cfg from IMEX_NODE_CONFIG_FILE=/etc/nvidia-imex/nodes_config.cfg to IMEX_NODE_CONFIG_FILE=%s", |
| filepath.Join(imexConfigHostDir, "nodes_config.cfg")) |
| if err := os.WriteFile(dstConfigPath, []byte(updatedConfig), imexConfigCfgDefaultFilePermission); err != nil { |
| return fmt.Errorf("failed to patch %s: %v", dstConfigPath, err) |
| } |
| |
| // Create empty nodes_config.cfg |
| nodesConfigPath := filepath.Join(imexConfigDir, "nodes_config.cfg") |
| log.V(2).Infof("Creating empty nodes_config.cfg") |
| if err := os.WriteFile(nodesConfigPath, nil, defaultFilePermission); err != nil { |
| return fmt.Errorf("failed to create nodes_config.cfg: %v", err) |
| } |
| |
| log.Infof("Successfully installed IMEX driver components.") |
| return nil |
| |
| } |
| |
| // isDeviceRegistered checks if a device is registered in /proc/devices |
| // and returns true and its major number if it is. |
| func isDeviceRegistered(deviceName string) (bool, int) { |
| content, err := ioutil.ReadFile(procDevicesPath) |
| if err != nil { |
| log.Errorf("Failed to read %s: %v", procDevicesPath, err) |
| return false, 0 |
| } |
| |
| lines := strings.Split(string(content), "\n") |
| for _, line := range lines { |
| fields := strings.Fields(line) |
| if len(fields) == 2 && fields[1] == deviceName { |
| major, err := strconv.Atoi(fields[0]) |
| if err != nil { |
| log.Errorf("Failed to parse major number for %s: %v", deviceName, err) |
| return false, 0 |
| } |
| return true, major |
| } |
| } |
| return false, 0 |
| } |
| |
// mergeModuleParams takes a list of user-provided parameters (as "key=val" strings)
// and a map of default parameters. It returns a final slice of parameters, ensuring
// that any key provided by the user is not overridden by a default.
//
// The result contains every entry of userParams, in the given order, followed
// by "key=value" for each default whose key the user did not set. Defaults are
// appended in ascending key order so the output is deterministic regardless of
// map iteration order. A user entry without "=" is passed through unchanged
// and does not suppress any default. The returned slice is never nil.
func mergeModuleParams(userParams []string, defaults map[string]string) []string {
	merged := make([]string, 0, len(userParams)+len(defaults))
	userSetKeys := make(map[string]struct{}, len(userParams))

	// 1. Add all user params first and record which keys they set.
	for _, userParam := range userParams {
		merged = append(merged, userParam)
		if key, _, found := strings.Cut(userParam, "="); found {
			userSetKeys[key] = struct{}{}
		}
	}

	// 2. Collect the default keys the user did not set, then sort them: ranging
	// over a map directly would yield a different order on every run.
	pending := make([]string, 0, len(defaults))
	for key := range defaults {
		if _, set := userSetKeys[key]; !set {
			pending = append(pending, key)
		}
	}
	sort.Strings(pending)

	for _, key := range pending {
		merged = append(merged, key+"="+defaults[key])
	}
	return merged
}
| |
| // InstallGDRCopy loads the GDRCopy kernel module and creates its device node. |
| // This should be run after the main NVIDIA kernel modules are loaded. |
| // It follows https://github.com/NVIDIA/gdrcopy/blob/master/insmod.sh. |
| func InstallGDRCopy(noVerify bool, moduleParams modules.ModuleParameters) error { |
| if noVerify { |
| log.Info("Flag --no-verify is set, skipping GDRCopy installation.") |
| return nil |
| } |
| |
| kernelModulePath := filepath.Join(gpuInstallDirContainer, "drivers") |
| |
| // 1. Define the gdrdrv module. |
| gdrModule := &modules.Module{ |
| Name: gdrdrvModuleName, |
| Path: filepath.Join(kernelModulePath, "gdrdrv.ko"), |
| } |
| |
| // Set default module parameters if the user did not provide them. |
| // Flags are defined here: https://github.com/NVIDIA/gdrcopy/blob/master/insmod.sh#L28. |
| defaults := map[string]string{ |
| "dbg_enabled": "0", |
| "info_enabled": "0", |
| "use_persistent_mapping": "1", |
| } |
| |
| // Call our tested helper function to get the final parameter list. |
| userGDRParams := moduleParams[gdrdrvModuleName] |
| finalGDRParams := mergeModuleParams(userGDRParams, defaults) |
| |
| // Assign the merged list back to the global map to be passed to LoadModule. |
| moduleParams[gdrdrvModuleName] = finalGDRParams |
| log.V(1).Infof("Applying final parameters for %s: %v", gdrdrvModuleName, finalGDRParams) |
| |
| // 2. Load the module. |
| log.V(1).Info("Loading GDRCopy kernel module with dependencies.") |
| if err := modules.LoadModule(gdrModule, moduleParams); err != nil { |
| return errors.Wrap(err, "failed to load gdrdrv kernel module") |
| } |
| |
| // 3. Create the device node |
| isLoaded, major := isDeviceRegistered(gdrdrvModuleName) |
| if !isLoaded { |
| return stderrors.New("gdrdrv module loaded but device not found in /proc/devices") |
| } |
| log.Infof("GDRCopy driver major is %d", major) |
| |
| if _, err := os.Stat(gdrdrvDevicePath); err == nil { |
| log.Infof("Removing old inode %s", gdrdrvDevicePath) |
| if err := os.Remove(gdrdrvDevicePath); err != nil { |
| return errors.Wrapf(err, "failed to remove existing device node %s", gdrdrvDevicePath) |
| } |
| } |
| |
| log.Infof("Creating device node %s", gdrdrvDevicePath) |
| dev := unix.Mkdev(uint32(major), 0) |
| if err := unix.Mknod(gdrdrvDevicePath, unix.S_IFCHR|0666, int(dev)); err != nil { |
| return errors.Wrapf(err, "failed to create device node for %s", gdrdrvDevicePath) |
| } |
| |
| return nil |
| } |