blob: 317d93230b8be6397f31feba24d9fc2827bdec1b [file] [log] [blame]
// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"io/ioutil"
"log"
"os"
"path/filepath"
"strings"
"text/template"
"cos.googlesource.com/cos/tools.git/src/pkg/config"
"cos.googlesource.com/cos/tools.git/src/pkg/fs"
"cos.googlesource.com/cos/tools.git/src/pkg/provisioner"
"cos.googlesource.com/cos/tools.git/src/pkg/utils"
"cloud.google.com/go/storage"
"github.com/google/subcommands"
"google.golang.org/api/iterator"
)
const (
gpuScript = "install_gpu.sh"
installerContainer = "gcr.io/cos-cloud/cos-gpu-installer:v20230117"
)
// TODO(b/121332360): Move most GPU functionality to cos-gpu-installer
var (
validGPUs = []string{"nvidia-tesla-k80", "nvidia-tesla-p100", "nvidia-tesla-v100", "nvidia-tesla-t4"}
)
// InstallGPU implements subcommands.Command for the "install-gpu" command.
// This command configures the current image build process to customize the result image
// with GPU drivers.
type InstallGPU struct {
NvidiaDriverVersion string
NvidiaDriverMd5sum string
NvidiaInstallDirHost string
gpuType string
getValidDrivers bool
gpuDataDir string
}
// Name implements subcommands.Command.Name.
func (*InstallGPU) Name() string {
return "install-gpu"
}
// Synopsis implements subcommands.Command.Synopsis.
func (*InstallGPU) Synopsis() string {
return "Configure the image build with GPU drivers."
}
// Usage implements subcommands.Command.Usage.
func (*InstallGPU) Usage() string {
return `install-gpu [flags]
`
}
// SetFlags implements subcommands.Command.SetFlags.
func (i *InstallGPU) SetFlags(f *flag.FlagSet) {
f.StringVar(&i.NvidiaDriverVersion, "version", "", "Driver version to install. Can also be the name of an nvidia installer present in the "+
"directory specified by '-deps-dir'; e.g., NVIDIA-Linux-x86_64-450.51.06.run.")
f.StringVar(&i.NvidiaDriverMd5sum, "md5sum", "", "Md5sum of the driver to install.")
f.StringVar(&i.NvidiaInstallDirHost, "install-dir", "/var/lib/nvidia",
"Location to install drivers on the image.")
f.StringVar(
&i.gpuType, "gpu-type", "nvidia-tesla-p100",
fmt.Sprintf("The type of GPU to verify drivers for. Must be one of: %v", validGPUs))
f.BoolVar(
&i.getValidDrivers, "get-valid-drivers", false,
"Print the list of supported GPU driver versions. If this flag is given, no other actions will be taken.")
f.StringVar(&i.gpuDataDir, "deps-dir", "", "If provided, the local directory to search for cos-gpu-installer data dependencies. "+
"The exact data dependencies that must be present in this directory depends on the version of cos-gpu-installer "+
"used by cos-customizer. Do not expect this flag to be stable; it exists for compatibility with pre-release COS images.")
}
func validDriverVersions(ctx context.Context, gcsClient *storage.Client) (map[string]bool, error) {
// We gather the set of valid drivers from the set of drivers provided by Nvidia in their GCS bucket.
// Nominally, paths we care about in this bucket look like 'tesla/<version>/<binaries>'. Version 390.46 has
// a deprecated path structure, and since it's supported by cos-gpu-installer, we special case that here.
validDrivers := map[string]bool{"390.46": true}
query := &storage.Query{Prefix: "tesla/"}
it := gcsClient.Bucket("nvidia-drivers-us-public").Objects(ctx, query)
for {
objAttrs, err := it.Next()
if err == iterator.Done {
break
}
if err != nil {
return nil, err
}
// Example object: tesla/396.26/NVIDIA-Linux-x86_64-396.26-diagnostic.run
if splitPath := strings.SplitN(objAttrs.Name, "/", 3); len(splitPath) > 1 {
validDrivers[splitPath[1]] = true
}
}
return validDrivers, nil
}
func (i *InstallGPU) validate(ctx context.Context, gcsClient *storage.Client, files *fs.Files, provConfig *provisioner.Config) error {
isValidGPU := false
for _, g := range validGPUs {
if i.gpuType == g {
isValidGPU = true
break
}
}
if !isValidGPU {
return fmt.Errorf("%q is an invalid GPU type. Must be one of: %v", i.gpuType, validGPUs)
}
if i.NvidiaDriverVersion == "" {
return fmt.Errorf("version must be set")
}
var gpuAlreadyConf bool
for _, s := range provConfig.Steps {
if s.Type == "InstallGPU" {
gpuAlreadyConf = true
break
}
}
if gpuAlreadyConf {
return fmt.Errorf("install-gpu can only be invoked once in an image build process. Only one driver version can be installed on the image")
}
if strings.HasSuffix(i.NvidiaDriverVersion, ".run") {
log.Printf("driver version is set to %q, which looks like an nvidia installer file", i.NvidiaDriverVersion)
if i.gpuDataDir == "" {
return errors.New(`"-deps-dir" must be set when the version is specified as an nvidia installer file`)
}
fileName := filepath.Join(i.gpuDataDir, i.NvidiaDriverVersion)
info, err := os.Stat(fileName)
if os.IsNotExist(err) {
return fmt.Errorf("nvidia installer file at %q does not exist", fileName)
}
if info.IsDir() {
return fmt.Errorf("nvidia installer file at %q is a directory", fileName)
}
} else {
validDrivers, err := validDriverVersions(ctx, gcsClient)
if err != nil {
return err
}
if !validDrivers[i.NvidiaDriverVersion] {
var drivers []string
for d := range validDrivers {
drivers = append(drivers, d)
}
return fmt.Errorf("driver version %s is not valid; valid driver versions are: %v", i.NvidiaDriverVersion, drivers)
}
}
return nil
}
func (i *InstallGPU) templateScript(scriptPath string) error {
setCOSDownloadGCS := ""
if i.gpuDataDir != "" {
setCOSDownloadGCS = "true"
}
data := struct {
NvidiaDriverVersion string
NvidiaDriverMd5sum string
NvidiaInstallDirHost string
SetCOSDownloadGCS string
}{
NvidiaDriverVersion: utils.QuoteForShell(i.NvidiaDriverVersion),
NvidiaDriverMd5sum: utils.QuoteForShell(i.NvidiaDriverMd5sum),
NvidiaInstallDirHost: utils.QuoteForShell(i.NvidiaInstallDirHost),
SetCOSDownloadGCS: utils.QuoteForShell(setCOSDownloadGCS),
}
tmpl, err := template.New(filepath.Base(scriptPath)).ParseFiles(scriptPath)
if err != nil {
return err
}
w, err := os.Create(scriptPath)
if err != nil {
return err
}
defer w.Close()
return tmpl.Execute(w, data)
}
func (i *InstallGPU) updateBuildConfig(configPath string) error {
buildConfig := &config.Build{}
configFile, err := os.OpenFile(configPath, os.O_RDWR, 0666)
if err != nil {
return err
}
defer configFile.Close()
if err := config.Load(configFile, buildConfig); err != nil {
return err
}
buildConfig.GPUType = i.gpuType
if i.gpuDataDir != "" {
files, err := ioutil.ReadDir(i.gpuDataDir)
if err != nil {
return fmt.Errorf("error reading dir %q: %v", i.gpuDataDir, err)
}
for _, f := range files {
if f.Mode().IsRegular() {
buildConfig.GCSFiles = append(buildConfig.GCSFiles, filepath.Join(i.gpuDataDir, f.Name()))
}
}
}
if _, err := configFile.Seek(0, 0); err != nil {
return err
}
return config.SaveConfigToFile(configFile, buildConfig)
}
func (i *InstallGPU) updateProvConfig(provConfig *provisioner.Config) error {
buf, err := json.Marshal(&provisioner.InstallGPUStep{
NvidiaDriverVersion: i.NvidiaDriverVersion,
NvidiaDriverMD5Sum: i.NvidiaDriverMd5sum,
NvidiaInstallDirHost: i.NvidiaInstallDirHost,
NvidiaInstallerContainer: installerContainer,
// GCSDepsPrefix will be converted into a gs:// address by the preloader
// package.
GCSDepsPrefix: i.gpuDataDir,
})
if err != nil {
return err
}
provConfig.Steps = append(provConfig.Steps, provisioner.StepConfig{
Type: "InstallGPU",
Args: json.RawMessage(buf),
})
return nil
}
// Execute implements subcommands.Command.Execute. It configures the current image build process to
// customize the result image with GPU drivers.
func (i *InstallGPU) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
if f.NArg() != 0 {
f.Usage()
return subcommands.ExitUsageError
}
if len(args) < 2 {
log.Panic("InstallGPU expects two arguments; *fs.Files and ServiceClients")
}
files, ok := args[0].(*fs.Files)
if !ok {
log.Panic("InstallGPU expects two arguments; *fs.Files and ServiceClients")
}
serviceClients, ok := args[1].(ServiceClients)
if !ok {
log.Panic("InstallGPU expects two arguments; *fs.Files and ServiceClients")
}
_, gcsClient, err := serviceClients(ctx, false)
if err != nil {
log.Println(err)
return subcommands.ExitFailure
}
defer gcsClient.Close()
if i.getValidDrivers {
validDrivers, err := validDriverVersions(ctx, gcsClient)
if err != nil {
log.Println(err)
return subcommands.ExitFailure
}
var drivers []string
for d := range validDrivers {
drivers = append(drivers, d)
}
log.Printf("Valid driver versions are: %v\n", drivers)
return subcommands.ExitSuccess
}
var provConfig provisioner.Config
if err := config.LoadFromFile(files.ProvConfig, &provConfig); err != nil {
log.Println(err)
return subcommands.ExitFailure
}
if err := i.validate(ctx, gcsClient, files, &provConfig); err != nil {
log.Println(err)
return subcommands.ExitFailure
}
if err := i.updateBuildConfig(files.BuildConfig); err != nil {
log.Println(err)
return subcommands.ExitFailure
}
if err := i.updateProvConfig(&provConfig); err != nil {
log.Println(err)
return subcommands.ExitFailure
}
if err := config.SaveConfigToPath(files.ProvConfig, &provConfig); err != nil {
log.Println(err)
return subcommands.ExitFailure
}
return subcommands.ExitSuccess
}