cos_customizer: Adding a scratch disk for the install-gpu step

This change modifies the preloader and provisioner to take advantage of a "scratch disk"
during the install-gpu step. The install-gpu step requires a toolchain of about 5 GB, which
was previously installed onto the boot disk, forcing customers to increase their boot disk size.
With this change, the GPU toolchain is downloaded to a "scratch disk" that is created only when the
install-gpu step is configured. The preloader creates the scratch disk, and the provisioner mounts
it and uses it as the new toolchain location. This reduces cos-gpu image sizes by around
4-5 GB, depending on the size of the toolchain.

BUG=b/240175057,b/172925856
TEST=presubmit

Change-Id: I556bdc25327274e13e2947ff7a5bfffde2c18c04
Reviewed-on: https://cos-review.googlesource.com/c/cos/tools/+/35847
Tested-by: Nobel Barakat <nobelbarakat@google.com>
Reviewed-by: Robert Kolchmeyer <rkolchmeyer@google.com>
Cloud-Build: GCB Service account <228075978874@cloudbuild.gserviceaccount.com>
diff --git a/.gitignore b/.gitignore
index ac51a05..26ffb1b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 bazel-*
+TAGS
diff --git a/src/data/build_image.wf.json b/src/data/build_image.wf.json
index e7647f2..4a3e08e 100644
--- a/src/data/build_image.wf.json
+++ b/src/data/build_image.wf.json
@@ -6,6 +6,7 @@
     "output_image_family": {"Value": "", "Description": "Family of output image."},
     "output_image_project": {"Required": true, "Description": "Project of output image."},
     "cidata_img": {"Required": true, "Description": "Path to CIDATA vfat image containing cloud-init user-data and the provisioner program. Must be in .tar.gz format."},
+    "scratch_img": {"Required": true, "Description": "Path to SCRATCH ext4 image. Must be in .tar.gz format."},
     "disk_type": {"Value": "pd-standard", "Description": "The disk type to use for preloading."},
     "disk_size_gb": {"Value": "10", "Description": "The disk size to use for preloading."},
     "host_maintenance": {"Value": "MIGRATE", "Description": "VM behavior when there is maintenance."},
@@ -15,6 +16,7 @@
   },
   "Sources": {
     "cloud-config": "/data/startup.yaml",
+    "scratch.tar.gz_": "${scratch_img}",
     "cidata.tar.gz_": "${cidata_img}"
   },
   "Steps": {
@@ -23,21 +25,32 @@
         {
           "Source": "${SOURCESPATH}/cidata.tar.gz_",
           "Destination": "${SOURCESPATH}/cidata.tar.gz"
+        },
+        {
+          "Source": "${SOURCESPATH}/scratch.tar.gz_",
+          "Destination": "${SOURCESPATH}/scratch.tar.gz"
         }
       ]
     },
-    "create-cidata": {
+    "create-images": {
       "CreateImages": [
         {
           "Name": "cidata",
           "RawDisk": {
             "Source": "${SOURCESPATH}/cidata.tar.gz"
           }
+        },
+        {
+          "Name": "scratch",
+          "RawDisk": {
+            "Source": "${SOURCESPATH}/scratch.tar.gz"
+          }
         }
       ]
     },
     "setup": {
       "CreateDisks": [
+	{{.ScratchDisks}}
         {
           "Name": "boot-disk",
           "SourceImage": "${source_image}",
@@ -55,7 +68,7 @@
       "CreateInstances": [
         {
           "Name": "preload-vm",
-          "Disks": [{"Source": "boot-disk"}, {"Source": "cidata-disk"}],
+          "Disks": [{"Source": "boot-disk"}, {{.ScratchDiskSource}} {"Source": "cidata-disk"}],
           "Labels": {"cos-customizer-cleanup":""},
           "MachineType": "${machine_type}",
           "guestAccelerators": {{.Accelerators}},
@@ -135,8 +148,8 @@
     }
   },
   "Dependencies": {
-    "create-cidata": ["copy-gcs"],
-    "setup": ["create-cidata"],
+    "create-images" : ["copy-gcs"],
+    "setup": ["create-images"],
     "run": ["setup"],
     "wait-preload-finished": ["run"],
     "wait-for-resize": ["run"],
diff --git a/src/data/startup.yaml b/src/data/startup.yaml
index 7e0ad8d..20b8958 100644
--- a/src/data/startup.yaml
+++ b/src/data/startup.yaml
@@ -25,12 +25,24 @@
     set -o errexit
     set -o pipefail
     set -o nounset
+    set -x
 
     status() {
       $@ 2>&1 | sed "s/^/BuildStatus: /"
       return "${PIPESTATUS[0]}"
     }
 
+    # Set up the scratch disk for use by tasks that need it.
+    # A directory is created for each task that might use the disk.
+    # E.g. there will be a /mnt/disks/scratch/gpu for the gpu step
+    # and potentially other directories for other steps.
+    setup_scratch_disk() {
+      status mkdir -p /mnt/disks/scratch
+      status resize2fs -f /dev/disk/by-label/SCRATCH
+      status mount /dev/disk/by-label/SCRATCH /mnt/disks/scratch
+      status mkdir -p /mnt/disks/scratch/gpu
+    }
+
     # To make sure that ARCH gets set properly, this function cannot run in a
     # subshell.
     set_arch() {
@@ -74,7 +86,7 @@
         # that key to appear (and shutdown anyway after 5 minutes).
         /mnt/disks/cidata/metadata_watcher_${ARCH} DaisyEnd
         umount /mnt/disks/cidata
-        rm -r /mnt/disks || :
+        umount /mnt/disks/scratch || :
         shutdown -h now
         while true; do sleep 1; done
       fi
@@ -83,6 +95,9 @@
     main() {
       status history -c
       set_arch
+      if [[ -e /dev/disk/by-label/SCRATCH ]]; then
+        setup_scratch_disk
+      fi
       status mkdir -p /mnt/disks/cidata
       status mount /dev/disk/by-label/CIDATA /mnt/disks/cidata
       if [[ ! -d /var/lib/.cos-customizer ]]; then
diff --git a/src/pkg/preloader/BUILD.bazel b/src/pkg/preloader/BUILD.bazel
index b6b088a..021204a 100644
--- a/src/pkg/preloader/BUILD.bazel
+++ b/src/pkg/preloader/BUILD.bazel
@@ -15,6 +15,16 @@
 load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
 
 genrule(
+    name = "scratch",
+    outs = ["scratch.img"],
+    cmd = "\
+fallocate -l 512M $@;\
+mkfs.ext4 $@;\
+tune2fs -c0 -i0 $@;\
+e2label $@ SCRATCH",
+)
+
+genrule(
     name = "cidata",
     srcs = [
         "//:src/data/startup.yaml",
@@ -47,6 +57,7 @@
     ],
     embedsrcs = [
         ":cidata",
+	":scratch"
     ],
     importpath = "cos.googlesource.com/cos/tools.git/src/pkg/preloader",
     visibility = ["//visibility:public"],
diff --git a/src/pkg/preloader/preload.go b/src/pkg/preloader/preload.go
index cae3ffe..8bf5f8b 100644
--- a/src/pkg/preloader/preload.go
+++ b/src/pkg/preloader/preload.go
@@ -43,6 +43,9 @@
 //go:embed cidata.img
 var ciDataImg []byte
 
+//go:embed scratch.img
+var scratchImg []byte
+
 // storeInGCS stores the given files in GCS using the given gcsManager.
 // Files to store are provided in a map where each key is a file on the local
 // file system and each value is the relative path in GCS at which to store the
@@ -68,6 +71,15 @@
 	return nil
 }
 
+func needScratchDisk(provConfig *provisioner.Config) bool {
+	for _, step := range provConfig.Steps {
+		if step.Type == "InstallGPU" {
+			return true
+		}
+	}
+	return false
+}
+
 func needDiskResize(provConfig *provisioner.Config, buildSpec *config.Build) bool {
 	// We need to resize the disk during provisioning if:
 	// 1. The requested disk size is larger than default, and
@@ -105,6 +117,24 @@
 		return "", err
 	}
 
+	// Template content for the scratch disk.
+	// This disk is used for certain tasks that require additional disk space.
+	// We use a scratch disk in order to avoid increasing the boot disk size.
+	// Steps that require temporary files for their work can use this scratch
+	// disk instead of writing to the boot disk directly.
+	var scratchDiskJson string
+	var scratchDiskSource string
+	if needScratchDisk(provConfig) {
+		scratchDiskJson =  `
+      {
+        "Name": "scratch-disk",
+        "SourceImage": "scratch",
+        "Type": "${disk_type}",
+        "SizeGb": "5"
+      },`
+		scratchDiskSource = `{"Source": "scratch-disk"},`
+	}
+
 	// template content for the step resize-disk.
 	// If the oem-size is set, or need to reclaim sda3 (with disk-size-gb set),
 	// create the disk with the default size, and then resize the disk.
@@ -142,17 +172,21 @@
 		return "", err
 	}
 	if err := tmpl.Execute(w, struct {
-		Labels       string
-		Accelerators string
-		Licenses     string
-		ResizeDisks  string
-		WaitResize   string
+		Labels            string
+		Accelerators      string
+		Licenses          string
+		ResizeDisks       string
+		WaitResize        string
+		ScratchDisks      string
+		ScratchDiskSource string
 	}{
 		string(labelsJSON),
 		string(acceleratorsJSON),
 		string(licensesJSON),
 		resizeDiskJSON,
 		waitResizeJSON,
+		scratchDiskJson,
+		scratchDiskSource,
 	}); err != nil {
 		w.Close()
 		os.Remove(w.Name())
@@ -165,12 +199,16 @@
 	return w.Name(), nil
 }
 
-func writeCIDataImage(files *fs.Files) (path string, err error) {
-	img, err := ioutil.TempFile(fs.ScratchDir, "cidata-")
+func mcopyFiles(path string, files *fs.Files) (err error) {
+	return utils.RunCommand([]string{"mcopy", "-i", path, files.ProvConfig, "::/config.json"}, "", nil)
+}
+
+func writeImage(imgData *[]byte) (path string, err error) {
+	img, err := ioutil.TempFile(fs.ScratchDir, "img-")
 	if err != nil {
 		return "", err
 	}
-	_, writeErr := img.Write(ciDataImg)
+	_, writeErr := img.Write(*imgData)
 	closeErr := img.Close()
 	if writeErr != nil {
 		return "", writeErr
@@ -178,10 +216,11 @@
 	if closeErr != nil {
 		return "", closeErr
 	}
-	if err := utils.RunCommand([]string{"mcopy", "-i", img.Name(), files.ProvConfig, "::/config.json"}, "", nil); err != nil {
-		return "", err
-	}
-	out, err := ioutil.TempFile(fs.ScratchDir, "cidata-tar-")
+	return img.Name(), err
+}
+
+func tarImage(imageName string) (path string, err error) {
+	out, err := ioutil.TempFile(fs.ScratchDir, "img-tar-")
 	if err != nil {
 		return "", err
 	}
@@ -193,8 +232,8 @@
 	if err := utils.RunCommand([]string{
 		"tar",
 		"cf", out.Name(),
-		"--transform", fmt.Sprintf("s|%s|disk.raw|g", strings.TrimLeft(img.Name(), "/")),
-		img.Name(),
+		"--transform", fmt.Sprintf("s|%s|disk.raw|g", strings.TrimLeft(imageName, "/")),
+		imageName,
 	}, "", nil); err != nil {
 		return "", err
 	}
@@ -266,7 +305,22 @@
 	if err := updateProvConfig(provConfig, buildSpec, buildContexts, gcs, files); err != nil {
 		return nil, err
 	}
-	ciDataFile, err := writeCIDataImage(files)
+	ciDataFile, err := writeImage(&ciDataImg)
+	if err != nil {
+		return nil, err
+	}
+	if err := mcopyFiles(ciDataFile, files); err != nil {
+		return nil, err
+	}
+	ciDataFileTar, err := tarImage(ciDataFile)
+	if err != nil {
+		return nil, err
+	}
+	scratchImgFile, err := writeImage(&scratchImg)
+	if err != nil {
+		return nil, err
+	}
+	scratchImgFileTar, err := tarImage(scratchImgFile)
 	if err != nil {
 		return nil, err
 	}
@@ -305,7 +359,9 @@
 		"-var:output_image_project",
 		output.Project,
 		"-var:cidata_img",
-		ciDataFile,
+		ciDataFileTar,
+		"-var:scratch_img",
+		scratchImgFileTar,
 		"-var:machine_type",
 		buildSpec.MachineType,
 		"-var:disk_type",
diff --git a/src/pkg/preloader/preload_test.go b/src/pkg/preloader/preload_test.go
index 7de309a..a058502 100644
--- a/src/pkg/preloader/preload_test.go
+++ b/src/pkg/preloader/preload_test.go
@@ -21,6 +21,7 @@
 	"io/ioutil"
 	"os"
 	"path/filepath"
+	"strings"
 	"testing"
 
 	"cos.googlesource.com/cos/tools.git/src/pkg/config"
@@ -135,64 +136,82 @@
 		testName    string
 		outputImage *config.Image
 		buildConfig *config.Build
+		provConfig  *provisioner.Config
 		workflow    []byte
-		want        []byte
+		want        string
 	}{
 		{
 			testName:    "Empty",
 			outputImage: config.NewImage("", ""),
 			buildConfig: &config.Build{GCSBucket: "bucket"},
 			workflow:    []byte("{{.Licenses}} {{.Labels}} {{.Accelerators}}"),
-			want:        []byte("null {} []"),
+			want:        "null {} []",
 		},
 		{
 			testName:    "OneLicense",
 			outputImage: &config.Image{Image: &compute.Image{Licenses: []string{"my-license"}}, Project: ""},
 			buildConfig: &config.Build{GCSBucket: "bucket"},
 			workflow:    []byte("{{.Licenses}}"),
-			want:        []byte("[\"my-license\"]"),
+			want:        "[\"my-license\"]",
 		},
 		{
 			testName:    "TwoLicenses",
 			outputImage: &config.Image{Image: &compute.Image{Licenses: []string{"license-1", "license-2"}}, Project: ""},
 			buildConfig: &config.Build{GCSBucket: "bucket"},
 			workflow:    []byte("{{.Licenses}}"),
-			want:        []byte("[\"license-1\",\"license-2\"]"),
+			want:        "[\"license-1\",\"license-2\"]",
 		},
 		{
 			testName:    "EmptyStringLicense",
 			outputImage: &config.Image{Image: &compute.Image{Licenses: []string{""}}, Project: ""},
 			buildConfig: &config.Build{GCSBucket: "bucket"},
 			workflow:    []byte("{{.Licenses}}"),
-			want:        []byte("null"),
+			want:        "null",
 		},
 		{
 			testName:    "OneEmptyLicense",
 			outputImage: &config.Image{Image: &compute.Image{Licenses: []string{"license-1", ""}}, Project: ""},
 			buildConfig: &config.Build{GCSBucket: "bucket"},
 			workflow:    []byte("{{.Licenses}}"),
-			want:        []byte("[\"license-1\"]"),
+			want:        "[\"license-1\"]",
 		},
 		{
 			testName:    "URLLicense",
 			outputImage: &config.Image{Image: &compute.Image{Licenses: []string{"https://www.googleapis.com/compute/v1/projects/my-proj/global/licenses/my-license"}}, Project: ""},
 			buildConfig: &config.Build{GCSBucket: "bucket"},
 			workflow:    []byte("{{.Licenses}}"),
-			want:        []byte("[\"projects/my-proj/global/licenses/my-license\"]"),
+			want:        "[\"projects/my-proj/global/licenses/my-license\"]",
 		},
 		{
 			testName:    "Labels",
 			outputImage: &config.Image{Image: &compute.Image{Labels: map[string]string{"key": "value"}}, Project: ""},
 			buildConfig: &config.Build{GCSBucket: "bucket"},
 			workflow:    []byte("{{.Labels}}"),
-			want:        []byte("{\"key\":\"value\"}"),
+			want:        "{\"key\":\"value\"}",
 		},
 		{
 			testName:    "Accelerators",
 			outputImage: config.NewImage("", ""),
 			buildConfig: &config.Build{GCSBucket: "bucket", GPUType: "nvidia-tesla-k80", Project: "p", Zone: "z"},
 			workflow:    []byte("{{.Accelerators}}"),
-			want:        []byte("[{\"acceleratorCount\":1,\"acceleratorType\":\"projects/p/zones/z/acceleratorTypes/nvidia-tesla-k80\"}]"),
+			want:        "[{\"acceleratorCount\":1,\"acceleratorType\":\"projects/p/zones/z/acceleratorTypes/nvidia-tesla-k80\"}]",
+		},
+		{
+			testName:    "ScratchDisk",
+			outputImage: config.NewImage("", ""),
+			buildConfig: &config.Build{GCSBucket: "bucket", GPUType: "nvidia-tesla-k80", Project: "p", Zone: "z"},
+			provConfig:  &provisioner.Config{
+				Steps: []provisioner.StepConfig{
+					{
+						Type: "InstallGPU",
+						Args: mustMarshalJSON(t, &provisioner.InstallGPUStep{
+							GCSDepsPrefix: "gcs_deps",
+						}),
+					},
+				},
+			},
+			workflow:    []byte("{{.ScratchDisks}} {{.ScratchDiskSource}}"),
+			want:        `{"Name":"scratch-disk","SourceImage":"scratch","Type":"${disk_type}","SizeGb":"5"},{"Source":"scratch-disk"},`,
 		},
 	}
 	gcs := fakes.GCSForTest(t)
@@ -209,7 +228,10 @@
 			if err := ioutil.WriteFile(files.DaisyWorkflow, input.workflow, 0744); err != nil {
 				t.Fatal(err)
 			}
-			args, err := daisyArgs(context.Background(), gm, files, config.NewImage("", ""), input.outputImage, input.buildConfig, &provisioner.Config{})
+			if input.provConfig == nil {
+				input.provConfig = &provisioner.Config{}
+			}
+			args, err := daisyArgs(context.Background(), gm, files, config.NewImage("", ""), input.outputImage, input.buildConfig, input.provConfig)
 			if err != nil {
 				t.Fatalf("daisyArgs: %v", err)
 			}
@@ -217,8 +239,14 @@
 			if err != nil {
 				t.Fatal(err)
 			}
-			if !cmp.Equal(got, input.want) {
-				t.Errorf("daisyArgs: template Daisy: got %s, want %s", string(got), string(input.want))
+			gotStr := string(got)
+			gotStr = strings.Replace(gotStr, "\n", "", -1)
+			gotStr = strings.Replace(gotStr, " ", "", -1)
+
+			input.want = strings.Replace(input.want, "\n", "", -1) 
+			input.want = strings.Replace(input.want, " ", "", -1) 
+			if !cmp.Equal(gotStr, input.want) {
+				t.Errorf("daisyArgs: template Daisy: got %s, want %s", gotStr, input.want)
 			}
 		})
 	}
diff --git a/src/pkg/provisioner/gpu_setup_script.go b/src/pkg/provisioner/gpu_setup_script.go
index 4c88229..8e83beb 100644
--- a/src/pkg/provisioner/gpu_setup_script.go
+++ b/src/pkg/provisioner/gpu_setup_script.go
@@ -17,6 +17,7 @@
 const gpuSetupScriptTemplate = `#!/bin/bash
 
 set -o errexit
+set -x
 
 export NVIDIA_DRIVER_VERSION={{.NvidiaDriverVersion}}
 export NVIDIA_DRIVER_MD5SUM={{.NvidiaDriverMD5Sum}}
@@ -24,6 +25,8 @@
 export COS_NVIDIA_INSTALLER_CONTAINER={{.NvidiaInstallerContainer}}
 export NVIDIA_INSTALL_DIR_CONTAINER=/usr/local/nvidia
 export ROOT_MOUNT_DIR=/root
+export TOOLCHAIN_PKG_DIR="/toolchain"
+export SCRATCH_DISK_LOCATION="/mnt/disks/scratch/gpu"
 
 pull_installer() {
   local docker_code
@@ -56,6 +59,8 @@
     --volume ${NVIDIA_INSTALL_DIR_HOST}:${NVIDIA_INSTALL_DIR_CONTAINER} \
     --volume /dev:/dev \
     --volume /:${ROOT_MOUNT_DIR} \
+    --volume ${SCRATCH_DISK_LOCATION}:/toolchain \
+    -e TOOLCHAIN_PKG_DIR \
     -e NVIDIA_DRIVER_VERSION \
     -e NVIDIA_DRIVER_MD5SUM \
     -e NVIDIA_INSTALL_DIR_HOST \