blob: f337bf1829d14ff9756a57c490d703962e8b03b5 [file] [log] [blame]
"""
gpu-installer-verifier is invoked when release-versions.yaml is changed.
It verifies that the main functionality of cos-gpu-installer works correctly.
This ensures that a problematic cos-gpu-installer won't be released to the public.
"""
import sys
import yaml
import logging
import subprocess
import dataclasses
@dataclasses.dataclass
class GPUResource:
    """A GPU device configuration used by the validation test VMs.

    Attributes:
        device_name: Name of the GPU device, e.g. ``'nvidia-tesla-p4'``.
        zones: Zones in which a test VM with this device may be created.
        machine_type: Machine type of the test VM.
        device_count: Number of GPU devices; defaults to one.
    """

    device_name: str
    zones: list[str]
    machine_type: str = dataclasses.field(default='n1-standard-1')
    device_count: int = dataclasses.field(default=1)
def parse_tag(container_name, versions_file='release/release-versions.yaml'):
    """Look up the build tag recorded for a staging container.

    Args:
        container_name: Value compared against each entry's
            ``staging_container_name`` key.
        versions_file: Path of the YAML file to read. Defaults to
            ``release/release-versions.yaml`` relative to the current
            working directory.

    Returns:
        The ``build_tag`` of the first entry whose ``staging_container_name``
        equals ``container_name``, or None when the file is empty, no entry
        matches, or the matching entry has no ``build_tag``.

    Raises:
        OSError: If ``versions_file`` cannot be opened.
    """
    with open(versions_file, 'r', encoding='utf-8') as file:
        # safe_load returns None for an empty document; treat it as "no entries"
        # instead of failing on iteration.
        data = yaml.safe_load(file) or []
    for item in data:
        # Skip malformed entries rather than raising on a missing key.
        if not isinstance(item, dict):
            continue
        if item.get('staging_container_name') == container_name:
            return item.get('build_tag')
    return None
def get_cos_images():
    """
    Retrieves a list of COS image families from gcloud.

    Runs ``gcloud compute images list`` against the ``cos-cloud`` project and
    collects the family of every listed image. Families whose name contains
    ``arm`` (the ARM64 image families) are excluded, as are images that have
    no family.

    Returns:
      A list of strings, where each string is a COS image family name. An
      empty list is returned when gcloud cannot be started or exits with a
      non-zero status.
    """
    # Ask gcloud for the family column directly instead of cutting it out of
    # the human-readable table; this also makes the return code gcloud's own
    # rather than that of the last command in a shell pipeline.
    command = [
        "gcloud", "compute", "images", "list",
        "--no-standard-images",
        "--project=cos-cloud",
        "--format=value(family)",
    ]
    try:
        process = subprocess.run(command, capture_output=True, text=True)
    except OSError as e:
        # Raised when the gcloud executable is missing or cannot be executed.
        logging.error(f"An error occurred: {e}")
        return []
    if process.returncode != 0:
        logging.error(f"Error executing gcloud command: {process.stderr}")
        return []
    families = (line.strip() for line in process.stdout.splitlines())
    return [family for family in families if family and "arm" not in family]
def _run_installer_test(substitutions):
    """Submit one gpu_installer_test Cloud Build and report whether it passed.

    Args:
        substitutions: Mapping of Cloud Build substitution names to values,
            passed to ``gcloud builds submit --substitutions``.

    Returns:
        True if ``gcloud builds submit`` exited with status 0, False if it
        exited with a non-zero status or could not be started.
    """
    command = [
        "gcloud", "builds", "submit",
        "--config=testing/gpu_installer_test/gpu_installer_test.yaml",
        "--substitutions=" + ",".join(
            f"{key}={value}" for key, value in substitutions.items()),
        ".",
    ]
    try:
        # check=True is what turns a non-zero exit into CalledProcessError;
        # without it every submission would be counted as a pass.
        subprocess.run(command, text=True, capture_output=True, check=True)
    except subprocess.CalledProcessError as err:
        # Output is captured, so surface stderr here or the reason is lost.
        logging.info(f"gcloud builds submit exited with code {err.returncode}: {(err.stderr or '').strip()}")
        return False
    except OSError as err:
        logging.info(f"gcloud builds submit could not be started: {err}")
        return False
    return True


def main():
    """Run the GPU installer sanity check on every GPU type and COS image family.

    Command-line arguments (``sys.argv[1:4]``):
        1. Project ID of the Cloud Build.
        2. GCR of the container image.
        3. Project ID in which the test VMs are spun up.

    For each GPU resource and image family, the test is submitted in each of
    the resource's zones in order until one submission succeeds.

    Exits with status 1 when arguments are missing, the installer tag cannot be
    parsed, no image family is found, or a GPU/image combination fails in all
    of its zones. Exits with status 0 otherwise.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    if len(sys.argv) < 4:
        logging.error(f"Expected 3 arguments (PROJECT_ID BUILD_GCR VM_PROJECT), got {len(sys.argv) - 1}.")
        sys.exit(1)
    project_id, build_gcr, vm_project = sys.argv[1:4]
    logging.info(f"Project ID of cloud build: {project_id}, gcr of container image:{build_gcr}, Project ID to spin up test VMs: {vm_project}.")

    cloud_config_data = "./gpu_installer_test.cfg"
    image_project = "cos-cloud"
    daisy_gcs_bucket = "{}-daisy-bkt".format(project_id.replace('google.com:', 'elgoog_com_'))

    image_families = get_cos_images()
    if not image_families:
        # With nothing to iterate over, the loops below would report success
        # without having verified anything.
        logging.error("No COS image families found; nothing to verify.")
        sys.exit(1)

    gpu_resources = [
        GPUResource(
            device_name='nvidia-tesla-p4',
            zones=[
                'us-central1-a',
                'us-west2-b',
                'us-east4-a',
            ],
        ),
        GPUResource(
            device_name='nvidia-tesla-t4',
            zones=[
                'us-west1-a',
                'us-west2-c',
                'us-east1-c',
            ],
        ),
    ]

    tag = parse_tag("cos-gpu-installer")
    if not tag:
        logging.error("GPU installer tag cannot be parsed")
        sys.exit(1)
    logging.info(f"GPU installer tag parsed: {tag}")

    for gpu_resource in gpu_resources:
        for image_family in image_families:
            logging.info(f"Running the GPU installer sanity check for {gpu_resource.device_name} on {image_family}")
            success = False
            for zone in gpu_resource.zones:
                test_name = f"test-{gpu_resource.device_name}-{zone}-{image_family}"
                # The accelerator type must follow the resource under test;
                # a fixed device name would test the same GPU for every entry.
                accelerator_resource = (
                    f"https://www.googleapis.com/compute/v1/projects/{vm_project}"
                    f"/zones/{zone}/acceleratorTypes/{gpu_resource.device_name}"
                )
                substitutions = {
                    "_PROJECT_ID": project_id,
                    "_ZONE": zone,
                    "_CLOUD_CONFIG_DATA": cloud_config_data,
                    "_BUILD_GCR": build_gcr,
                    "_CONTAINER_IMAGE_TAG": tag,
                    "_VM_NAME": test_name,
                    "_VM_PROJECT": vm_project,
                    "_MACHINE_TYPE": gpu_resource.machine_type,
                    "_ACCELERATOR_TYPE": accelerator_resource,
                    "_IMAGE_PROJECT": image_project,
                    "_IMAGE_FAMILY": image_family,
                    "_DAISY_GCS_BUCKET": daisy_gcs_bucket,
                }
                if _run_installer_test(substitutions):
                    success = True
                    break
                logging.info(f"GPU installer sanity check for {gpu_resource.device_name} in {zone} on {image_family} failed.")
            if not success:
                logging.error(f"GPU installer sanity check for {gpu_resource.device_name} on {image_family} in all available zones failed.")
                sys.exit(1)
            logging.info(f"GPU installer sanity check for {gpu_resource.device_name} on {image_family} passed.")
    sys.exit(0)
# Run the verifier only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()