| |
| """ |
| gpu-installer-verifier is invoked when release-versions.yaml is changed. |
| It verifies that the main functionality of cos-gpu-installer works correctly. |
| This ensures that a problematic cos-gpu-installer won't be released to the public. |
| """ |
| |
| import sys |
| import yaml |
| import logging |
| import subprocess |
| import dataclasses |
| |
@dataclasses.dataclass
class GPUResource:
    """Describes one GPU configuration used by the validation test VMs.

    Attributes:
        device_name: Accelerator type name, e.g. "nvidia-tesla-t4".
        zones: Compute zones in which a test VM for this GPU may be created.
        machine_type: Machine type of the test VM; defaults to n1-standard-1.
        device_count: Number of GPU devices; defaults to 1.
    """

    device_name: str
    zones: list[str]
    machine_type: str = "n1-standard-1"
    device_count: int = 1
| |
def parse_tag(container_name):
    """Look up a container's build tag in release/release-versions.yaml.

    Args:
        container_name: Value compared against the `staging_container_name`
            field of each entry in the YAML file.

    Returns:
        The `build_tag` of the first matching entry. None is returned when
        the file is empty, no entry matches, or the matching entry has no
        `build_tag` field.

    Raises:
        OSError: If the release file cannot be opened.
        yaml.YAMLError: If the release file is not valid YAML.
    """
    with open('release/release-versions.yaml', 'r', encoding='utf-8') as file:
        data = yaml.safe_load(file)

    # safe_load returns None for an empty document; treat that as "no entries"
    # instead of failing with a TypeError while iterating.
    for item in data or []:
        # Skip anything that is not a mapping so a malformed entry cannot
        # crash the lookup of a well-formed one.
        if not isinstance(item, dict):
            continue
        if item.get('staging_container_name') == container_name:
            return item.get('build_tag')
    return None
| |
def get_cos_images() -> list[str]:
    """Retrieve the list of COS image families from gcloud.

    Runs `gcloud compute images list` for the cos-cloud project, keeps the
    third whitespace-separated column of the table output (the image family,
    assuming gcloud's default column order), drops the header row, and
    excludes every line containing "arm" (the ARM64 image families).

    Returns:
        A list of strings, where each string is a COS image family name.
        An empty list is returned when the command cannot be run, exits
        with a non-zero status, or produces no output.
    """
    # NOTE: the exit status of a shell pipeline is that of its last command,
    # so a non-zero status here may also mean grep selected no lines.
    command = "gcloud compute images list --no-standard-images --project=cos-cloud | awk '{print $3}' | tail -n +2 | grep -v arm"
    try:
        # Run the gcloud command and capture the output.
        process = subprocess.run(command, shell=True, capture_output=True, text=True)
    except Exception as e:
        # Best effort: callers receive an empty list rather than an exception.
        print(f"An error occurred: {e}")
        return []

    if process.returncode != 0:
        print(f"Error executing gcloud command: {process.stderr}")
        return []

    return process.stdout.strip().splitlines()
| |
def main():
    """Run the GPU installer sanity check for every GPU type and COS image family.

    Command-line arguments (positional):
        1. Project ID of the Cloud Build project.
        2. GCR location of the container image under test.
        3. Project ID in which the test VMs are created.

    For each (GPU resource, image family) pair a Cloud Build job is submitted
    with `gcloud builds submit`, trying the GPU resource's zones in order
    until one build succeeds. The process exits with status 0 when every pair
    passed in at least one zone, and with status 1 when the arguments are
    missing, no image family could be listed, the installer tag cannot be
    parsed, or a pair failed in all of its zones.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    if len(sys.argv) < 4:
        logging.error(
            f"Usage: {sys.argv[0]} <cloud_build_project_id> <build_gcr> <vm_project_id>"
        )
        sys.exit(1)
    project_id, build_gcr, vm_project = sys.argv[1:4]

    logging.info(f"Project ID of cloud build: {project_id}, gcr of container image:{build_gcr}, Project ID to spin up test VMs: {vm_project}.")

    cloud_config_data = "./gpu_installer_test.cfg"
    image_project = "cos-cloud"
    daisy_gcs_bucket = "{}-daisy-bkt".format(project_id.replace('google.com:', 'elgoog_com_'))

    image_families = get_cos_images()
    if not image_families:
        # Without this guard the loops below would not run at all and the
        # verifier would report success without having tested anything.
        logging.error("No COS image families could be listed; nothing was verified.")
        sys.exit(1)

    gpu_resources = [
        GPUResource(
            device_name='nvidia-tesla-p4',
            zones=[
                'us-central1-a',
                'us-west2-b',
                'us-east4-a',
            ],
        ),
        GPUResource(
            device_name='nvidia-tesla-t4',
            zones=[
                'us-west1-a',
                'us-west2-c',
                'us-east1-c',
            ],
        ),
    ]

    tag = parse_tag("cos-gpu-installer")
    if not tag:
        logging.error("GPU installer tag cannot be parsed")
        sys.exit(1)
    logging.info(f"GPU installer tag parsed: {tag}")

    for gpu_resource in gpu_resources:
        for image_family in image_families:
            logging.info(f"Running the GPU installer sanity check for {gpu_resource.device_name} on {image_family}")
            for zone in gpu_resource.zones:
                test_name = f"test-{gpu_resource.device_name}-{zone}-{image_family}"
                # The accelerator type must follow the GPU resource under
                # test; a fixed type would test the same GPU for every entry.
                accelerator_resource = (
                    f"https://www.googleapis.com/compute/v1/projects/{vm_project}"
                    f"/zones/{zone}/acceleratorTypes/{gpu_resource.device_name}"
                )
                # NOTE(review): gpu_resource.device_count is not forwarded to
                # the build; confirm whether the build config accepts it.
                substitutions = {
                    "_PROJECT_ID": project_id,
                    "_ZONE": zone,
                    "_CLOUD_CONFIG_DATA": cloud_config_data,
                    "_BUILD_GCR": build_gcr,
                    "_CONTAINER_IMAGE_TAG": tag,
                    "_VM_NAME": test_name,
                    "_VM_PROJECT": vm_project,
                    "_MACHINE_TYPE": gpu_resource.machine_type,
                    "_ACCELERATOR_TYPE": accelerator_resource,
                    "_IMAGE_PROJECT": image_project,
                    "_IMAGE_FAMILY": image_family,
                    "_DAISY_GCS_BUCKET": daisy_gcs_bucket,
                }
                # An argument list (no shell) keeps values taken from argv and
                # the release file from being interpreted by a shell.
                command = [
                    "gcloud", "builds", "submit",
                    "--config=testing/gpu_installer_test/gpu_installer_test.yaml",
                    "--substitutions=" + ",".join(
                        f"{key}={value}" for key, value in substitutions.items()
                    ),
                    ".",
                ]
                try:
                    # check=True is required: without it a failed build never
                    # raises and every check would be reported as passed.
                    subprocess.run(command, text=True, capture_output=True, check=True)
                except subprocess.CalledProcessError as err:
                    logging.info(f"GPU installer sanity check for {gpu_resource.device_name} in {zone} on {image_family} failed.")
                    logging.info(f"gcloud stderr: {err.stderr}")
                except OSError as err:
                    logging.info(f"GPU installer sanity check for {gpu_resource.device_name} in {zone} on {image_family} failed.")
                    logging.info(f"gcloud could not be started: {err}")
                else:
                    # One passing zone is enough for this GPU/image pair.
                    break
            else:
                # The zone loop finished without a break: every zone failed.
                logging.error(f"GPU installer sanity check for {gpu_resource.device_name} on {image_family} in all available zones failed.")
                sys.exit(1)
            logging.info(f"GPU installer sanity check for {gpu_resource.device_name} on {image_family} passed.")
    sys.exit(0)
| |
# Run the verifier only when this file is executed as a script.
if __name__ == "__main__":
    main()