#!/bin/bash
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
| |
# Abort on any unhandled command failure, including failures inside a pipeline.
set -o errexit -o pipefail

# Settings that parse_arguments overwrites from the command line.
PROJECT=""
MODE=""
SERVICE_ACCOUNT="compute@cloud-image-docker-builder.google.com.iam.gserviceaccount.com"
COS_CUSTOMIZER="cos_customizer"

# Zones with approved GPU quotas.
readonly -a GPU_ZONES=(
  "us-central1-a"
  "us-central1-b"
  "us-central1-c"
  "us-central1-f"
)

# Broader pool of zones for standard CPU-only integration tests.
readonly -a STANDARD_ZONES=(
  "us-west1-a"
  "us-west1-b"
  "us-west1-c"
  "us-central1-a"
  "us-central1-b"
  "us-central1-c"
  "us-central1-f"
  "us-east1-b"
  "us-east1-c"
  "us-east1-d"
  "us-east4-a"
  "us-east4-b"
  "us-east4-c"
  "us-west2-a"
  "us-west2-b"
  "us-west2-c"
)

# Upper bound on how many times a single test config is attempted.
readonly MAX_TEST_ATTEMPTS=5
| |
# Prints the command-line help text to stdout.
usage() {
  # One argument per output line; single quotes keep the backticks literal.
  printf '%s\n' \
    'Usage: ./run_builds.sh [OPTIONS]' \
    'run_builds.sh runs multiple cloud builds and waits for the status of each' \
    'build and reports back failures. It is used in two ways in this repository:' \
    '- To run cos-customizer integration tests (20+ concurrent builds in' \
    'Cloud Build). Minimum quota requirements include ~10 K80s in us-west1.' \
    '- To publish container images in src/cmd to AR (8+ concurrent builds)' \
    '' \
    '-p,--project=<project_name> GCP project to run tests in. Required.' \
    '--service-account=<service_account> Service account to use in cos-customizer builds' \
    '--cos-customizer-image=<container_image_uri> cos-customizer container image to test. By default, each tests compiles an image from source.' \
    '-m,--mode=<test|publish> Mode to run the scripts in. Use `test` for running cos-customizer integration tests and `publish` to publish images.'
}
| |
# Parses the command-line flags into the script's global settings.
# Globals written: PROJECT, MODE, SERVICE_ACCOUNT, COS_CUSTOMIZER.
# Arguments: the script's command-line arguments ("$@").
# Exits 0 after printing usage for -h/--help. Exits 1 (usage on stderr) when
# getopt rejects the arguments or --mode is neither "test" nor "publish".
parse_arguments() {
  # "help" carries no ':' because --help is a plain flag without a value.
  local -r long_options="project:,mode:,help,service-account:,cos-customizer-image:"
  local parsed_args
  # Short options are listed without separators; a ',' in this string would
  # itself be registered as an option character.
  if ! parsed_args="$(getopt --options=p:m:h --longoptions="${long_options}" --name "$0" -- "$@")"; then
    usage 1>&2
    exit 1
  fi
  # getopt emits a shell-quoted, normalized argument list terminated by "--";
  # eval re-installs it as the positional parameters.
  eval set -- "${parsed_args}"
  while true; do
    case "$1" in
      -p|--project)
        PROJECT="$2"
        shift 2
        ;;
      --service-account)
        SERVICE_ACCOUNT="$2"
        shift 2
        ;;
      --cos-customizer-image)
        COS_CUSTOMIZER="$2"
        shift 2
        ;;
      -m|--mode)
        # Reject unknown modes here; otherwise the caller would carry on with
        # a value that selects no valid workflow.
        if [[ "$2" != "test" && "$2" != "publish" ]]; then
          echo "Incorrect value for argument: 'mode'" 1>&2
          usage 1>&2
          exit 1
        fi
        MODE="$2"
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      --)
        shift
        break
        ;;
      *)
        usage 1>&2
        exit 1
        ;;
    esac
  done
}
| |
# Checks that the mandatory settings were supplied.
# Prints the usage text and returns 1 when PROJECT or MODE is empty;
# returns 0 otherwise.
validate_args() {
  if [[ -n "${PROJECT}" && -n "${MODE}" ]]; then
    return 0
  fi
  usage
  return 1
}
| |
# Prints the `status` field of the Cloud Build with the given ID ($1),
# looked up in project PROJECT.
get_build_status() {
  local -r id="$1"
  local -r -a describe_args=(
    "${id}"
    "--project=${PROJECT}"
    "--format=value(status)"
  )
  gcloud builds describe "${describe_args[@]}"
}
| |
# Prints the `logUrl` field of the Cloud Build with the given ID ($1),
# looked up in project PROJECT.
get_log_url() {
  local -r id="$1"
  local -r -a describe_args=(
    "${id}"
    "--project=${PROJECT}"
    "--format=value(logUrl)"
  )
  gcloud builds describe "${describe_args[@]}"
}
| |
# Submits an asynchronous Cloud Build for an integration-test config and prints
# the new build's ID.
# Arguments: $1 - path to the build config, $2 - zone passed as _ZONE.
# Globals read: PROJECT, PROJECT_NAME, SERVICE_ACCOUNT, COS_CUSTOMIZER.
start_test_build() {
  local -r cloudbuild_config="$1"
  local -r target_zone="$2"

  # All substitutions travel as one comma-separated key=value argument.
  local substitutions
  printf -v substitutions \
    '_PROJECT_NAME=%s,_SERVICE_ACCOUNT=%s,_COS_CUSTOMIZER=%s,_ZONE=%s' \
    "${PROJECT_NAME}" "${SERVICE_ACCOUNT}" "${COS_CUSTOMIZER}" "${target_zone}"

  gcloud builds submit \
    --config="${cloudbuild_config}" \
    --project="${PROJECT}" \
    --substitutions="${substitutions}" \
    --async \
    --format='value(ID)' \
    .
}
| |
# Submits an asynchronous Cloud Build for a publish config ($1) and prints the
# new build's ID.
# Globals read: PROJECT, OUTPUT_PROJECT, TAG_NAME, BUILD_TYPE.
start_publish_build() {
  local -r cloudbuild_config="$1"

  # All substitutions travel as one comma-separated key=value argument.
  local substitutions
  printf -v substitutions \
    '_OUTPUT_PROJECT=%s,_TAG_NAME=%s,_BUILD_TYPE=%s' \
    "${OUTPUT_PROJECT}" "${TAG_NAME}" "${BUILD_TYPE}"

  gcloud builds submit \
    --config="${cloudbuild_config}" \
    --project="${PROJECT}" \
    --substitutions="${substitutions}" \
    --async \
    --format='value(ID)' \
    .
}
| |
# Polls a Cloud Build until it reaches a terminal state.
# Arguments: $1 - build ID.
# Outputs:   the terminal status on stdout; diagnostics on stderr.
# Returns:   0 once a terminal status was printed, 1 when the reported status
#            is not a recognized value (including an empty status).
wait_for_build() {
  local -r build_id="$1"
  local -r poll_interval_seconds=5
  local status
  while true; do
    status=$(get_build_status "${build_id}")
    case "${status}" in
      "SUCCESS"|"FAILURE"|"INTERNAL_ERROR"|"TIMEOUT"|"CANCELLED"|"EXPIRED")
        # Terminal states. EXPIRED is reported for a build that stayed
        # enqueued longer than its queue TTL, so it will never start.
        echo "${status}"
        return 0
        ;;
      "PENDING"|"QUEUED"|"WORKING")
        # Still in flight. PENDING covers a build that has been created but
        # has not been queued yet.
        sleep "${poll_interval_seconds}"
        ;;
      "STATUS_UNKNOWN")
        echo "Received STATUS_UNKNOWN for build ${build_id}" 1>&2
        sleep "${poll_interval_seconds}"
        ;;
      *)
        echo "Unknown status for build ${build_id}: ${status}" 1>&2
        return 1
        ;;
    esac
  done
}
| |
# Checks Cloud Logging for a ZONE_RESOURCE_POOL_EXHAUSTED entry belonging to
# the given build ($1), looking back one day. A single hit is sufficient, so
# the query is capped at one result.
# Returns 0 when an entry was found, 1 otherwise.
is_stockout() {
  local -r id="$1"
  local query match

  printf -v query \
    'resource.type="build" AND resource.labels.build_id="%s" AND SEARCH("ZONE_RESOURCE_POOL_EXHAUSTED")' \
    "${id}"

  # gcloud's stderr is discarded; only the matched entry's insertId matters.
  match=$(gcloud logging read "${query}" \
    --project="${PROJECT}" \
    --limit=1 \
    --format="value(insertId)" \
    --freshness=1d 2>/dev/null)

  if [[ -z "${match}" ]]; then
    return 1
  fi
  return 0
}
| |
| # --- Execution Workflows --- |
| |
# Submits one publish build per src/cmd/*/cloudbuild.yaml, waits for each of
# them, and prints a result line per build (plus the log URL on failure).
# Returns 0 only when every build was submitted and finished with SUCCESS.
run_publish_mode() {
  local -a build_ids=()
  local config build_id status log_url
  local exit_code=0

  for config in src/cmd/*/cloudbuild.yaml; do
    # A failed or empty submission is recorded as a failure rather than left
    # to errexit, so builds that were already started are still waited on.
    if build_id="$(start_publish_build "${config}")" && [[ -n "${build_id}" ]]; then
      build_ids+=("${build_id}")
    else
      echo "Failed to start build for ${config}" 1>&2
      exit_code=1
    fi
  done

  echo
  for build_id in "${build_ids[@]}"; do
    # A polling error counts as a failed build; the remaining builds must
    # still be reported.
    status="$(wait_for_build "${build_id}")" || status="UNKNOWN"
    if [[ "${status}" == "SUCCESS" ]]; then
      echo "Build ${build_id} succeeded"
    else
      log_url="$(get_log_url "${build_id}")" || log_url="(unavailable)"
      echo "Build ${build_id} failed"
      echo "Logs: ${log_url}"
      exit_code=1
    fi
  done
  return "${exit_code}"
}
| |
# Runs every testing/*.yaml config as a Cloud Build and prints a summary.
#
# Builds are submitted in rounds. A build that fails and for which is_stockout
# reports a zone stockout is resubmitted in the next round, up to
# MAX_TEST_ATTEMPTS attempts per config; any other failure is final. Configs
# whose path contains "gpu" (case-insensitive) draw their zone from GPU_ZONES,
# all others from STANDARD_ZONES. A retry never reuses the zone of the
# immediately preceding attempt as long as the pool offers an alternative.
#
# Returns 0 only when every config finished with SUCCESS.
run_test_mode() {
  local exit_code=0
  local round=1
  local config build_id status log_url attempt zone candidate

  local -a active_configs=(testing/*.yaml)
  local -a next_active_configs=() zone_pool=() candidate_zones=() sorted_configs=()
  # Per-config state, keyed by config path.
  local -A attempts_map=() history_map=() final_status_map=() last_zone_map=()
  # Build ID -> config path for the builds of the current round.
  local -A current_builds=()

  # An unmatched glob stays a literal pattern; fail clearly instead of
  # submitting a build for a file that does not exist.
  if [[ ! -e "${active_configs[0]}" ]]; then
    echo "No test configs found matching testing/*.yaml" 1>&2
    return 1
  fi

  for config in "${active_configs[@]}"; do
    attempts_map["$config"]=0
    history_map["$config"]=""
    final_status_map["$config"]="PENDING"
    last_zone_map["$config"]=""
  done

  # Loop until no config requires another attempt.
  while [[ ${#active_configs[@]} -gt 0 ]]; do
    echo "=== Starting Test Round #${round} (${#active_configs[@]} builds) ==="

    # Start each round with empty bookkeeping so builds from earlier rounds
    # are not evaluated again.
    current_builds=()
    next_active_configs=()

    for config in "${active_configs[@]}"; do
      # ${config,,} lower-cases the path for a case-insensitive match.
      if [[ "${config,,}" == *"gpu"* ]]; then
        zone_pool=("${GPU_ZONES[@]}")
      else
        zone_pool=("${STANDARD_ZONES[@]}")
      fi

      # Drop the zone used by the previous attempt so that a stockout retry
      # really lands in a different zone.
      candidate_zones=()
      for candidate in "${zone_pool[@]}"; do
        [[ "${candidate}" == "${last_zone_map[$config]}" ]] || candidate_zones+=("${candidate}")
      done
      if [[ ${#candidate_zones[@]} -eq 0 ]]; then
        candidate_zones=("${zone_pool[@]}")
      fi
      zone="${candidate_zones[$((RANDOM % ${#candidate_zones[@]}))]}"
      last_zone_map["$config"]="${zone}"

      attempt=$((${attempts_map[$config]} + 1))
      attempts_map["$config"]="${attempt}"

      echo "Triggering ${config} -> Zone: ${zone} (Attempt #${attempt})"
      # A failed or empty submission is a final failure for this config; it
      # must neither abort the run nor become an empty array subscript.
      if ! build_id="$(start_test_build "${config}" "${zone}")" || [[ -z "${build_id}" ]]; then
        echo "Failed to start build for ${config}" 1>&2
        history_map["$config"]+="#${attempt} run: failed to start build"$'\n'
        final_status_map["$config"]="FAILURE"
        exit_code=1
        continue
      fi
      current_builds["$build_id"]="$config"
    done

    echo "Waiting for round #${round} builds to complete..."
    echo

    # Builds run concurrently in Cloud Build; their results are collected here
    # one after another.
    for build_id in "${!current_builds[@]}"; do
      config="${current_builds[$build_id]}"
      attempt="${attempts_map[$config]}"

      # Polling or log-lookup errors are folded into the result so the
      # remaining builds and the final summary are still produced.
      status="$(wait_for_build "${build_id}")" || status="UNKNOWN"
      log_url="$(get_log_url "${build_id}")" || log_url="(unavailable)"
      echo "Build ${build_id} for config ${config} completed with status: ${status}"

      if [[ "${status}" == *"SUCCESS"* ]]; then
        history_map["$config"]+="#${attempt} run: Build succeeded"$'\n'
        final_status_map["$config"]="SUCCESS"
      elif is_stockout "${build_id}"; then
        if [[ ${attempt} -lt ${MAX_TEST_ATTEMPTS} ]]; then
          history_map["$config"]+="#${attempt} run: failed stockout: \"Logs: ${log_url}\""$'\n'
          echo "Detected zone stockout for ${config} on attempt #${attempt}. See ${log_url}. Retrying in a new zone..."
          next_active_configs+=("$config")
        else
          history_map["$config"]+="#${attempt} run: failed stockout (max retries exhausted): \"Logs: ${log_url}\""$'\n'
          final_status_map["$config"]="FAILURE"
          exit_code=1
        fi
      else
        # Non-stockout failures are final and are not retried.
        history_map["$config"]+="#${attempt} run: failed [other issues]: \"Logs: ${log_url}\""$'\n'
        final_status_map["$config"]="FAILURE"
        exit_code=1
      fi
    done

    # Only configs that hit a stockout go into the next round.
    active_configs=("${next_active_configs[@]}")
    round=$((round + 1))
  done

  # --- Display Final Report ---
  echo
  echo "========================================"
  echo " TEST BUILD SUMMARY REPORT "
  echo "========================================"

  # Sort the configs so the summary order is stable between runs.
  readarray -t sorted_configs < <(printf '%s\n' "${!history_map[@]}" | sort)

  for config in "${sorted_configs[@]}"; do
    echo "${config} -> Status: ${final_status_map[$config]}"

    # The attempt history is only printed for configs that did not succeed.
    if [[ "${final_status_map[$config]}" != "SUCCESS" ]]; then
      printf "%s" "${history_map[$config]}"
    fi
  done

  return "${exit_code}"
}
| |
# Entry point: parses and validates the flags, derives PROJECT_NAME from
# PROJECT, then runs the workflow selected by MODE.
# Globals: PROJECT, MODE (read); PROJECT_NAME (written, read by start_test_build).
# Returns the status of the selected workflow, or 1 for an unsupported mode.
main() {
  parse_arguments "$@"
  validate_args

  echo "Starting builds in project ${PROJECT} in mode ${MODE}"
  # Every "google.com:" in the project ID is replaced by "elgoog_com_".
  PROJECT_NAME=${PROJECT//google.com:/elgoog_com_}

  # Dispatch explicitly so an unexpected mode can never fall through into
  # running the integration tests.
  case "${MODE}" in
    publish)
      run_publish_mode
      ;;
    test)
      run_test_mode
      ;;
    *)
      echo "Unsupported mode: '${MODE}'" 1>&2
      usage 1>&2
      return 1
      ;;
  esac
}

main "$@"