blob: 18e76ee4b07d7997b620c289d66ee228a6fba847 [file]
#!/bin/bash
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort on the first failing command, including a failure in any pipeline stage.
set -o errexit -o pipefail

# Run settings. PROJECT and MODE start empty; parse_arguments fills these in
# from the command line.
PROJECT=""
MODE=""
SERVICE_ACCOUNT="compute@cloud-image-docker-builder.google.com.iam.gserviceaccount.com"
COS_CUSTOMIZER="cos_customizer"

# Zones with approved GPU quotas.
declare -ra GPU_ZONES=(
  "us-central1-a"
  "us-central1-b"
  "us-central1-c"
  "us-central1-f"
)

# Wider zone pool used by the standard (CPU-only) integration tests.
declare -ra STANDARD_ZONES=(
  "us-west1-a"
  "us-west1-b"
  "us-west1-c"
  "us-central1-a"
  "us-central1-b"
  "us-central1-c"
  "us-central1-f"
  "us-east1-b"
  "us-east1-c"
  "us-east1-d"
  "us-east4-a"
  "us-east4-b"
  "us-east4-c"
  "us-west2-a"
  "us-west2-b"
  "us-west2-c"
)

# Upper bound on how many times a single test config is submitted.
declare -r MAX_TEST_ATTEMPTS=5
#######################################
# Prints the command-line help text for this script.
# Arguments: none
# Outputs:   the help text on stdout
#######################################
usage() {
  # The delimiter is quoted so the backticks in the text stay literal.
  cat <<'EOF'
Usage: ./run_builds.sh [OPTIONS]
run_builds.sh runs multiple cloud builds and waits for the status of each
build and reports back failures. It is used in two ways in this repository:
- To run cos-customizer integration tests (20+ concurrent builds in
Cloud Build). Minimum quota requirements include ~10 K80s in us-west1.
- To publish container images in src/cmd to AR (8+ concurrent builds)
-p,--project=<project_name> GCP project to run tests in. Required.
--service-account=<service_account> Service account to use in cos-customizer builds
--cos-customizer-image=<container_image_uri> cos-customizer container image to test. By default, each tests compiles an image from source.
-m,--mode=<test|publish> Mode to run the scripts in. Use `test` for running cos-customizer integration tests and `publish` to publish images.
-h,--help Show this help text and exit.
EOF
}
#######################################
# Parses command-line options into the script's global settings.
# Globals:   PROJECT, SERVICE_ACCOUNT, COS_CUSTOMIZER, MODE (written)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   the usage text on -h/--help (stdout) or on invalid input (stderr)
# Returns:   0 on success; exits 0 after printing help; exits 1 on an
#            unparsable command line or an unsupported --mode value
#######################################
parse_arguments() {
  # --help is a flag, so it must not carry the ':' that demands an argument.
  local -r long_options="project:,mode:,help,service-account:,cos-customizer-image:"
  local parsed_args
  # The short option string has no separators: getopt would read a ',' as an
  # option character of its own.
  if ! parsed_args="$(getopt --options=p:m:h --longoptions="${long_options}" --name "$0" -- "$@")"; then
    usage 1>&2
    exit 1
  fi
  # getopt emits a shell-quoted argument list; eval re-splits it into "$@".
  eval set -- "${parsed_args}"
  while true; do
    case "$1" in
      -p|--project)
        PROJECT="$2"
        shift 2
        ;;
      --service-account)
        SERVICE_ACCOUNT="$2"
        shift 2
        ;;
      --cos-customizer-image)
        COS_CUSTOMIZER="$2"
        shift 2
        ;;
      -m|--mode)
        MODE="$2"
        # Stop here on an unsupported mode instead of carrying on with it.
        if [[ "${MODE}" != "test" && "${MODE}" != "publish" ]]; then
          echo "Incorrect value for argument: 'mode'" 1>&2
          usage 1>&2
          exit 1
        fi
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      --)
        shift
        break
        ;;
      *)
        usage 1>&2
        exit 1
        ;;
    esac
  done
}
#######################################
# Checks that the mandatory settings were supplied.
# Globals:   PROJECT, MODE (read)
# Outputs:   the usage text when either value is empty
# Returns:   0 when both are non-empty, 1 otherwise
#######################################
validate_args() {
  if [[ -n "${PROJECT}" && -n "${MODE}" ]]; then
    return 0
  fi
  usage
  return 1
}
#######################################
# Prints the status field of a Cloud Build build.
# Globals:   PROJECT (read)
# Arguments: $1 - build ID
# Outputs:   whatever `gcloud builds describe` prints for value(status)
#######################################
get_build_status() {
  local -r id="$1"
  local -ra describe_args=(
    builds describe "${id}"
    "--project=${PROJECT}"
    "--format=value(status)"
  )
  gcloud "${describe_args[@]}"
}
#######################################
# Prints the log URL field of a Cloud Build build.
# Globals:   PROJECT (read)
# Arguments: $1 - build ID
# Outputs:   whatever `gcloud builds describe` prints for value(logUrl)
#######################################
get_log_url() {
  local -r id="$1"
  local -ra describe_args=(
    builds describe "${id}"
    "--project=${PROJECT}"
    "--format=value(logUrl)"
  )
  gcloud "${describe_args[@]}"
}
#######################################
# Submits one integration-test build asynchronously.
# Globals:   PROJECT, PROJECT_NAME, SERVICE_ACCOUNT, COS_CUSTOMIZER (read)
# Arguments: $1 - Cloud Build config file
#            $2 - zone passed to the build as _ZONE
# Outputs:   whatever gcloud prints for value(ID)
#######################################
start_test_build() {
  local -r build_config="$1"
  local -r target_zone="$2"
  # Assemble the comma-separated substitution list piece by piece.
  local substitutions="_PROJECT_NAME=${PROJECT_NAME}"
  substitutions+=",_SERVICE_ACCOUNT=${SERVICE_ACCOUNT}"
  substitutions+=",_COS_CUSTOMIZER=${COS_CUSTOMIZER}"
  substitutions+=",_ZONE=${target_zone}"
  gcloud builds submit \
    --config="${build_config}" \
    --project="${PROJECT}" \
    --substitutions="${substitutions}" \
    --async \
    --format='value(ID)' \
    .
}
#######################################
# Submits one image-publishing build asynchronously.
# Globals:   PROJECT, OUTPUT_PROJECT, TAG_NAME, BUILD_TYPE (read)
# Arguments: $1 - Cloud Build config file
# Outputs:   whatever gcloud prints for value(ID)
#######################################
start_publish_build() {
  local -r build_config="$1"
  # Assemble the comma-separated substitution list piece by piece.
  local substitutions="_OUTPUT_PROJECT=${OUTPUT_PROJECT}"
  substitutions+=",_TAG_NAME=${TAG_NAME}"
  substitutions+=",_BUILD_TYPE=${BUILD_TYPE}"
  gcloud builds submit \
    --config="${build_config}" \
    --project="${PROJECT}" \
    --substitutions="${substitutions}" \
    --async \
    --format='value(ID)' \
    .
}
#######################################
# Polls a build every 5 seconds until it reaches a terminal status.
# Arguments: $1 - build ID
# Outputs:   the terminal status on stdout; diagnostics on stderr
# Returns:   0 once a terminal status is printed, 1 when get_build_status
#            yields a value that is not a recognised status
#######################################
wait_for_build() {
  local -r build_id="$1"
  local status
  while true; do
    status="$(get_build_status "${build_id}")"
    case "${status}" in
      # EXPIRED is terminal as well: the build sat in the queue longer than
      # its queue TTL and will not run.
      "SUCCESS"|"FAILURE"|"INTERNAL_ERROR"|"TIMEOUT"|"CANCELLED"|"EXPIRED")
        echo "${status}"
        return 0
        ;;
      # PENDING is reported before a build has been queued; keep polling.
      "PENDING"|"QUEUED"|"WORKING")
        sleep 5
        ;;
      "STATUS_UNKNOWN")
        echo "Received STATUS_UNKNOWN for build ${build_id}" 1>&2
        sleep 5
        ;;
      *)
        echo "Unknown status for build ${build_id}: ${status}" 1>&2
        return 1
        ;;
    esac
  done
}
#######################################
# Reports whether a build's Cloud Logging entries mention a zone stockout
# (the text ZONE_RESOURCE_POOL_EXHAUSTED). The query is capped at one entry
# because a single hit is enough to answer the question.
# Globals:   PROJECT (read)
# Arguments: $1 - build ID
# Returns:   0 when a matching entry is found, 1 otherwise
#######################################
is_stockout() {
  local -r build_id="$1"
  local -r query="resource.type=\"build\" AND resource.labels.build_id=\"${build_id}\" AND SEARCH(\"ZONE_RESOURCE_POOL_EXHAUSTED\")"
  local match
  # gcloud's stderr is discarded; only the presence of output matters below.
  match="$(gcloud logging read "${query}" \
    --project="${PROJECT}" \
    --limit=1 \
    --format="value(insertId)" \
    --freshness=1d 2>/dev/null)"
  if [[ -z "${match}" ]]; then
    return 1
  fi
  return 0
}
# --- Execution Workflows ---
#######################################
# Submits a publish build for every src/cmd/*/cloudbuild.yaml config, waits
# for each one and reports the outcome.
# Arguments: none
# Outputs:   one result line per build on stdout (plus the log URL for a
#            failed build); diagnostics on stderr
# Returns:   0 when every build succeeded; 1 when any build failed, could
#            not be submitted, or no build was started at all
#######################################
run_publish_mode() {
  local -a build_ids=()
  # Loop variables are declared local so they do not leak into the caller.
  local config build_id status log_url
  local exit_code=0

  for config in src/cmd/*/cloudbuild.yaml; do
    # An unmatched glob stays as the literal pattern; do not submit it.
    [[ -e "${config}" ]] || continue
    # Record a failed submission and keep going, so the builds that did
    # start are still waited on and reported.
    if ! build_id="$(start_publish_build "${config}")" || [[ -z "${build_id}" ]]; then
      echo "Failed to submit build for ${config}" 1>&2
      exit_code=1
      continue
    fi
    build_ids+=("${build_id}")
  done

  if (( ${#build_ids[@]} == 0 )); then
    echo "No publish builds were started (src/cmd/*/cloudbuild.yaml)" 1>&2
    return 1
  fi

  echo
  for build_id in "${build_ids[@]}"; do
    # A polling error counts as a failure of this build only.
    status="$(wait_for_build "${build_id}")" || status="UNKNOWN"
    if [[ "${status}" == "SUCCESS" ]]; then
      echo "Build ${build_id} succeeded"
    else
      log_url="$(get_log_url "${build_id}")" || log_url=""
      echo "Build ${build_id} failed"
      echo "Logs: ${log_url}"
      exit_code=1
    fi
  done
  return "${exit_code}"
}
#######################################
# Runs every testing/*.yaml config as a Cloud Build integration test.
#
# Builds are submitted in rounds. A build that failed because of a zone
# stockout is resubmitted in the next round, up to MAX_TEST_ATTEMPTS
# submissions per config; any other failure is final. Configs whose file
# name contains "gpu" (case-insensitive) draw their zone from GPU_ZONES,
# all others from STANDARD_ZONES. A summary is printed at the end.
#
# Globals:   GPU_ZONES, STANDARD_ZONES, MAX_TEST_ATTEMPTS (read)
# Arguments: none
# Outputs:   progress lines and the summary report on stdout; diagnostics
#            on stderr
# Returns:   0 when every config ended in SUCCESS, 1 otherwise (including
#            when no config file exists)
#######################################
run_test_mode() {
  local exit_code=0
  local round=1
  # Everything is function-local so no state leaks into the caller.
  local config build_id zone status log_url attempt
  local -a active_configs=(testing/*.yaml)
  local -a next_active_configs=()
  local -a target_zones=()
  local -a untried_zones=()
  local -a sorted_configs=()
  local -A attempts_map=()
  local -A tried_zones_map=()
  local -A history_map=()
  local -A final_status_map=()
  local -A current_builds=()

  # An unmatched glob stays as the literal pattern; treat that as "no tests".
  if [[ ! -e "${active_configs[0]}" ]]; then
    echo "No test configs found matching testing/*.yaml" 1>&2
    return 1
  fi

  # Initialize state
  for config in "${active_configs[@]}"; do
    attempts_map["$config"]=0
    # Space-delimited list of zones already used, e.g. " zone-a zone-b ".
    tried_zones_map["$config"]=" "
    history_map["$config"]=""
    final_status_map["$config"]="PENDING"
  done

  # Loop until no configs require retries
  while (( ${#active_configs[@]} > 0 )); do
    echo "=== Starting Test Round #${round} (${#active_configs[@]} builds) ==="
    # Start each round empty so builds from earlier rounds are not re-evaluated.
    current_builds=()
    next_active_configs=()

    for config in "${active_configs[@]}"; do
      # ${config,,} lowercases the name, making the "gpu" match case-insensitive.
      if [[ "${config,,}" == *"gpu"* ]]; then
        target_zones=("${GPU_ZONES[@]}")
      else
        target_zones=("${STANDARD_ZONES[@]}")
      fi

      # Prefer zones this config has not used yet, so that a stockout retry
      # really lands in a different zone. Fall back to the whole pool once
      # every zone has been tried.
      untried_zones=()
      for zone in "${target_zones[@]}"; do
        if [[ "${tried_zones_map[$config]}" != *" ${zone} "* ]]; then
          untried_zones+=("${zone}")
        fi
      done
      if (( ${#untried_zones[@]} == 0 )); then
        untried_zones=("${target_zones[@]}")
      fi
      zone="${untried_zones[RANDOM % ${#untried_zones[@]}]}"
      tried_zones_map["$config"]+="${zone} "

      attempt=$(( ${attempts_map[$config]} + 1 ))
      attempts_map["$config"]="${attempt}"
      echo "Triggering ${config} -> Zone: ${zone} (Attempt #${attempt})"

      # Without a build ID there is nothing to wait on; record the failure
      # rather than indexing current_builds with an empty key.
      if ! build_id="$(start_test_build "${config}" "${zone}")" || [[ -z "${build_id}" ]]; then
        echo "Failed to submit build for ${config}" 1>&2
        history_map["$config"]+="#${attempt} run: failed [build submission error]"$'\n'
        final_status_map["$config"]="FAILURE"
        exit_code=1
        continue
      fi
      current_builds["$build_id"]="$config"
    done

    echo "Waiting for round #${round} builds to complete..."
    echo

    for build_id in "${!current_builds[@]}"; do
      config="${current_builds[$build_id]}"
      attempt="${attempts_map[$config]}"
      # A polling error fails this build only; the summary is still printed.
      status="$(wait_for_build "${build_id}")" || status="UNKNOWN"
      log_url="$(get_log_url "${build_id}")" || log_url=""
      echo "Build ${build_id} for config ${config} completed with status: ${status}"

      if [[ "${status}" == "SUCCESS" ]]; then
        history_map["$config"]+="#${attempt} run: Build succeeded"$'\n'
        final_status_map["$config"]="SUCCESS"
      elif is_stockout "${build_id}"; then
        if (( attempt < MAX_TEST_ATTEMPTS )); then
          history_map["$config"]+="#${attempt} run: failed stockout: \"Logs: ${log_url}\""$'\n'
          echo "Detected zone stockout for ${config} on attempt #${attempt}. See ${log_url}. Retrying in a new zone..."
          next_active_configs+=("$config")
        else
          history_map["$config"]+="#${attempt} run: failed stockout (max retries exhausted): \"Logs: ${log_url}\""$'\n'
          final_status_map["$config"]="FAILURE"
          exit_code=1
        fi
      else
        # Failures that are not stockouts are final; no retry.
        history_map["$config"]+="#${attempt} run: failed [other issues]: \"Logs: ${log_url}\""$'\n'
        final_status_map["$config"]="FAILURE"
        exit_code=1
      fi
    done

    # Only the configs queued for a retry take part in the next round.
    active_configs=("${next_active_configs[@]}")
    round=$(( round + 1 ))
  done

  # --- Display Final Report ---
  echo
  echo "========================================"
  echo " TEST BUILD SUMMARY REPORT "
  echo "========================================"
  # Sort configurations alphabetically for stable summary reading
  readarray -t sorted_configs < <(printf '%s\n' "${!history_map[@]}" | sort)
  for config in "${sorted_configs[@]}"; do
    echo "${config} -> Status: ${final_status_map[$config]}"
    # The attempt history is only shown for configs that did not succeed.
    if [[ "${final_status_map[$config]}" != "SUCCESS" ]]; then
      printf "%s" "${history_map[$config]}"
    fi
  done
  return "${exit_code}"
}
#######################################
# Script driver: parses and validates the options, then runs the workflow
# selected by MODE.
# Globals:   PROJECT, MODE (read); PROJECT_NAME (written)
# Arguments: the script's command-line arguments ("$@")
# Returns:   the status of the workflow that was run
#######################################
main() {
  parse_arguments "$@"
  validate_args
  printf '%s\n' "Starting builds in project ${PROJECT} in mode ${MODE}"
  # PROJECT_NAME is PROJECT with every "google.com:" replaced by "elgoog_com_".
  PROJECT_NAME="${PROJECT//google.com:/elgoog_com_}"
  case "${MODE}" in
    publish)
      run_publish_mode
      ;;
    *)
      run_test_mode
      ;;
  esac
}
# Script entry point: hand every command-line argument to main.
main "$@"