run_builds.sh - cos/tools - Git at Google

 #!/bin/bash
 #
 # Copyright 2018 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 # Stop at the first failing command, and fail a pipeline when any stage fails.
 set -o errexit -o pipefail

 # Run settings; parse_arguments overwrites them from the command line.
 PROJECT=""
 MODE=""
 SERVICE_ACCOUNT="compute@cloud-image-docker-builder.google.com.iam.gserviceaccount.com"
 COS_CUSTOMIZER="cos_customizer"

 # Zones with approved GPU quotas
 readonly -a GPU_ZONES=(
   "us-central1-a"
   "us-central1-b"
   "us-central1-c"
   "us-central1-f"
 )
 # Broader pool of zones for standard CPU-only integration tests
 readonly -a STANDARD_ZONES=(
   "us-west1-a"
   "us-west1-b"
   "us-west1-c"
   "us-central1-a"
   "us-central1-b"
   "us-central1-c"
   "us-central1-f"
   "us-east1-b"
   "us-east1-c"
   "us-east1-d"
   "us-east4-a"
   "us-east4-b"
   "us-east4-c"
   "us-west2-a"
   "us-west2-b"
   "us-west2-c"
 )
 # Upper bound on attempts per test config (first run plus stockout retries).
 readonly MAX_TEST_ATTEMPTS=5

 # Prints the command-line help text to stdout.
 # The text is emitted with printf instead of a here-document: a here-document
 # terminator has to start at the very beginning of its line, so the original
 # `cat <<'EOF'` form stops terminating as soon as the block carries any
 # leading indentation.
 usage() {
   printf '%s\n' \
     'Usage: ./run_builds.sh [OPTIONS]' \
     'run_builds.sh runs multiple cloud builds and waits for the status of each' \
     'build and reports back failures. It is used in two ways in this repository:' \
     ' - To run  cos-customizer integration tests (20+ concurrent builds in' \
     'Cloud Build). Minimum quota requirements include ~10 K80s in us-west1.' \
     ' - To publish container images in src/cmd to AR (8+ concurrent builds)' \
     '' \
     '-p,--project=<project_name>                     GCP project to run tests in. Required.' \
     '--service-account=<service_account>             Service account to use in cos-customizer builds' \
     '--cos-customizer-image=<container_image_uri>    cos-customizer container image to test. By default, each tests compiles an image from source.' \
     '-m,--mode=<test|publish> Mode to run the scripts in. Use `test` for running cos-customizer integration tests and `publish` to publish images.'
 }

 # Parses the command line into the PROJECT, MODE, SERVICE_ACCOUNT and
 # COS_CUSTOMIZER globals.
 # Arguments: the script's command-line arguments ("$@")
 # Outputs:   usage text on stdout for -h/--help; errors and usage on stderr
 # Exits:     0 after -h/--help; 1 when the command line cannot be parsed or
 #            --mode is neither "test" nor "publish"
 parse_arguments() {
   # --help is a plain switch, so it carries no trailing ':' (a ':' would make
   # getopt demand an argument for it).
   local -r long_options="project:,mode:,help,service-account:,cos-customizer-image:"
   local parsed_args
   if ! parsed_args="$(getopt --options=p:m:h --longoptions="${long_options}" --name "$0" -- "$@")"; then
     usage >&2
     exit 1
   fi
   # getopt prints a normalized, shell-quoted argument list; eval re-reads that
   # quoting so values containing spaces survive.
   eval set -- "${parsed_args}"
   while true; do
     case "$1" in
       -p|--project)
         PROJECT="$2"
         shift 2
         ;;
       --service-account)
         SERVICE_ACCOUNT="$2"
         shift 2
         ;;
       --cos-customizer-image)
         COS_CUSTOMIZER="$2"
         shift 2
         ;;
       -m|--mode)
         MODE="$2"
         # Reject unknown modes here instead of letting the run continue.
         if [[ "${MODE}" != "test" && "${MODE}" != "publish" ]]; then
           echo "Incorrect value for argument: 'mode'" >&2
           usage >&2
           exit 1
         fi
         shift 2
         ;;
       -h|--help)
         usage
         exit 0
         ;;
       --)
         shift
         break
         ;;
       *)
         usage >&2
         exit 1
         ;;
     esac
   done
 }

 # Checks that the two required settings were supplied.
 # Globals:  PROJECT, MODE (read)
 # Returns:  0 when both are non-empty; otherwise prints usage and returns 1
 validate_args() {
   if [[ -n "${PROJECT}" && -n "${MODE}" ]]; then
     return 0
   fi
   usage
   return 1
 }

 # Runs `gcloud builds describe` for one build with --format='value(status)'
 # and passes gcloud's output through on stdout.
 # Globals:   PROJECT (read)
 # Arguments: $1 - build ID
 get_build_status() {
   local -r id="$1"
   local -a describe_args=(
     builds describe "${id}"
     --project="${PROJECT}"
     --format='value(status)'
   )
   gcloud "${describe_args[@]}"
 }

 # Runs `gcloud builds describe` for one build with --format='value(logUrl)'
 # and passes gcloud's output through on stdout.
 # Globals:   PROJECT (read)
 # Arguments: $1 - build ID
 get_log_url() {
   local -r id="$1"
   local -a describe_args=(
     builds describe "${id}"
     --project="${PROJECT}"
     --format='value(logUrl)'
   )
   gcloud "${describe_args[@]}"
 }

 # Submits one test build asynchronously with `gcloud builds submit`, using the
 # current directory as the source, and prints what gcloud emits for
 # --format='value(ID)'.
 # Globals:   PROJECT, PROJECT_NAME, SERVICE_ACCOUNT, COS_CUSTOMIZER (read)
 # Arguments: $1 - build config path
 #            $2 - zone, passed to the build as the _ZONE substitution
 start_test_build() {
   local -r cfg="$1"
   local -r target_zone="$2"
   # Assemble the comma-separated substitution list piece by piece.
   local subs="_PROJECT_NAME=${PROJECT_NAME}"
   subs+=",_SERVICE_ACCOUNT=${SERVICE_ACCOUNT}"
   subs+=",_COS_CUSTOMIZER=${COS_CUSTOMIZER}"
   subs+=",_ZONE=${target_zone}"
   gcloud builds submit \
     --config="${cfg}" \
     --project="${PROJECT}" \
     --substitutions="${subs}" \
     --async \
     --format='value(ID)' \
     .
 }

 # Submits one publish build asynchronously with `gcloud builds submit`, using
 # the current directory as the source, and prints what gcloud emits for
 # --format='value(ID)'.
 # Globals:   PROJECT, OUTPUT_PROJECT, TAG_NAME, BUILD_TYPE (read; the last
 #            three are not assigned anywhere in this script)
 # Arguments: $1 - build config path
 start_publish_build() {
   local -r cfg="$1"
   local subs="_OUTPUT_PROJECT=${OUTPUT_PROJECT}"
   subs+=",_TAG_NAME=${TAG_NAME}"
   subs+=",_BUILD_TYPE=${BUILD_TYPE}"
   gcloud builds submit \
     --config="${cfg}" \
     --project="${PROJECT}" \
     --substitutions="${subs}" \
     --async \
     --format='value(ID)' \
     .
 }

 # Polls a build until it reaches a terminal state and prints that state.
 # Arguments: $1 - build ID
 #            $2 - seconds to sleep between polls (optional, default 5)
 # Outputs:   the terminal status on stdout; diagnostics on stderr
 # Returns:   0 once a terminal status is seen, 1 on an unrecognized status
 wait_for_build() {
   local -r build_id="$1"
   local -r poll_interval="${2:-5}"
   local status
   while true; do
     status=$(get_build_status "${build_id}")
     case "${status}" in
       # EXPIRED is terminal as well: the build left the queue without running.
       "SUCCESS"|"FAILURE"|"INTERNAL_ERROR"|"TIMEOUT"|"CANCELLED"|"EXPIRED")
         echo "${status}"
         return 0
         ;;
       # PENDING precedes QUEUED; treating it as unknown would abort the wait
       # for a build that is still on its way.
       "PENDING"|"QUEUED"|"WORKING")
         sleep "${poll_interval}"
         ;;
       "STATUS_UNKNOWN")
         echo "Received STATUS_UNKNOWN for build ${build_id}" 1>&2
         sleep "${poll_interval}"
         ;;
       *)
         echo "Unknown status for build ${build_id}: ${status}" 1>&2
         return 1
         ;;
     esac
   done
 }

 # Asks Cloud Logging whether a build logged ZONE_RESOURCE_POOL_EXHAUSTED.
 # --limit=1 caps the query at one entry, since only presence matters here.
 # gcloud's stderr is discarded.
 # Globals:   PROJECT (read)
 # Arguments: $1 - build ID
 # Returns:   0 when the query printed anything, non-zero otherwise
 is_stockout() {
   local -r id="$1"
   local -r query="resource.type=\"build\" AND resource.labels.build_id=\"${id}\" AND SEARCH(\"ZONE_RESOURCE_POOL_EXHAUSTED\")"

   local match
   match=$(gcloud logging read "${query}" \
     --project="${PROJECT}" \
     --limit=1 \
     --format="value(insertId)" \
     --freshness=1d 2>/dev/null)

   [[ -n "${match}" ]]
 }

 # --- Execution Workflows ---

 # Submits one publish build per src/cmd/*/cloudbuild.yaml, waits for each of
 # them and reports the outcome.
 # Outputs:  one result line per build, plus the log URL of every failed build
 # Returns:  0 when every build reports SUCCESS; 1 otherwise, including when no
 #           config file matches
 run_publish_mode() {
   local -a build_ids=()
   local config build_id status log_url
   local exit_code=0

   for config in src/cmd/*/cloudbuild.yaml; do
     # Without nullglob an unmatched pattern is handed over as a literal string.
     [[ -e "${config}" ]] || continue
     build_ids+=("$(start_publish_build "${config}")")
   done
   if (( ${#build_ids[@]} == 0 )); then
     echo "No build configs found matching src/cmd/*/cloudbuild.yaml" >&2
     return 1
   fi

   echo
   for build_id in "${build_ids[@]}"; do
     # A polling error counts as a failure of this build only, so the
     # remaining builds still get reported.
     status="$(wait_for_build "${build_id}")" || status="UNKNOWN"
     if [[ "${status}" == "SUCCESS" ]]; then
       echo "Build ${build_id} succeeded"
     else
       log_url="$(get_log_url "${build_id}")" || log_url=""
       echo "Build ${build_id} failed"
       echo "Logs: ${log_url}"
       exit_code=1
     fi
   done
   return "${exit_code}"
 }

 # Runs every testing/*.yaml config as a test build, retries builds that failed
 # because of a zone stockout, and prints a per-config summary at the end.
 # Configs whose path contains "gpu" (any case) draw zones from GPU_ZONES; all
 # others draw from STANDARD_ZONES. A retry avoids the zone used by the previous
 # attempt whenever the pool offers an alternative.
 # Globals:   GPU_ZONES, STANDARD_ZONES, MAX_TEST_ATTEMPTS (read)
 # Outputs:   progress lines and the summary on stdout; errors on stderr
 # Returns:   0 when every config ends in SUCCESS; 1 otherwise, including when
 #            no config file matches
 run_test_mode() {
   local exit_code=0
   local round=1
   local config build_id zone attempt status log_url
   local -a active_configs=(testing/*.yaml)
   local -a next_active_configs=() zone_pool=() candidates=() sorted_configs=()
   # Per-config state, keyed by config path.
   local -A attempts_map=() history_map=() final_status_map=() last_zone_map=()
   # Build ID -> config path for the round currently in flight.
   local -A current_builds=()

   # Without nullglob an unmatched pattern stays in the array as a literal.
   if [[ ! -e "${active_configs[0]}" ]]; then
     echo "No test configs found matching testing/*.yaml" >&2
     return 1
   fi

   for config in "${active_configs[@]}"; do
     attempts_map["$config"]=0
     history_map["$config"]=""
     final_status_map["$config"]="PENDING"
     last_zone_map["$config"]=""
   done

   # Each round submits every config still awaiting a result and then waits for
   # all of them; the loop ends once no config asks for a retry.
   while (( ${#active_configs[@]} > 0 )); do
     echo "=== Starting Test Round #${round} (${#active_configs[@]} builds) ==="

     current_builds=()
     next_active_configs=()

     for config in "${active_configs[@]}"; do
       # ${config,,} lower-cases the path so the match ignores case.
       if [[ "${config,,}" == *"gpu"* ]]; then
         zone_pool=("${GPU_ZONES[@]}")
       else
         zone_pool=("${STANDARD_ZONES[@]}")
       fi

       # Leave out the zone of the previous attempt so a stockout retry really
       # lands somewhere else; fall back to the whole pool if nothing is left.
       candidates=()
       for zone in "${zone_pool[@]}"; do
         if [[ "${zone}" != "${last_zone_map[$config]}" ]]; then
           candidates+=("${zone}")
         fi
       done
       if (( ${#candidates[@]} == 0 )); then
         candidates=("${zone_pool[@]}")
       fi
       zone="${candidates[RANDOM % ${#candidates[@]}]}"
       last_zone_map["$config"]="${zone}"

       attempts_map["$config"]=$(( ${attempts_map[$config]} + 1 ))
       attempt="${attempts_map[$config]}"

       echo "Triggering ${config} -> Zone: ${zone} (Attempt #${attempt})"
       build_id=$(start_test_build "${config}" "${zone}") || build_id=""
       if [[ -z "${build_id}" ]]; then
         # No build ID means nothing to wait for; record the failure and move on.
         echo "Failed to submit build for ${config}" >&2
         history_map["$config"]+="#${attempt} run: failed to submit build"$'\n'
         final_status_map["$config"]="FAILURE"
         exit_code=1
         continue
       fi
       current_builds["$build_id"]="$config"
     done

     echo "Waiting for round #${round} builds to complete..."
     echo

     # Builds run concurrently in Cloud Build; they are waited on one by one.
     for build_id in "${!current_builds[@]}"; do
       config="${current_builds[$build_id]}"
       attempt="${attempts_map[$config]}"

       # A polling error is recorded as a failure of this build only.
       status=$(wait_for_build "${build_id}") || status="UNKNOWN"
       echo "Build ${build_id} for config ${config} completed with status: ${status}"

       if [[ "${status}" == "SUCCESS" ]]; then
         history_map["$config"]+="#${attempt} run: Build succeeded"$'\n'
         final_status_map["$config"]="SUCCESS"
         continue
       fi

       # The log URL is only reported for failures, so fetch it only now.
       log_url=$(get_log_url "${build_id}") || log_url=""

       if ! is_stockout "${build_id}"; then
         # Legitimate non-stockout failures terminate immediately
         history_map["$config"]+="#${attempt} run: failed [other issues]: \"Logs: ${log_url}\""$'\n'
         final_status_map["$config"]="FAILURE"
         exit_code=1
       elif (( attempt < MAX_TEST_ATTEMPTS )); then
         history_map["$config"]+="#${attempt} run: failed stockout: \"Logs: ${log_url}\""$'\n'
         echo "Detected zone stockout for ${config} on attempt #${attempt}. See ${log_url}. Retrying in a new zone..."
         next_active_configs+=("$config")
       else
         history_map["$config"]+="#${attempt} run: failed stockout (max retries exhausted): \"Logs: ${log_url}\""$'\n'
         final_status_map["$config"]="FAILURE"
         exit_code=1
       fi
     done

     active_configs=("${next_active_configs[@]}")
     round=$((round + 1))
   done

   # --- Display Final Report ---
   echo
   echo "========================================"
   echo "        TEST BUILD SUMMARY REPORT       "
   echo "========================================"

   # Sort configurations alphabetically for stable summary reading
   readarray -t sorted_configs < <(printf '%s\n' "${!history_map[@]}" | sort)

   for config in "${sorted_configs[@]}"; do
     echo "${config} -> Status: ${final_status_map[$config]}"

     # Only print the accumulated attempt history if the build ultimately failed
     if [[ "${final_status_map[$config]}" != "SUCCESS" ]]; then
       printf "%s" "${history_map[$config]}"
     fi
   done

   return "${exit_code}"
 }

 # Script entry point: parses and validates the command line, derives
 # PROJECT_NAME, then runs the workflow selected by MODE.
 # Globals:   PROJECT, MODE (read); PROJECT_NAME (written)
 # Arguments: the script's command-line arguments
 # Returns:   the status of the selected workflow; 1 when validation fails or
 #            MODE is neither "publish" nor "test"
 main() {
   parse_arguments "$@"
   validate_args || return 1

   # Refuse unknown modes explicitly rather than falling through to test mode.
   if [[ "${MODE}" != "publish" && "${MODE}" != "test" ]]; then
     echo "Unsupported mode: '${MODE}'" >&2
     usage >&2
     return 1
   fi

   echo "Starting builds in project ${PROJECT} in mode ${MODE}"
   # Every "google.com:" in the project ID becomes "elgoog_com_"; test builds
   # receive the result as the _PROJECT_NAME substitution.
   PROJECT_NAME=${PROJECT//google.com:/elgoog_com_}

   if [[ "${MODE}" == 'publish' ]]; then
     run_publish_mode
   else
     run_test_mode
   fi
 }

 # Run the script, forwarding every command-line argument to main.
 main "$@"
	#!/bin/bash
	#
	# Copyright 2018 Google LLC
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# Stop at the first failing command, and fail a pipeline when any stage fails.
	set -o errexit -o pipefail

	# Run settings; parse_arguments overwrites them from the command line.
	PROJECT=""
	MODE=""
	SERVICE_ACCOUNT="compute@cloud-image-docker-builder.google.com.iam.gserviceaccount.com"
	COS_CUSTOMIZER="cos_customizer"

	# Zones with approved GPU quotas
	readonly -a GPU_ZONES=(
	  "us-central1-a"
	  "us-central1-b"
	  "us-central1-c"
	  "us-central1-f"
	)
	# Broader pool of zones for standard CPU-only integration tests
	readonly -a STANDARD_ZONES=(
	  "us-west1-a"
	  "us-west1-b"
	  "us-west1-c"
	  "us-central1-a"
	  "us-central1-b"
	  "us-central1-c"
	  "us-central1-f"
	  "us-east1-b"
	  "us-east1-c"
	  "us-east1-d"
	  "us-east4-a"
	  "us-east4-b"
	  "us-east4-c"
	  "us-west2-a"
	  "us-west2-b"
	  "us-west2-c"
	)
	# Upper bound on attempts per test config (first run plus stockout retries).
	readonly MAX_TEST_ATTEMPTS=5

	# Prints the command-line help text to stdout.
	# The text is emitted with printf instead of a here-document: the terminator
	# of `cat <<'EOF'` has to start at the very beginning of its line, so the
	# tab-indented EOF here never closed the here-document. The mode placeholder
	# is written with a plain '|' rather than the escaped '\|'.
	usage() {
	  printf '%s\n' \
	    'Usage: ./run_builds.sh [OPTIONS]' \
	    'run_builds.sh runs multiple cloud builds and waits for the status of each' \
	    'build and reports back failures. It is used in two ways in this repository:' \
	    '- To run cos-customizer integration tests (20+ concurrent builds in' \
	    'Cloud Build). Minimum quota requirements include ~10 K80s in us-west1.' \
	    '- To publish container images in src/cmd to AR (8+ concurrent builds)' \
	    '' \
	    '-p,--project=<project_name> GCP project to run tests in. Required.' \
	    '--service-account=<service_account> Service account to use in cos-customizer builds' \
	    '--cos-customizer-image=<container_image_uri> cos-customizer container image to test. By default, each tests compiles an image from source.' \
	    '-m,--mode=<test|publish> Mode to run the scripts in. Use `test` for running cos-customizer integration tests and `publish` to publish images.'
	}

	# Parses the command line into the PROJECT, MODE, SERVICE_ACCOUNT and
	# COS_CUSTOMIZER globals.
	# Arguments: the script's command-line arguments ("$@")
	# Outputs:   usage text on stdout for -h/--help; errors and usage on stderr
	# Exits:     0 after -h/--help; 1 when the command line cannot be parsed or
	#            --mode is neither "test" nor "publish"
	parse_arguments() {
	  # --help is a plain switch, so it carries no trailing ':' (a ':' would make
	  # getopt demand an argument for it).
	  local -r long_options="project:,mode:,help,service-account:,cos-customizer-image:"
	  local parsed_args
	  if ! parsed_args="$(getopt --options=p:m:h --longoptions="${long_options}" --name "$0" -- "$@")"; then
	    usage >&2
	    exit 1
	  fi
	  # getopt prints a normalized, shell-quoted argument list; eval re-reads that
	  # quoting so values containing spaces survive.
	  eval set -- "${parsed_args}"
	  while true; do
	    # Alternatives in a case pattern are separated by an unescaped '|'.
	    case "$1" in
	      -p|--project)
	        PROJECT="$2"
	        shift 2
	        ;;
	      --service-account)
	        SERVICE_ACCOUNT="$2"
	        shift 2
	        ;;
	      --cos-customizer-image)
	        COS_CUSTOMIZER="$2"
	        shift 2
	        ;;
	      -m|--mode)
	        MODE="$2"
	        # Reject unknown modes here instead of letting the run continue.
	        if [[ "${MODE}" != "test" && "${MODE}" != "publish" ]]; then
	          echo "Incorrect value for argument: 'mode'" >&2
	          usage >&2
	          exit 1
	        fi
	        shift 2
	        ;;
	      -h|--help)
	        usage
	        exit 0
	        ;;
	      --)
	        shift
	        break
	        ;;
	      *)
	        usage >&2
	        exit 1
	        ;;
	    esac
	  done
	}

	# Checks that the two required settings were supplied.
	# The original condition used an escaped `\|\|`, which is not an operator
	# inside [[ ]]; the check is written here without any "or" at all.
	# Globals:  PROJECT, MODE (read)
	# Returns:  0 when both are non-empty; otherwise prints usage and returns 1
	validate_args() {
	  if [[ -n "${PROJECT}" && -n "${MODE}" ]]; then
	    return 0
	  fi
	  usage
	  return 1
	}

	# Runs `gcloud builds describe` for one build with --format='value(status)'
	# and passes gcloud's output through on stdout.
	# Globals:   PROJECT (read)
	# Arguments: $1 - build ID
	get_build_status() {
	  local -r id="$1"
	  local -a describe_args=(
	    builds describe "${id}"
	    --project="${PROJECT}"
	    --format='value(status)'
	  )
	  gcloud "${describe_args[@]}"
	}

	# Runs `gcloud builds describe` for one build with --format='value(logUrl)'
	# and passes gcloud's output through on stdout.
	# Globals:   PROJECT (read)
	# Arguments: $1 - build ID
	get_log_url() {
	  local -r id="$1"
	  local -a describe_args=(
	    builds describe "${id}"
	    --project="${PROJECT}"
	    --format='value(logUrl)'
	  )
	  gcloud "${describe_args[@]}"
	}

	# Submits one test build asynchronously with `gcloud builds submit`, using the
	# current directory as the source, and prints what gcloud emits for
	# --format='value(ID)'.
	# Globals:   PROJECT, PROJECT_NAME, SERVICE_ACCOUNT, COS_CUSTOMIZER (read)
	# Arguments: $1 - build config path
	#            $2 - zone, passed to the build as the _ZONE substitution
	start_test_build() {
	  local -r cfg="$1"
	  local -r target_zone="$2"
	  # Assemble the comma-separated substitution list piece by piece.
	  local subs="_PROJECT_NAME=${PROJECT_NAME}"
	  subs+=",_SERVICE_ACCOUNT=${SERVICE_ACCOUNT}"
	  subs+=",_COS_CUSTOMIZER=${COS_CUSTOMIZER}"
	  subs+=",_ZONE=${target_zone}"
	  gcloud builds submit \
	    --config="${cfg}" \
	    --project="${PROJECT}" \
	    --substitutions="${subs}" \
	    --async \
	    --format='value(ID)' \
	    .
	}

	# Submits one publish build asynchronously with `gcloud builds submit`, using
	# the current directory as the source, and prints what gcloud emits for
	# --format='value(ID)'.
	# Globals:   PROJECT, OUTPUT_PROJECT, TAG_NAME, BUILD_TYPE (read; the last
	#            three are not assigned anywhere in this script)
	# Arguments: $1 - build config path
	start_publish_build() {
	  local -r cfg="$1"
	  local subs="_OUTPUT_PROJECT=${OUTPUT_PROJECT}"
	  subs+=",_TAG_NAME=${TAG_NAME}"
	  subs+=",_BUILD_TYPE=${BUILD_TYPE}"
	  gcloud builds submit \
	    --config="${cfg}" \
	    --project="${PROJECT}" \
	    --substitutions="${subs}" \
	    --async \
	    --format='value(ID)' \
	    .
	}

	# Polls a build until it reaches a terminal state and prints that state.
	# Arguments: $1 - build ID
	#            $2 - seconds to sleep between polls (optional, default 5)
	# Outputs:   the terminal status on stdout; diagnostics on stderr
	# Returns:   0 once a terminal status is seen, 1 on an unrecognized status
	wait_for_build() {
	  local -r build_id="$1"
	  local -r poll_interval="${2:-5}"
	  local status
	  while true; do
	    status=$(get_build_status "${build_id}")
	    # Pattern alternatives need an unescaped '|'; with '\|' an arm only
	    # matches the literal joined string and every real status looks unknown.
	    case "${status}" in
	      # EXPIRED is terminal as well: the build left the queue without running.
	      "SUCCESS"|"FAILURE"|"INTERNAL_ERROR"|"TIMEOUT"|"CANCELLED"|"EXPIRED")
	        echo "${status}"
	        return 0
	        ;;
	      # PENDING precedes QUEUED; treating it as unknown would abort the wait
	      # for a build that is still on its way.
	      "PENDING"|"QUEUED"|"WORKING")
	        sleep "${poll_interval}"
	        ;;
	      "STATUS_UNKNOWN")
	        echo "Received STATUS_UNKNOWN for build ${build_id}" 1>&2
	        sleep "${poll_interval}"
	        ;;
	      *)
	        echo "Unknown status for build ${build_id}: ${status}" 1>&2
	        return 1
	        ;;
	    esac
	  done
	}

	# Asks Cloud Logging whether a build logged ZONE_RESOURCE_POOL_EXHAUSTED.
	# --limit=1 caps the query at one entry, since only presence matters here.
	# gcloud's stderr is discarded.
	# Globals:   PROJECT (read)
	# Arguments: $1 - build ID
	# Returns:   0 when the query printed anything, non-zero otherwise
	is_stockout() {
	  local -r id="$1"
	  local -r query="resource.type=\"build\" AND resource.labels.build_id=\"${id}\" AND SEARCH(\"ZONE_RESOURCE_POOL_EXHAUSTED\")"

	  local match
	  match=$(gcloud logging read "${query}" \
	    --project="${PROJECT}" \
	    --limit=1 \
	    --format="value(insertId)" \
	    --freshness=1d 2>/dev/null)

	  [[ -n "${match}" ]]
	}

	# --- Execution Workflows ---

	# Submits one publish build per src/cmd/*/cloudbuild.yaml, waits for each of
	# them and reports the outcome.
	# Outputs:  one result line per build, plus the log URL of every failed build
	# Returns:  0 when every build reports SUCCESS; 1 otherwise, including when no
	#           config file matches
	run_publish_mode() {
	  local -a build_ids=()
	  local config build_id status log_url
	  local exit_code=0

	  for config in src/cmd/*/cloudbuild.yaml; do
	    # Without nullglob an unmatched pattern is handed over as a literal string.
	    [[ -e "${config}" ]] || continue
	    build_ids+=("$(start_publish_build "${config}")")
	  done
	  if (( ${#build_ids[@]} == 0 )); then
	    echo "No build configs found matching src/cmd/*/cloudbuild.yaml" >&2
	    return 1
	  fi

	  echo
	  for build_id in "${build_ids[@]}"; do
	    # A polling error counts as a failure of this build only, so the
	    # remaining builds still get reported.
	    status="$(wait_for_build "${build_id}")" || status="UNKNOWN"
	    if [[ "${status}" == "SUCCESS" ]]; then
	      echo "Build ${build_id} succeeded"
	    else
	      log_url="$(get_log_url "${build_id}")" || log_url=""
	      echo "Build ${build_id} failed"
	      echo "Logs: ${log_url}"
	      exit_code=1
	    fi
	  done
	  return "${exit_code}"
	}

	# Runs every testing/*.yaml config as a test build, retries builds that failed
	# because of a zone stockout, and prints a per-config summary at the end.
	# Configs whose path contains "gpu" (any case) draw zones from GPU_ZONES; all
	# others draw from STANDARD_ZONES. A retry avoids the zone used by the previous
	# attempt whenever the pool offers an alternative.
	# Globals:   GPU_ZONES, STANDARD_ZONES, MAX_TEST_ATTEMPTS (read)
	# Outputs:   progress lines and the summary on stdout; errors on stderr
	# Returns:   0 when every config ends in SUCCESS; 1 otherwise, including when
	#            no config file matches
	run_test_mode() {
	  local exit_code=0
	  local round=1
	  local config build_id zone attempt status log_url
	  local -a active_configs=(testing/*.yaml)
	  local -a next_active_configs=() zone_pool=() candidates=() sorted_configs=()
	  # Per-config state, keyed by config path.
	  local -A attempts_map=() history_map=() final_status_map=() last_zone_map=()
	  # Build ID -> config path for the round currently in flight.
	  local -A current_builds=()

	  # Without nullglob an unmatched pattern stays in the array as a literal.
	  if [[ ! -e "${active_configs[0]}" ]]; then
	    echo "No test configs found matching testing/*.yaml" >&2
	    return 1
	  fi

	  for config in "${active_configs[@]}"; do
	    attempts_map["$config"]=0
	    history_map["$config"]=""
	    final_status_map["$config"]="PENDING"
	    last_zone_map["$config"]=""
	  done

	  # Each round submits every config still awaiting a result and then waits for
	  # all of them; the loop ends once no config asks for a retry.
	  while (( ${#active_configs[@]} > 0 )); do
	    echo "=== Starting Test Round #${round} (${#active_configs[@]} builds) ==="

	    current_builds=()
	    next_active_configs=()

	    for config in "${active_configs[@]}"; do
	      # The surrounding '*' make this a substring match; a bare == "gpu"
	      # would only accept a path that is exactly "gpu". ${config,,}
	      # lower-cases the path so the match ignores case.
	      if [[ "${config,,}" == *"gpu"* ]]; then
	        zone_pool=("${GPU_ZONES[@]}")
	      else
	        zone_pool=("${STANDARD_ZONES[@]}")
	      fi

	      # Leave out the zone of the previous attempt so a stockout retry really
	      # lands somewhere else; fall back to the whole pool if nothing is left.
	      candidates=()
	      for zone in "${zone_pool[@]}"; do
	        if [[ "${zone}" != "${last_zone_map[$config]}" ]]; then
	          candidates+=("${zone}")
	        fi
	      done
	      if (( ${#candidates[@]} == 0 )); then
	        candidates=("${zone_pool[@]}")
	      fi
	      zone="${candidates[RANDOM % ${#candidates[@]}]}"
	      last_zone_map["$config"]="${zone}"

	      attempts_map["$config"]=$(( ${attempts_map[$config]} + 1 ))
	      attempt="${attempts_map[$config]}"

	      echo "Triggering ${config} -> Zone: ${zone} (Attempt #${attempt})"
	      build_id=$(start_test_build "${config}" "${zone}") || build_id=""
	      if [[ -z "${build_id}" ]]; then
	        # No build ID means nothing to wait for; record the failure and move on.
	        echo "Failed to submit build for ${config}" >&2
	        history_map["$config"]+="#${attempt} run: failed to submit build"$'\n'
	        final_status_map["$config"]="FAILURE"
	        exit_code=1
	        continue
	      fi
	      current_builds["$build_id"]="$config"
	    done

	    echo "Waiting for round #${round} builds to complete..."
	    echo

	    # Builds run concurrently in Cloud Build; they are waited on one by one.
	    for build_id in "${!current_builds[@]}"; do
	      config="${current_builds[$build_id]}"
	      attempt="${attempts_map[$config]}"

	      # A polling error is recorded as a failure of this build only.
	      status=$(wait_for_build "${build_id}") || status="UNKNOWN"
	      echo "Build ${build_id} for config ${config} completed with status: ${status}"

	      if [[ "${status}" == "SUCCESS" ]]; then
	        history_map["$config"]+="#${attempt} run: Build succeeded"$'\n'
	        final_status_map["$config"]="SUCCESS"
	        continue
	      fi

	      # The log URL is only reported for failures, so fetch it only now.
	      log_url=$(get_log_url "${build_id}") || log_url=""

	      if ! is_stockout "${build_id}"; then
	        # Legitimate non-stockout failures terminate immediately
	        history_map["$config"]+="#${attempt} run: failed [other issues]: \"Logs: ${log_url}\""$'\n'
	        final_status_map["$config"]="FAILURE"
	        exit_code=1
	      elif (( attempt < MAX_TEST_ATTEMPTS )); then
	        history_map["$config"]+="#${attempt} run: failed stockout: \"Logs: ${log_url}\""$'\n'
	        echo "Detected zone stockout for ${config} on attempt #${attempt}. See ${log_url}. Retrying in a new zone..."
	        next_active_configs+=("$config")
	      else
	        history_map["$config"]+="#${attempt} run: failed stockout (max retries exhausted): \"Logs: ${log_url}\""$'\n'
	        final_status_map["$config"]="FAILURE"
	        exit_code=1
	      fi
	    done

	    active_configs=("${next_active_configs[@]}")
	    round=$((round + 1))
	  done

	  # --- Display Final Report ---
	  echo
	  echo "========================================"
	  echo " TEST BUILD SUMMARY REPORT "
	  echo "========================================"

	  # Sort configurations alphabetically for stable summary reading. The pipe
	  # must be unescaped, otherwise '|' and 'sort' become printf arguments.
	  readarray -t sorted_configs < <(printf '%s\n' "${!history_map[@]}" | sort)

	  for config in "${sorted_configs[@]}"; do
	    echo "${config} -> Status: ${final_status_map[$config]}"

	    # Only print the accumulated attempt history if the build ultimately failed
	    if [[ "${final_status_map[$config]}" != "SUCCESS" ]]; then
	      printf "%s" "${history_map[$config]}"
	    fi
	  done

	  return "${exit_code}"
	}

	# Script entry point: parses and validates the command line, derives
	# PROJECT_NAME, then runs the workflow selected by MODE.
	# Globals:   PROJECT, MODE (read); PROJECT_NAME (written)
	# Arguments: the script's command-line arguments
	# Returns:   the status of the selected workflow; 1 when validation fails or
	#            MODE is neither "publish" nor "test"
	main() {
	  parse_arguments "$@"
	  validate_args || return 1

	  # Refuse unknown modes explicitly rather than falling through to test mode.
	  if [[ "${MODE}" != "publish" && "${MODE}" != "test" ]]; then
	    echo "Unsupported mode: '${MODE}'" >&2
	    usage >&2
	    return 1
	  fi

	  echo "Starting builds in project ${PROJECT} in mode ${MODE}"
	  # Every "google.com:" in the project ID becomes "elgoog_com_"; test builds
	  # receive the result as the _PROJECT_NAME substitution.
	  PROJECT_NAME=${PROJECT//google.com:/elgoog_com_}

	  if [[ "${MODE}" == 'publish' ]]; then
	    run_publish_mode
	  else
	    run_test_mode
	  fi
	}

	# Run the script, forwarding every command-line argument to main.
	main "$@"