gce_au_worker: Shard tests across zones to protect against zonal outages
If for some reason an instance cannot be created in a particular zone,
retry in other randomly chosen zones (currently all in the same region), up to CREATE_INSTANCE_ATTEMPTS attempts in total.
BUG=b:64934461
TEST=trybot against lakitu-pre-cq
Change-Id: I6d1a3b300ae8c9c4c5ddd8171c494ecc587010b6
Reviewed-on: https://chromium-review.googlesource.com/630018
Commit-Ready: Daniel Wang <wonderfly@google.com>
Tested-by: Daniel Wang <wonderfly@google.com>
Reviewed-by: Prathmesh Prabhu <pprabhu@chromium.org>
(cherry picked from commit 01e72b496fe54c3180a22377e93c17c09d2079e1)
Reviewed-on: https://chromium-review.googlesource.com/633896
Reviewed-by: Aditya Kali <adityakali@google.com>
Commit-Queue: Daniel Wang <wonderfly@google.com>
diff --git a/au_test_harness/gce_au_worker.py b/au_test_harness/gce_au_worker.py
index ab39c65..93eafb6 100644
--- a/au_test_harness/gce_au_worker.py
+++ b/au_test_harness/gce_au_worker.py
@@ -8,6 +8,7 @@
import datetime
import os
+import random
import shutil
import tempfile
import time
@@ -24,6 +25,12 @@
# Information of the GCE project and default instance properties.
GCE_PROJECT = 'cros-autotest-bots'
+GCE_ALL_ZONES = (
+ 'us-central1-a',
+ 'us-central1-b',
+ 'us-central1-c',
+ 'us-central1-f',
+)
GCE_DEFAULT_ZONE = 'us-central1-a'
GCE_DEFAULT_NETWORK = 'network-prod'
GCE_DEFAULT_MACHINE_TYPE = 'n1-standard-8'
@@ -35,6 +42,8 @@
'https://www.googleapis.com/auth/logging.write',
'https://www.googleapis.com/auth/cloudimagemanagement',
)
+# Number of times to try until a GCE instance is created successfully.
+CREATE_INSTANCE_ATTEMPTS = 3
class GCEAUWorker(au_worker.AUWorker):
@@ -204,11 +213,22 @@
'scopes': self.instance_scopes,
},
]
- self.gce_context.CreateInstance(self.instance, self.image_link,
- machine_type=self.machine_type,
- network=self.network,
- static_address=self.address,
- serviceAccounts=service_accounts)
+ # Though rare, the create instance operation may fail due to zonal outages.
+ # Retry a couple of times for redundancy.
+ for i, zone in enumerate(random.sample(GCE_ALL_ZONES,
+ CREATE_INSTANCE_ATTEMPTS)):
+ try:
+ self.gce_context.zone = zone
+ self.gce_context.CreateInstance(self.instance, self.image_link,
+ machine_type=self.machine_type,
+ network=self.network,
+ static_address=self.address,
+ serviceAccounts=service_accounts)
+ except gce.Error as e:
+ logging.error('Failed to create instance [attempt %d/%d]: %r', i + 1,
+ CREATE_INSTANCE_ATTEMPTS, e)
+ else:
+ break
def _DeleteExistingResources(self):
"""Deletes all allocated GCP resources."""