Add retry in downloading Android artifacts.

BUG=chromium:512668
TEST=local test

Change-Id: Ic084079fd9eac71ac4cca40a702f1337d9eca2aa
Reviewed-on: https://chromium-review.googlesource.com/312503
Commit-Ready: Dan Shi <dshi@google.com>
Tested-by: Dan Shi <dshi@google.com>
Reviewed-by: Dan Shi <dshi@google.com>
diff --git a/android_build.py b/android_build.py
index 597d8bb..dd1a931 100644
--- a/android_build.py
+++ b/android_build.py
@@ -9,13 +9,24 @@
 import apiclient
 import httplib2
 import io
+import subprocess
 
 from apiclient import discovery
 from oauth2client.client import SignedJwtAssertionCredentials
 
+import retry
+
+
 CREDENTIAL_SCOPE = 'https://www.googleapis.com/auth/androidbuild.internal'
 DEFAULT_BUILDER = 'androidbuildinternal'
 DEFAULT_CHUNKSIZE = 20*1024*1024
+# Maximum attempts to interact with Launch Control API.
+MAX_ATTEMPTS = 10
+# Timeout in minutes for downloading attempt.
+DOWNLOAD_TIMEOUT_MINS = 30
+# Timeout in minutes for API query.
+QUERY_TIMEOUT_MINS = 1
+
 
 class AndroidBuildFetchError(Exception):
   """Exception to raise when failed to make calls to Android build server."""
@@ -28,6 +39,7 @@
   credential_info = None
 
   @classmethod
+  @retry.retry(Exception, timeout_min=QUERY_TIMEOUT_MINS)
   def _GetServiceObject(cls):
     """Returns a service object with given credential information."""
     if not cls.credential_info:
@@ -37,8 +49,8 @@
         cls.credential_info['client_email'],
         cls.credential_info['private_key'], CREDENTIAL_SCOPE)
     http_auth = credentials.authorize(httplib2.Http())
-    service_obj = discovery.build(DEFAULT_BUILDER, 'v1', http=http_auth)
-    return service_obj
+    return discovery.build(DEFAULT_BUILDER, 'v1', http=http_auth)
+
 
   @classmethod
   def _VerifyBranch(cls, service_obj, branch, build_id, target):
@@ -56,7 +68,7 @@
     """
     builds = service_obj.build().list(
         buildType='submitted', branch=branch, buildId=build_id, target=target,
-        maxResults=0).execute()
+        maxResults=0).execute(num_retries=MAX_ATTEMPTS)
     if not builds:
       raise AndroidBuildFetchError(
           'Failed to locate build with branch %s, build id %s and target %s.' %
@@ -91,12 +103,13 @@
     # Get all artifacts for the given build_id and target.
     artifacts = service_obj.buildartifact().list(
         buildType='submitted', buildId=build_id, target=target,
-        attemptId='latest', maxResults=0).execute()
+        attemptId='latest', maxResults=0).execute(num_retries=MAX_ATTEMPTS)
     return artifacts['artifacts']
 
   @classmethod
+  @retry.retry(Exception, timeout_min=DOWNLOAD_TIMEOUT_MINS)
   def Download(cls, branch, build_id, target, resource_id, dest_file):
-    """Get the list of artifacts for given build id and target.
+    """Download an artifact of the given build id and target to dest_file.
 
     Args:
       branch: branch of the desired build.
@@ -108,6 +121,9 @@
     service_obj = cls._GetServiceObject()
     cls._VerifyBranch(service_obj, branch, build_id, target)
 
+    # Delete partially downloaded file if exists.
+    subprocess.call(['rm', '-rf', dest_file])
+
     # TODO(dshi): Add retry logic here to avoid API flakes.
     download_req = service_obj.buildartifact().get_media(
         buildType='submitted', buildId=build_id, target=target,
@@ -117,9 +133,12 @@
           fh, download_req, chunksize=DEFAULT_CHUNKSIZE)
       done = None
       while not done:
-        _, done = downloader.next_chunk()
+        _, done = downloader.next_chunk(num_retries=MAX_ATTEMPTS)
+
 
   @classmethod
+  @retry.retry(Exception, timeout_min=QUERY_TIMEOUT_MINS,
+               blacklist=[AndroidBuildFetchError])
   def GetLatestBuildID(cls, target, branch):
     """Get the latest build ID for the given target and branch.
 
@@ -134,7 +153,7 @@
     service_obj = cls._GetServiceObject()
     builds = service_obj.build().list(
         buildType='submitted', branch=branch, target=target, successful=True,
-        maxResults=1).execute()
+        maxResults=1).execute(num_retries=MAX_ATTEMPTS)
     if not builds or not builds['builds']:
       raise AndroidBuildFetchError(
           'Failed to locate build with branch %s and target %s.' %
diff --git a/retry.py b/retry.py
new file mode 100644
index 0000000..a0b51fb
--- /dev/null
+++ b/retry.py
@@ -0,0 +1,79 @@
+# Copyright 2015 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Basic infrastructure for implementing retries.
+
+This code is adapted from autotest: client/common_lib/cros/retry.py
+This implementation removes the per-call timeout feature as that requires the
+retry to be done in the main thread. For devserver, the call is handled in a
+thread kicked off by cherrypy, so that timeout can't be supported.
+"""
+
+from __future__ import print_function
+
+import cherrypy
+import random
+import sys
+import time
+
+
+def retry(ExceptionToCheck, timeout_min=1.0, delay_sec=3, blacklist=None):
+  """Retry calling the decorated function using a delay with jitter.
+
+  Exceptions in blacklist are raised from the decorated function without
+  retrying. Other exceptions matching ExceptionToCheck trigger a retry until
+  timeout_min has elapsed, after which the last exception is re-raised.
+
+  original from:
+    http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
+
+  Args:
+    ExceptionToCheck: the exception to check.  May be a tuple of exceptions to
+                      check.
+    timeout_min: timeout in minutes until giving up.
+    delay_sec: pre-jittered delay between retries in seconds.  Actual delays
+               will be centered around this value, ranging up to 50% off this
+               midpoint.
+    blacklist: a list of exceptions that will be raised without retrying
+  """
+  def deco_retry(func):
+    random.seed()
+
+    def delay():
+      """'Jitter' the delay, up to 50% in either direction."""
+      random_delay = random.uniform(.5 * delay_sec, 1.5 * delay_sec)
+      cherrypy.log('Retrying in %f seconds...' % random_delay)
+      time.sleep(random_delay)
+
+    def func_retry(*args, **kwargs):
+      # Used to cache exception to be raised later.
+      exc_info = None
+      delayed_enabled = False
+      exception_tuple = () if blacklist is None else tuple(blacklist)
+      start_time = time.time()
+      remaining_time = timeout_min * 60
+
+      while remaining_time > 0:
+        if delayed_enabled:
+          delay()
+        else:
+          delayed_enabled = True
+        try:
+          # Clear the cache
+          exc_info = None
+          return func(*args, **kwargs)
+        except exception_tuple:
+          raise
+        except ExceptionToCheck as e:
+          cherrypy.log('%s(%s)' % (e.__class__, e))
+          # Cache the exception to be raised later.
+          exc_info = sys.exc_info()
+
+          remaining_time = int(timeout_min*60 - (time.time() - start_time))
+
+      # Raise the cached exception with original backtrace.
+      raise exc_info[0], exc_info[1], exc_info[2]
+
+    return func_retry  # true decorator
+  return deco_retry