quick-provision: Handle ssh errors more gracefully.

There is a race upon reboot of the quick provision pass where the reboot
can occur and terminate the ssh connection resulting in an ssh failure.
Capture that failure and use the COMPLETED keyval to determine if the
error should be ignored.

BUG=chromium:817955
TEST=~autotest/contrib/loadtest.py DEVSERVER config-cyan.json -t 1 -s 1 --ping

Change-Id: I8f633b37581580483dce9970525857d1e828c229
Reviewed-on: https://chromium-review.googlesource.com/944564
Commit-Ready: David Riley <davidriley@chromium.org>
Tested-by: David Riley <davidriley@chromium.org>
Reviewed-by: Congbin Guo <guocb@chromium.org>
Reviewed-by: Don Garrett <dgarrett@chromium.org>
diff --git a/cros_update.py b/cros_update.py
index c1ec878..2025e4c 100644
--- a/cros_update.py
+++ b/cros_update.py
@@ -241,11 +241,25 @@
                dut_script, cros_build_lib.ShellQuote(status_url),
                self.build_name, self.static_url
            )
-    results = device.RunCommand(cmd, log_output=True, capture_output=True)
+    # Quick provision script issues a reboot and might result in the SSH
+    # connection being terminated so set ssh_error_ok so that output can
+    # still be captured.
+    results = device.RunCommand(cmd, log_output=True, capture_output=True,
+                                ssh_error_ok=True)
     key_re = re.compile(r'^KEYVAL: ([^\d\W]\w*)=(.*)$')
     matches = [key_re.match(l) for l in results.output.splitlines()]
     keyvals = {m.group(1): m.group(2) for m in matches if m}
     logging.info("DUT returned keyvals: %s", keyvals)
+
+    # If there was an SSH error, check the keyvals to see if it actually
+    # completed and suppress the error if so.
+    if results.returncode == remote_access.SSH_ERROR_CODE:
+      if 'COMPLETED' in keyvals:
+        logging.warning('Quick provision completed with ssh error, ignoring...')
+      else:
+        logging.error('Incomplete quick provision failed with ssh error')
+        raise remote_access.SSHConnectionError(results.error)
+
     return keyvals
 
   def TriggerAU(self):
@@ -298,7 +312,9 @@
           except (cros_build_lib.RunCommandError,
                   remote_access.SSHConnectionError,
                   auto_updater.RebootVerificationError) as e:
-            logging.warning('Error during quick provision, falling back: %s', e)
+            logging.warning(
+                'Error during quick provision, falling back to legacy: %s: %s',
+                type(e).__name__, e)
             time.sleep(QUICK_PROVISION_FAILURE_DELAY_SEC)
 
         if invoke_autoupdate:
@@ -350,6 +366,9 @@
           self._WriteAUStatus('post-check for CrOS auto-update')
           chromeos_AU.PostCheckCrOSUpdate()
           self._WriteAUStatus(cros_update_progress.FINISHED)
+
+        logging.debug('Provision successfully completed (%s)',
+                      'legacy' if invoke_autoupdate else 'quick provision')
     except Exception as e:
       logging.debug('Error happens in CrOS auto-update: %r', e)
       self._WriteAUStatus(CROS_ERROR_TEMPLATE % str(traceback.format_exc()))
diff --git a/quick-provision/quick-provision b/quick-provision/quick-provision
index 26b9e5d..4f69ac3 100644
--- a/quick-provision/quick-provision
+++ b/quick-provision/quick-provision
@@ -294,6 +294,7 @@
 
   # Record that quick provision is complete to avoid another attempt.
   echo "${build}" >"${COMPLETED_FILE}"
+  keyval "COMPLETED=${build}"
 
   # Reboot in the background, giving time for the ssh invocation to
   # cleanly terminate.