| # Copyright (c) 2013 The Chromium OS Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Module containing the completion stages.""" |
| |
| from __future__ import print_function |
| |
| from chromite.cbuildbot import buildbucket_lib |
| from chromite.cbuildbot import chroot_lib |
| from chromite.cbuildbot import commands |
| from chromite.cbuildbot import prebuilts |
| from chromite.cbuildbot import relevant_changes |
| from chromite.cbuildbot import tree_status |
| from chromite.cbuildbot.stages import generic_stages |
| from chromite.cbuildbot.stages import sync_stages |
| from chromite.lib import builder_status_lib |
| from chromite.lib import clactions |
| from chromite.lib import config_lib |
| from chromite.lib import constants |
| from chromite.lib import cros_logging as logging |
| from chromite.lib import failures_lib |
| from chromite.lib import results_lib |
| |
| |
def GetBuilderSuccessMap(builder_run, overall_success):
  """Compute a pass/fail flag for a builder run and each of its children.

  A run that has boards and no child configs passes only if every one of
  its boards has the 'success' parallel run attribute set. Any other run
  (no boards, or with child configs) simply inherits |overall_success|.

  Args:
    builder_run: The builder run we wish to get the status of.
    overall_success: The overall status of the build.

  Returns:
    A dict, mapping the builder names to whether they succeeded.
  """
  results = {}
  for current in [builder_run] + builder_run.GetChildren():
    cfg = current.config
    if not cfg.boards or cfg.child_configs:
      # No per-board information is usable here; fall back to the overall
      # status of the build.
      results[cfg.name] = overall_success
      continue

    # Build the full list first so that every board is queried, then reduce.
    board_flags = [current.GetBoardRunAttrs(board).HasParallel('success')
                   for board in cfg.boards]
    results[cfg.name] = all(board_flags)
  return results
| |
| |
def CreateBuildFailureMessage(overlays, builder_name, dashboard_url,
                              max_reason_length=200):
  """Creates a message summarizing the failures.

  One detail line is produced per traceback recorded in
  results_lib.Results; if there are none, a generic 'cbuildbot failed'
  detail is used instead.

  Args:
    overlays: The overlays used for the build.
    builder_name: The name of the builder.
    dashboard_url: The URL of the build.
    max_reason_length: Maximum number of characters kept from each stage's
      stringified exception. Defaults to 200, the limit that used to be
      hard-coded here.

  Returns:
    A failures_lib.BuildFailureMessage object.
  """
  internal = overlays in [constants.PRIVATE_OVERLAYS,
                          constants.BOTH_OVERLAYS]
  details = []
  tracebacks = tuple(results_lib.Results.GetTracebacks())
  for x in tracebacks:
    if isinstance(x.exception, failures_lib.CompoundFailure):
      # We do not want the textual tracebacks included in the
      # stringified CompoundFailure instance because this will be
      # printed on the waterfall.
      ex_str = x.exception.ToSummaryString()
    else:
      ex_str = str(x.exception)
    # Truncate the displayed failure reason to |max_reason_length|
    # characters (200 by default).
    ex_str = ex_str[:max_reason_length]
    details.append('The %s stage failed: %s' % (x.failed_stage, ex_str))
  if not details:
    details = ['cbuildbot failed']

  # reason does not include builder name or URL. This is mainly for
  # populating the "failure message" column in the stats sheet.
  reason = ' '.join(details)
  details.append('in %s' % dashboard_url)
  msg = '%s: %s' % (builder_name, ' '.join(details))

  return failures_lib.BuildFailureMessage(msg, tracebacks, internal, reason,
                                          builder_name)
| |
| |
class ManifestVersionedSyncCompletionStage(
    generic_stages.ForgivingBuilderStage):
  """Stage that records board specific results for a unique manifest file."""

  option_name = 'sync'

  def __init__(self, builder_run, sync_stage, success, **kwargs):
    """Constructor.

    Args:
      builder_run: BuilderRun object, forwarded to the parent stage.
      sync_stage: The sync stage instance associated with this build.
      success: Boolean indicating whether the build succeeded.
      **kwargs: Forwarded untouched to the parent stage constructor.
    """
    super(ManifestVersionedSyncCompletionStage, self).__init__(
        builder_run, **kwargs)
    self.sync_stage = sync_stage
    self.success = success
    # Optional failure message; when set it is sent along with the status
    # in UpdateStatus.
    self.message = None

  def GetBuildFailureMessage(self):
    """Returns message summarizing the failures."""
    run = self._run
    run_config = run.config
    return CreateBuildFailureMessage(
        run_config.overlays, run_config.name, run.ConstructDashboardURL())

  def PerformStage(self):
    """Record the failure message (if any) and publish the build status."""
    if not self.success:
      self.message = self.GetBuildFailureMessage()

    if config_lib.IsPFQType(self._run.config.build_type):
      # PFQ-type builds do not update the manifest-versions status here.
      return

    # Update the pass/fail status in the manifest-versions
    # repo. Suite scheduler checks the build status to schedule
    # suites.
    manifest_manager = self._run.attrs.manifest_manager
    manifest_manager.UpdateStatus(
        success_map=GetBuilderSuccessMap(self._run, self.success),
        message=self.message,
        dashboard_url=self.ConstructDashboardURL())
| |
| |
class ImportantBuilderFailedException(failures_lib.StepFailure):
  """Exception thrown when an important build fails to build.

  Raised by MasterSlaveSyncCompletionStage.PerformStage when the set of
  failing, inflight or missing builders is considered fatal.
  """
| |
| |
class MasterSlaveSyncCompletionStage(ManifestVersionedSyncCompletionStage):
  """Stage that records whether we passed or failed to build/test manifest."""

  def __init__(self, *args, **kwargs):
    """Constructor; all arguments are forwarded to the parent stage."""
    super(MasterSlaveSyncCompletionStage, self).__init__(*args, **kwargs)
    # Cache of the statuses gathered by PerformStage; see GetSlaveStatuses.
    self._slave_statuses = {}
    self.buildbucket_client = self.GetBuildbucketClient()

  def _GetLocalBuildStatus(self):
    """Return the status for this build as a dictionary."""
    status = builder_status_lib.BuilderStatus.GetCompletedStatus(self.success)
    status_obj = builder_status_lib.BuilderStatus(status, self.message)
    return {self._bot_id: status_obj}

  def _GetSlaveBuildStatus(self, manager, build_id, db, builder_names,
                           timeout):
    """Return the statuses of slave builds.

    Args:
      manager: An instance of BuildSpecsManager.
      build_id: The build id of the master build.
      db: An instance of cidb.CIDBConnection.
      builder_names: A list of builder names (strings) of slave builds.
      timeout: Number of seconds to wait for the results.

    Returns:
      A build_config name-> status dictionary of build statuses
      (See BuildSpecsManager.GetBuildersStatus).
    """
    return manager.GetBuildersStatus(
        build_id,
        db,
        builder_names,
        timeout=timeout)

  def _FetchSlaveStatuses(self):
    """Fetch and return build status for slaves of this build.

    If this build is not a master then return just the status of this build.

    Returns:
      A dict of build_config name -> builder_status_lib.BuilderStatus objects,
      for all important slave build configs. Build configs that never started
      will have a builder_status_lib.BuilderStatus of MISSING.
    """
    # Wait for slaves if we're a master, in production or mock-production.
    # Otherwise just look at our own status.
    slave_statuses = self._GetLocalBuildStatus()
    if not self._run.config.master:
      # The slave build returns its own status.
      logging.warning('The build is not a master.')
    elif self._run.options.mock_slave_status or not self._run.options.debug:
      # The master build.
      builders = self._GetSlaveConfigs()
      builder_names = [b.name for b in builders]
      timeout = None
      build_id, db = self._run.GetCIDBHandle()
      if db:
        timeout = db.GetTimeToDeadline(build_id)
      if timeout is None:
        # Catch-all: This could happen if cidb is not setup, or the deadline
        # query fails.
        timeout = self._run.config.build_timeout

      if self._run.options.debug:
        # For debug runs, wait for three minutes to ensure most code
        # paths are executed.
        logging.info('Waiting for 3 minutes only for debug run. '
                     'Would have waited for %s seconds.', timeout)
        timeout = 3 * 60

      # An external manager, when present, takes precedence over the run's
      # own manifest manager.
      manager = self._run.attrs.manifest_manager
      if sync_stages.MasterSlaveLKGMSyncStage.external_manager:
        manager = sync_stages.MasterSlaveLKGMSyncStage.external_manager
      slave_statuses.update(self._GetSlaveBuildStatus(
          manager, build_id, db, builder_names, timeout))
    return slave_statuses

  def _HandleStageException(self, exc_info):
    """Decide whether an exception should be treated as fatal."""
    # Besides the master, the completion stages also run on slaves, to report
    # their status back to the master. If the build failed, they throw an
    # exception here. For slave builders, marking this stage 'red' would be
    # redundant, since the build itself would already be red. In this case,
    # report a warning instead.
    # pylint: disable=protected-access
    exc_type = exc_info[0]
    if (issubclass(exc_type, ImportantBuilderFailedException) and
        not self._run.config.master):
      return self._HandleExceptionAsWarning(exc_info)
    else:
      # In all other cases, exceptions should be treated as fatal. To
      # implement this, we bypass ForgivingStage and call
      # generic_stages.BuilderStage._HandleStageException explicitly.
      return generic_stages.BuilderStage._HandleStageException(self, exc_info)

  def HandleSuccess(self):
    """Handle a successful build.

    This function is called whenever the cbuildbot run is successful.
    For the master, this will only be called when all slave builders
    are also successful. This function may be overridden by subclasses.
    """
    # We only promote for the pfq, not chrome pfq.
    # TODO(build): Run this logic in debug mode too.
    if (not self._run.options.debug and
        config_lib.IsPFQType(self._run.config.build_type) and
        self._run.config.master and
        self._run.manifest_branch == 'master' and
        self._run.config.build_type != constants.CHROME_PFQ_TYPE):
      self._run.attrs.manifest_manager.PromoteCandidate()
      if sync_stages.MasterSlaveLKGMSyncStage.external_manager:
        sync_stages.MasterSlaveLKGMSyncStage.external_manager.PromoteCandidate()

  def HandleFailure(self, failing, inflight, no_stat):
    """Handle a build failure.

    This function is called whenever the cbuildbot run fails.
    For the master, this will be called when any slave fails or times
    out. This function may be overridden by subclasses.

    Args:
      failing: The names of the failing builders.
      inflight: The names of the builders that are still running.
      no_stat: Set of builder names of slave builders that had status None.
    """
    if failing or inflight or no_stat:
      logging.PrintBuildbotStepWarnings()

    if failing:
      logging.warning('\n'.join([
          'The following builders failed with this manifest:',
          ', '.join(sorted(failing)),
          'Please check the logs of the failing builders for details.']))

    if inflight:
      logging.warning('\n'.join([
          'The following builders took too long to finish:',
          ', '.join(sorted(inflight)),
          'Please check the logs of these builders for details.']))

    if no_stat:
      logging.warning('\n'.join([
          'The following builders did not start or failed prematurely:',
          ', '.join(sorted(no_stat)),
          'Please check the logs of these builders for details.']))

  def PerformStage(self):
    """Publish this build's status, then gather and act on slave statuses.

    Raises:
      ImportantBuilderFailedException: If _IsFailureFatal reports that the
        failing, inflight or missing builders make the build fail.
    """
    super(MasterSlaveSyncCompletionStage, self).PerformStage()

    # Upload our pass/fail status to Google Storage.
    self._run.attrs.manifest_manager.UploadStatus(
        success=self.success, message=self.message,
        dashboard_url=self.ConstructDashboardURL())

    statuses = self._FetchSlaveStatuses()
    self._slave_statuses = statuses
    # items() instead of the Python-2-only iteritems() so that this also
    # works when run under Python 3.
    no_stat = set(builder for builder, status in statuses.items()
                  if status.Missing())
    failing = set(builder for builder, status in statuses.items()
                  if status.Failed())
    inflight = set(builder for builder, status in statuses.items()
                   if status.Inflight())

    # If all the failing or inflight builders were sanity checkers
    # then ignore the failure.
    fatal = self._IsFailureFatal(failing, inflight, no_stat)

    if fatal:
      self._AnnotateFailingBuilders(failing, inflight, no_stat, statuses)
      self.HandleFailure(failing, inflight, no_stat)
      raise ImportantBuilderFailedException()
    else:
      self.HandleSuccess()

  def _IsFailureFatal(self, failing, inflight, no_stat):
    """Returns a boolean indicating whether the build should fail.

    Args:
      failing: Set of builder names of slave builders that failed.
      inflight: Set of builder names of slave builders that are inflight
      no_stat: Set of builder names of slave builders that had status None.

    Returns:
      True if any of the failing or inflight builders are not sanity check
      builders for this master, or if there were any non-sanity-check builders
      with status None.
    """
    sanity_builders = self._run.config.sanity_check_slaves or []
    sanity_builders = set(sanity_builders)
    return not sanity_builders.issuperset(failing | inflight | no_stat)

  def _AnnotateBuildStatusFromBuildbucket(self, no_stat):
    """Annotate the build statuses fetched from the Buildbucket.

    Some builds may fail to upload statuses to GS. If the builds were
    scheduled by Buildbucket, get the build statuses and annotate the results.

    Args:
      no_stat: Config names of the slave builds with None status.
    """
    buildbucket_info_dict = buildbucket_lib.GetBuildInfoDict(
        self._run.attrs.metadata)

    for config_name in no_stat:
      if config_name in buildbucket_info_dict:
        buildbucket_id = buildbucket_info_dict[config_name].buildbucket_id
        assert buildbucket_id is not None, 'buildbucket_id is None'
        try:
          content = self.buildbucket_client.GetBuildRequest(
              buildbucket_id, self._run.options.debug)

          status = buildbucket_lib.GetBuildStatus(content)
          result = buildbucket_lib.GetBuildResult(content)

          text = '%s: [status] %s [result] %s' % (config_name, status, result)

          # Append the failure/cancelation reason when Buildbucket has one.
          if result == constants.BUILDBUCKET_BUILDER_RESULT_FAILURE:
            failure_reason = buildbucket_lib.GetBuildFailureReason(content)
            if failure_reason:
              text += ' [failure_reason] %s' % failure_reason
          elif result == constants.BUILDBUCKET_BUILDER_RESULT_CANCELED:
            cancel_reason = buildbucket_lib.GetBuildCancelationReason(content)
            if cancel_reason:
              text += ' [cancelation_reason] %s' % cancel_reason

          dashboard_url = buildbucket_lib.GetBuildURL(content)
          if dashboard_url:
            logging.PrintBuildbotLink(text, dashboard_url)
          else:
            logging.PrintBuildbotStepText(text)
        except buildbucket_lib.BuildbucketResponseException as e:
          # Best effort: a failed lookup is logged and annotated, not raised.
          logging.error('Cannot get status for %s: %s', config_name, e)
          logging.PrintBuildbotStepText(
              'No status found for build %s buildbucket_id %s'
              % (config_name, buildbucket_id))
      else:
        logging.PrintBuildbotStepText('%s wasn\'t scheduled by master.'
                                      % config_name)

  def _AnnotateFailingBuilders(self, failing, inflight, no_stat, statuses):
    """Add annotations that link to failing and inflight builders.

    Adds buildbot links to the dashboards of both failing and inflight
    builders that have a dashboard_url. Adds step text for builders with
    status None.

    Args:
      failing: Set of builder names of slave builders that failed.
      inflight: Set of builder names of slave builders that are inflight.
      no_stat: Set of builder names of slave builders that had status None.
      statuses: A builder-name->status dictionary, which will provide
                the dashboard_url values for any links.
    """
    builders_to_link = set(failing) | set(inflight)
    for builder in builders_to_link:
      status = statuses[builder]
      if not status.dashboard_url:
        continue

      if status.message:
        text = '%s: %s' % (builder, status.message.reason)
      elif builder in inflight:
        text = '%s: timed out' % builder
      else:
        # A builder that failed without reporting a message did not time
        # out; label it the same way the alert emails do.
        text = '%s: failed with unknown reason' % builder

      logging.PrintBuildbotLink(text, status.dashboard_url)

    if no_stat:
      if config_lib.UseBuildbucketScheduler(self._run.config):
        self._AnnotateBuildStatusFromBuildbucket(no_stat)
      else:
        for builder in no_stat:
          logging.PrintBuildbotStepText('%s did not start.' % builder)

  def GetSlaveStatuses(self):
    """Returns cached slave status results.

    Cached results are populated during PerformStage, so this function
    should only be called after PerformStage has returned.

    Returns:
      A dictionary from build names to builder_status_lib.BuilderStatus
      builder status objects.
    """
    return self._slave_statuses

  def _GetFailedMessages(self, failing):
    """Gathers the BuildFailureMessages from the |failing| builders.

    Args:
      failing: Names of the builders that failed.

    Returns:
      A list of BuildFailureMessage or NoneType objects.
    """
    return [self._slave_statuses[x].message for x in failing]

  def _GetBuildersWithNoneMessages(self, failing):
    """Returns a list of failed builders with NoneType failure message.

    Args:
      failing: Names of the builders that failed.

    Returns:
      A list of builder names.
    """
    return [x for x in failing if self._slave_statuses[x].message is None]
| |
| |
class CanaryCompletionStage(MasterSlaveSyncCompletionStage):
  """Collect build slave statuses and handle the failures."""

  def HandleFailure(self, failing, inflight, no_stat):
    """Handle a build failure or timeout in the Canary builders.

    Args:
      failing: Names of the builders that failed.
      inflight: Names of the builders that timed out.
      no_stat: Set of builder names of slave builders that had status None.
    """
    # Print out the status about what builds failed or not.
    MasterSlaveSyncCompletionStage.HandleFailure(
        self, failing, inflight, no_stat)

    if self._run.config.master:
      self.CanaryMasterHandleFailure(failing, inflight, no_stat)

  def SendCanaryFailureAlert(self, failing, inflight, no_stat):
    """Send an alert email to summarize canary failures.

    Args:
      failing: The names of the failing builders.
      inflight: The names of the builders that are still running.
      no_stat: The names of the builders that had status None.
    """
    builder_name = 'Canary Master'
    title = '%s has detected build failures:' % builder_name
    # Skip None messages: those builders are reported just below as
    # "failed with unknown reason", so stringifying None here would only
    # add a meaningless "None" entry to the alert.
    msgs = [str(x) for x in self._GetFailedMessages(failing)
            if x is not None]
    slaves = self._GetBuildersWithNoneMessages(failing)
    msgs += ['%s failed with unknown reason.' % x for x in slaves]
    msgs += ['%s timed out' % x for x in inflight]
    msgs += ['%s did not start' % x for x in no_stat]
    msgs.insert(0, title)
    msgs.append('You can also view the summary of the slave failures from '
                'the %s stage of %s. Click on the failure message to go '
                'to an individual slave\'s build status page: %s' % (
                    self.name, builder_name, self.ConstructDashboardURL()))
    msg = '\n\n'.join(msgs)
    logging.warning(msg)
    extra_fields = {'X-cbuildbot-alert': 'canary-fail-alert'}
    tree_status.SendHealthAlert(self._run, 'Canary builder failures', msg,
                                extra_fields=extra_fields)

  def _ComposeTreeStatusMessage(self, failing, inflight, no_stat):
    """Composes a tree status message.

    Args:
      failing: Names of the builders that failed.
      inflight: Names of the builders that timed out.
      no_stat: Set of builder names of slave builders that had status None.

    Returns:
      A string with one '; '-separated entry per non-empty category.
    """
    slave_status_list = [
        ('did not start', list(no_stat)),
        ('timed out', list(inflight)),
        ('failed', list(failing)),]
    # Print maximum 2 slaves for each category to not clutter the
    # message.
    max_num = 2
    messages = []
    for status, slaves in slave_status_list:
      if not slaves:
        continue
      slaves_str = ','.join(slaves[:max_num])
      if len(slaves) <= max_num:
        messages.append('%s %s' % (slaves_str, status))
      else:
        messages.append('%s and %d others %s' % (slaves_str,
                                                 len(slaves) - max_num,
                                                 status))
    return '; '.join(messages)

  def CanaryMasterHandleFailure(self, failing, inflight, no_stat):
    """Handles the failure by sending out an alert email.

    The alert is only sent when the manifest branch is 'master'.

    Args:
      failing: Names of the builders that failed.
      inflight: Names of the builders that timed out.
      no_stat: Set of builder names of slave builders that had status None.
    """
    if self._run.manifest_branch == 'master':
      self.SendCanaryFailureAlert(failing, inflight, no_stat)
      # Note: We used to throttle the tree here. As of
      # https://chromium-review.googlesource.com/#/c/325821/ we no longer do.

  def _HandleStageException(self, exc_info):
    """Decide whether an exception should be treated as fatal."""
    # Canary master already updates the tree status for slave
    # failures. There is no need to mark this stage red. For slave
    # builders, the build itself would already be red. In this case,
    # report a warning instead.
    # pylint: disable=protected-access
    exc_type = exc_info[0]
    if issubclass(exc_type, ImportantBuilderFailedException):
      return self._HandleExceptionAsWarning(exc_info)
    else:
      # In all other cases, exceptions should be treated as fatal.
      return super(CanaryCompletionStage, self)._HandleStageException(exc_info)
| |
| |
class CommitQueueCompletionStage(MasterSlaveSyncCompletionStage):
  """Commits or reports errors to CL's that failed to be validated."""

  # These stages are required to have run at least once and to never have
  # failed, on each important slave. Otherwise, we may have incomplete
  # information on which CLs affect which builders, and thus skip all
  # board-aware submission.
  _CRITICAL_STAGES = ('CommitQueueSync',)

  def HandleSuccess(self):
    """Handle a successful Commit Queue run.

    On a master, submits the validation pool and, for PFQ build types,
    delegates to the parent HandleSuccess. Then stores the manifest
    manager's current version (when set) as the chroot version and records
    submission metrics.
    """
    if self._run.config.master:
      self.sync_stage.pool.SubmitPool(reason=constants.STRATEGY_CQ_SUCCESS)
      if config_lib.IsPFQType(self._run.config.build_type):
        super(CommitQueueCompletionStage, self).HandleSuccess()

    manager = self._run.attrs.manifest_manager
    version = manager.current_version
    if version:
      chroot_manager = chroot_lib.ChrootManager(self._build_root)
      chroot_manager.SetChrootVersion(version)

    self._RecordSubmissionMetrics()

  def HandleFailure(self, failing, inflight, no_stat):
    """Handle a build failure or timeout in the Commit Queue.

    This function performs any tasks that need to happen when the Commit Queue
    fails:
      - Abort the HWTests if necessary.
      - Push any CLs that indicate that they don't care about this failure.
      - Determine what CLs to reject.

    See MasterSlaveSyncCompletionStage.HandleFailure.

    Args:
      failing: Names of the builders that failed.
      inflight: Names of the builders that timed out.
      no_stat: Set of builder names of slave builders that had status None.
    """
    # Print out the status about what builds failed or not.
    MasterSlaveSyncCompletionStage.HandleFailure(
        self, failing, inflight, no_stat)

    if self._run.config.master:
      slave_buildbucket_ids = self.GetScheduledSlaveBuildbucketIds()
      self.CQMasterHandleFailure(
          failing, inflight, no_stat, slave_buildbucket_ids)

    self._RecordSubmissionMetrics()

  def _RecordSubmissionMetrics(self):
    """Record CL handling statistics for submitted changes in monarch.

    Does nothing on non-master builds or when no CIDB handle is available.
    """
    if not self._run.config.master:
      return

    build_id, db = self._run.GetCIDBHandle()
    if db:
      my_actions = db.GetActionsForBuild(build_id)
      my_submit_actions = [m for m in my_actions
                           if m.action == constants.CL_ACTION_SUBMITTED]
      # A dictionary mapping from every change that was submitted to the
      # submission reason.
      submitted_change_strategies = {m.patch : m.reason
                                     for m in my_submit_actions}
      # list(...) yields the keys as a real list on Python 2 and 3 alike.
      submitted_changes_all_actions = db.GetActionsForChanges(
          list(submitted_change_strategies))

      action_history = clactions.CLActionHistory(submitted_changes_all_actions)
      logging.info('Recording submission metrics about %s CLs to monarch.',
                   len(submitted_change_strategies))
      clactions.RecordSubmissionMetrics(action_history,
                                        submitted_change_strategies)

  def _ShouldSubmitPartialPool(self, slave_buildbucket_ids):
    """Determine whether we should attempt or skip SubmitPartialPool.

    Args:
      slave_buildbucket_ids: A list of buildbucket_ids (strings) of slave
        builds scheduled by Buildbucket.

    Returns:
      True if all important, non-sanity-check slaves ran and completed all
      critical stages, and hence it is safe to attempt SubmitPartialPool. False
      otherwise.
    """
    # sanity_check_slaves should not block board-aware submission, since they do
    # not actually apply test patches. The config value may be None, so guard
    # it the same way _IsFailureFatal and _ToTSanity do.
    sanity_check_slaves = set(self._run.config.sanity_check_slaves or [])
    all_slaves = set(x.name for x in self._GetSlaveConfigs())
    all_slaves -= sanity_check_slaves
    assert self._run.config.name not in all_slaves

    # Get slave stages.
    build_id, db = self._run.GetCIDBHandle()
    assert db, 'No database connection to use.'
    slave_stages = db.GetSlaveStages(
        build_id, buildbucket_ids=slave_buildbucket_ids)

    should_submit = True
    ACCEPTED_STATUSES = (constants.BUILDER_STATUS_PASSED,
                         constants.BUILDER_STATUS_SKIPPED,)

    # Configs that have passed critical stages.
    configs_per_stage = {stage: set() for stage in self._CRITICAL_STAGES}

    for stage in slave_stages:
      if (stage['name'] in self._CRITICAL_STAGES and
          stage['status'] in ACCEPTED_STATUSES):
        configs_per_stage[stage['name']].add(stage['build_config'])

    # Keep looping after the first miss so every incomplete stage is logged.
    for stage in self._CRITICAL_STAGES:
      missing_configs = all_slaves - configs_per_stage[stage]
      if missing_configs:
        logging.warning('Config(s) %s did not complete critical stage %s.',
                        ' '.join(missing_configs), stage)
        should_submit = False

    return should_submit

  def CQMasterHandleFailure(self, failing, inflight, no_stat,
                            slave_buildbucket_ids):
    """Handle changes in the validation pool upon build failure or timeout.

    This function determines whether to reject CLs and what CLs to
    reject based on the category of the failures and whether the
    sanity check builder(s) passed.

    Args:
      failing: Names of the builders that failed.
      inflight: Names of the builders that timed out.
      no_stat: Set of builder names of slave builders that had status None.
      slave_buildbucket_ids: A list of buildbucket_ids (strings) of slave builds
        scheduled by Buildbucket.
    """
    messages = self._GetFailedMessages(failing)
    self.SendInfraAlertIfNeeded(failing, inflight, no_stat)

    changes = self.sync_stage.pool.applied

    do_partial_submission = self._ShouldSubmitPartialPool(slave_buildbucket_ids)

    if do_partial_submission:
      build_id, db = self._run.GetCIDBHandle()
      changes_by_config = (
          relevant_changes.RelevantChanges.GetRelevantChangesForSlaves(
              build_id, db, self._run.config, changes, no_stat,
              slave_buildbucket_ids))
      subsys_by_config = (
          relevant_changes.RelevantChanges.GetSubsysResultForSlaves(
              build_id, db))

      # Even if there was a failure, we can submit the changes that indicate
      # that they don't care about this failure.
      changes = self.sync_stage.pool.SubmitPartialPool(
          changes, messages, changes_by_config, subsys_by_config,
          failing, inflight, no_stat)
    else:
      logging.warning('Not doing any partial submission, due to critical stage '
                      'failure(s).')
      title = 'CQ encountered a critical failure.'
      msg = ('CQ encountered a critical failure, and hence skipped '
             'board-aware submission. See %s' % self.ConstructDashboardURL())
      tree_status.SendHealthAlert(self._run, title, msg)

    # The config value may be None; treat that as "no sanity check slaves".
    sanity_check_slaves = set(self._run.config.sanity_check_slaves or [])
    tot_sanity = self._ToTSanity(sanity_check_slaves, self._slave_statuses)

    if not tot_sanity:
      # Sanity check slave failure may have been caused by bug(s)
      # in ToT or broken infrastructure. In any of those cases, we
      # should not reject any changes.
      logging.warning('Detected that a sanity-check builder failed. '
                      'Will not reject any changes.')

    # If the tree was not open when we acquired a pool, do not assume that
    # tot was sane.
    if not self.sync_stage.pool.tree_was_open:
      logging.info('The tree was not open when changes were acquired so we are '
                   'attributing failures to the broken tree rather than the '
                   'changes.')
      tot_sanity = False

    if inflight:
      # Some slave(s) timed out due to unknown causes, so only reject infra
      # changes (probably just chromite changes).
      self.sync_stage.pool.HandleValidationTimeout(sanity=tot_sanity,
                                                   changes=changes)
      return

    # Some builder failed, or some builder did not report stats, or
    # the intersection of both. Let HandleValidationFailure decide
    # what changes to reject.
    self.sync_stage.pool.HandleValidationFailure(
        messages, sanity=tot_sanity, changes=changes, no_stat=no_stat)

  def _GetInfraFailMessages(self, failing):
    """Returns a list of messages containing infra failures.

    Args:
      failing: The names of the failing builders.

    Returns:
      A list of BuildFailureMessage objects.
    """
    msgs = self._GetFailedMessages(failing)
    # Filter out None messages because we cannot analyze them.
    return [x for x in msgs if x and
            x.HasFailureType(failures_lib.InfrastructureFailure)]

  def SendInfraAlertIfNeeded(self, failing, inflight, no_stat):
    """Send infra alerts if needed.

    No alert is sent when there is nothing to report.

    Args:
      failing: The names of the failing builders.
      inflight: The names of the builders that are still running.
      no_stat: The names of the builders that had status None.
    """
    msgs = [str(x) for x in self._GetInfraFailMessages(failing)]
    # Failing to report a non-None message is an infra failure.
    slaves = self._GetBuildersWithNoneMessages(failing)
    msgs += ['%s failed with unknown reason.' % x for x in slaves]
    msgs += ['%s timed out' % x for x in inflight]
    msgs += ['%s did not start' % x for x in no_stat]
    if msgs:
      builder_name = self._run.config.name
      title = '%s has encountered infra failures:' % (builder_name,)
      msgs.insert(0, title)
      msgs.append('See %s' % self.ConstructDashboardURL())
      msg = '\n\n'.join(msgs)
      subject = '%s infra failures' % (builder_name,)
      extra_fields = {'X-cbuildbot-alert': 'cq-infra-alert'}
      tree_status.SendHealthAlert(self._run, subject, msg,
                                  extra_fields=extra_fields)

  @staticmethod
  def _ToTSanity(sanity_check_slaves, slave_statuses):
    """Returns False if any sanity check slaves failed.

    Args:
      sanity_check_slaves: Names of slave builders that are "sanity check"
        builders for the current master. May be None or empty.
      slave_statuses: Dict of builder_status_lib.BuilderStatus objects by
        builder name keys.

    Returns:
      True if no sanity builders ran and failed.
    """
    sanity_check_slaves = sanity_check_slaves or []
    return not any(x in slave_statuses and slave_statuses[x].Failed()
                   for x in sanity_check_slaves)

  def _GetSlaveBuildStatus(self, manager, build_id, db, builder_names, timeout):
    """Return the statuses of slave builds.

    Args:
      manager: An instance of BuildSpecsManager.
      build_id: The build id of the master build.
      db: An instance of cidb.CIDBConnection.
      builder_names: A list of builder names (strings) of slave builds.
      timeout: Number of seconds to wait for the results.

    Returns:
      A build_config name-> status dictionary of build statuses
      (See BuildSpecsManager.GetBuildersStatus).
    """
    # CQ master build needs validation_pool to keep track of applied
    # changes and change dependencies.
    return manager.GetBuildersStatus(
        build_id,
        db,
        builder_names,
        pool=self.sync_stage.pool,
        timeout=timeout)

  def PerformStage(self):
    """Run CommitQueueCompletionStage."""
    super(CommitQueueCompletionStage, self).PerformStage()
| |
class PreCQCompletionStage(generic_stages.BuilderStage):
  """Reports the status of a Pre-CQ trybot run (Google Storage and Gerrit).

  The report is made through the pool object of the sync stage that is
  handed to the constructor.
  """

  def __init__(self, builder_run, sync_stage, success, **kwargs):
    """Constructor.

    Args:
      builder_run: BuilderRun object, forwarded to the base stage.
      sync_stage: Stage object whose |pool| attribute is used for reporting.
      success: Whether the build succeeded; selects which pool handler
        PerformStage invokes.
      **kwargs: Forwarded unchanged to the base stage constructor.
    """
    super(PreCQCompletionStage, self).__init__(builder_run, **kwargs)
    self.success = success
    self.sync_stage = sync_stage

  def GetBuildFailureMessage(self):
    """Builds the failure summary for this run's config.

    Returns:
      The result of CreateBuildFailureMessage for this config's overlays,
      its name and the run's dashboard URL.
    """
    config = self._run.config
    dashboard_url = self._run.ConstructDashboardURL()
    return CreateBuildFailureMessage(config.overlays, config.name,
                                     dashboard_url)

  def PerformStage(self):
    """Updates Gerrit and Google Storage with the Pre-CQ status."""
    pool = self.sync_stage.pool
    if self.success:
      pool.HandlePreCQPerConfigSuccess()
      return

    # Failure path: hand the pool a single-element list of failure messages.
    pool.HandleValidationFailure([self.GetBuildFailureMessage()])
| |
| |
| class PublishUprevChangesStage(generic_stages.BuilderStage): |
| """Makes uprev changes from pfq live for developers.""" |
| |
def __init__(self, builder_run, success, stage_push=False, **kwargs):
  """Sets up the stage and records how its push should behave.

  Args:
    builder_run: BuilderRun object, forwarded to the base stage.
    success: Boolean indicating whether the build succeeded.
    stage_push: When True, the push is staged instead of going to master.
      Defaults to False.
    **kwargs: Forwarded unchanged to the base stage constructor.
  """
  super(PublishUprevChangesStage, self).__init__(builder_run, **kwargs)
  self.stage_push = stage_push
  self.success = success
| |
def CheckMasterBinhostTest(self, db, build_id):
  """Check whether the master builder has passed BinhostTest stage.

  Args:
    db: cidb.CIDBConnection object. May be None, in which case the check
      fails.
    build_id: build_id of the master build to check for.

  Returns:
    True if the build has a BinhostTest stage whose status equals
    constants.BUILDER_STATUS_PASSED; else, False (including when
    self._build_stage_id or db is None, or no stages are recorded).
  """
  stage_name = 'BinhostTest'

  if self._build_stage_id is None or db is None:
    logging.warning('Not valid build_stage_id %s or db %s or no %s found',
                    self._build_stage_id, db, stage_name)
    return False

  stages = db.GetBuildStages(build_id)

  # No stages found. BinhostTest stage didn't start or got skipped; in both
  # cases we don't need to push commits to the temp pfq branch.
  if not stages:
    logging.warning('no %s stage found in build %s', stage_name, build_id)
    return False

  # Keep every BinhostTest entry (not only the passed ones) so the failure
  # log below can show what was actually recorded for the stage.
  matching_stages = [s for s in stages if s['name'] == stage_name]
  passed_stages = [s for s in matching_stages
                   if s['status'] == constants.BUILDER_STATUS_PASSED]
  if passed_stages:
    logging.info('build %s passed stage %s with %s',
                 build_id, stage_name, passed_stages)
    return True

  logging.warning('build %s stage %s result %s',
                  build_id, stage_name, matching_stages)
  return False
| |
def CheckSlaveUploadPrebuiltsTest(self, db, build_id):
  """Check if the slaves have passed UploadPrebuilts stage.

  Given the master build id, check if all the important slaves have passed
  the UploadPrebuilts stage. The set of slaves that must pass is taken from
  self._GetSlaveConfigs().

  Args:
    db: cidb.CIDBConnection object. May be None, in which case the check
      fails (unless the debug shortcut applies).
    build_id: build_id of the master build to check for.

  Returns:
    False if this build is not a master;
    True if it's in debug environment (buildbot and debug options both set);
    True if all the important slaves have passed the stage;
    else, False.
  """
  stage_name = 'UploadPrebuilts'

  if not self._run.config.master:
    logging.warning('The build is not a master')
    return False

  if self._run.options.buildbot and self._run.options.debug:
    # If it's in debug environment, no slave builds would be triggered.
    # In order to cover the testing on pushing commits to a remote
    # temp branch, return True.
    logging.info('In debug environment, return '
                 'CheckSlaveUploadPrebuiltsTest as True')
    return True

  if self._build_stage_id is None or db is None:
    logging.warning('Not valid build_stage_id %s or db %s ',
                    self._build_stage_id, db)
    return False

  important_set = {slave['name'] for slave in self._GetSlaveConfigs()}

  slave_buildbucket_ids = self.GetScheduledSlaveBuildbucketIds()
  stages = db.GetSlaveStages(
      build_id, buildbucket_ids=slave_buildbucket_ids)

  passed_set = {s['build_config'] for s in stages
                if (s['name'] == stage_name and
                    s['status'] == constants.BUILDER_STATUS_PASSED)}

  # Every important slave must appear among the slaves that passed.
  remaining_set = important_set - passed_set
  if not remaining_set:
    logging.info('All the important slaves passed %s', stage_name)
    return True

  logging.warning('slave %s didn\'t pass %s', remaining_set, stage_name)
  return False
| |
def PerformStage(self):
  """Push the uprev and binhost commits for this run.

  When self.stage_push is set, a per-build staging branch ref is computed,
  but only if both CheckMasterBinhostTest and CheckSlaveUploadPrebuiltsTest
  return True; otherwise staging_branch stays None. The resulting value is
  passed to commands.UprevPush as |staging_branch|.

  Raises:
    ValueError: If self.stage_push is True but the config is not a master
      chrome PFQ config.
    AssertionError: If the push overlays returned by _ExtractOverlays() are
      empty.
  """
  overlays, push_overlays = self._ExtractOverlays()

  staging_branch = None
  if self.stage_push:
    if not config_lib.IsMasterChromePFQ(self._run.config):
      raise ValueError('This build must be a master chrome PFQ build '
                       'when stage_push is True.')
    build_id, db = self._run.GetCIDBHandle()

    # If the master passed BinHostTest and all the important slaves passed
    # UploadPrebuiltsTest, push uprev commits to a staging_branch.
    if (self.CheckMasterBinhostTest(db, build_id) and
        self.CheckSlaveUploadPrebuiltsTest(db, build_id)):
      staging_branch = ('refs/' + constants.PFQ_REF + '/' +
                        constants.STAGING_PFQ_BRANCH_PREFIX + str(build_id))

  assert push_overlays, 'push_overlays must be set to run this stage'

  # If we're a commit queue, we should clean out our local changes, resync,
  # and reapply our uprevs. This is necessary so that 1) we are sure to point
  # at the remote SHA1s, not our local SHA1s; 2) we can avoid doing a
  # rebase; 3) in the case of failure and staging_branch is None, we don't
  # submit the changes that were committed locally.
  #
  # If we're not a commit queue and the build succeeded, we can skip the
  # cleanup here. This is a cheap trick so that the Chrome PFQ pushes its
  # earlier uprev from the SyncChrome stage (it would be a bit tricky to
  # replicate the uprev here, so we'll leave it alone).
  #
  # If we're not a commit queue and staging_branch is not None, we can also
  # skip the cleanup here. When staging_branch is not None, we're going to
  # push the local commits generated in the AFDOUpdateEbuild stage to the
  # staging_branch; cleaning up the repository here would wipe out those
  # local commits.
  if (config_lib.IsCQType(self._run.config.build_type) or
      not (self.success or staging_branch is not None)):
    # Clean up our root and sync down the latest changes that were
    # submitted.
    commands.BuildRootGitCleanup(self._build_root)

    # Sync down the latest changes we have submitted.
    if self._run.options.sync:
      next_manifest = self._run.config.manifest
      repo = self.GetRepoRepository()
      repo.Sync(next_manifest)

    # Commit an uprev locally.
    if self._run.options.uprev and self._run.config.uprev:
      commands.UprevPackages(self._build_root, self._boards, overlays)

  # When prebuilts is True, if it's a successful run or staging_branch is
  # not None for a master-chrome-pfq run, update the binhost conf.
  if (self._run.config.prebuilts and
      (self.success or staging_branch is not None)):
    confwriter = prebuilts.BinhostConfWriter(self._run)
    confwriter.Perform()

  # Push the uprev and binhost commits. staging_branch is None unless the
  # staging checks above passed.
  commands.UprevPush(self._build_root, push_overlays,
                     self._run.options.debug,
                     staging_branch=staging_branch)
  # On success, flag in the run metadata which master PFQ type this was.
  if config_lib.IsMasterChromePFQ(self._run.config) and self.success:
    self._run.attrs.metadata.UpdateWithDict({'UprevvedChrome': True})
  if config_lib.IsMasterAndroidPFQ(self._run.config) and self.success:
    self._run.attrs.metadata.UpdateWithDict({'UprevvedAndroid': True})