| # Copyright (c) 2012 The Chromium OS Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| #pylint: disable-msg=C0111 |
| |
| import os |
| import logging |
| |
| from autotest_lib.client.common_lib import global_config |
| from autotest_lib.frontend.afe import models |
| from autotest_lib.scheduler import email_manager |
| from autotest_lib.scheduler import scheduler_config, scheduler_models |
| from autotest_lib.site_utils.graphite import stats |
| |
| # Override default parser with our site parser. |
def parser_path(install_dir):
    """Build the path of the site implementation of the parser.

    @param install_dir: installation directory.

    @return: install_dir joined with the 'tko' and 'site_parse' components.
    """
    site_parser_parts = ('tko', 'site_parse')
    return os.path.join(install_dir, *site_parser_parts)
| |
| |
class SiteAgentTask(object):
    """
    SiteAgentTask subclasses BaseAgentTask in monitor_db.

    Provides the site versions of result archiving and of the queue entry
    status checks.
    """


    def _archive_results(self, queue_entries):
        """
        Set the status of queue_entries to ARCHIVING.

        This method sets the status of the queue_entries to ARCHIVING
        if the enable_archiving flag is true in global_config.ini.
        Otherwise, it bypasses the archiving step and sets the queue entries
        to the final status of current step.

        @param queue_entries: iterable of host queue entries whose status
                              is updated through set_status().
        """
        enable_archiving = global_config.global_config.get_config_value(
                scheduler_config.CONFIG_SECTION, 'enable_archiving',
                type=bool)
        # Set the status of the queue entries to archiving or self final
        # status. _final_status() is not defined in this class; it is
        # expected to be supplied by the concrete task class.
        if enable_archiving:
            status = models.HostQueueEntry.Status.ARCHIVING
        else:
            status = self._final_status()

        # Update the entries the caller handed in, rather than ignoring the
        # argument and always touching self.queue_entries.
        for queue_entry in queue_entries:
            queue_entry.set_status(status)


    def _check_queue_entry_statuses(self, queue_entries, allowed_hqe_statuses,
                                    allowed_host_statuses=None):
        """
        Forked from monitor_db.py

        Check every queue entry against the allowed statuses. Instead of
        raising on a violation, the problem is logged, a notification email
        is enqueued and an abort is requested for the entry's job.

        @param queue_entries: iterable of host queue entries to check.
        @param allowed_hqe_statuses: container of acceptable entry statuses.
        @param allowed_host_statuses: container of acceptable host statuses,
                                      or None to skip the host check.
        """
        class_name = self.__class__.__name__
        for entry in queue_entries:
            if entry.status not in allowed_hqe_statuses:
                # In the original code, here we raise an exception. In an
                # effort to prevent downtime we will instead abort the job
                # and send out an email notifying us this has occurred.
                error_message = ('%s attempting to start entry with invalid '
                                 'status %s: %s. Aborting Job: %s.'
                                 % (class_name, entry.status, entry,
                                    entry.job))
                logging.error(error_message)
                email_manager.manager.enqueue_notify_email(
                        'Job Aborted - Invalid Host Queue Entry Status',
                        error_message)
                entry.job.request_abort()
            # The host check runs independently of the entry check above, so
            # an entry failing both is reported (and aborted) twice.
            invalid_host_status = (
                    allowed_host_statuses is not None
                    and entry.host.status not in allowed_host_statuses)
            if invalid_host_status:
                # In the original code, here we raise an exception. In an
                # effort to prevent downtime we will instead abort the job
                # and send out an email notifying us this has occurred.
                error_message = ('%s attempting to start on queue entry with '
                                 'invalid host status %s: %s. Aborting Job: %s'
                                 % (class_name, entry.host.status, entry,
                                    entry.job))
                logging.error(error_message)
                email_manager.manager.enqueue_notify_email(
                        'Job Aborted - Invalid Host Status', error_message)
                entry.job.request_abort()
| |
| |
class SiteDispatcher(object):
    """
    SiteDispatcher subclasses BaseDispatcher in monitor_db.

    Each dispatcher step below delegates to the next implementation in the
    MRO via super() and is wrapped with the 'scheduler' stats timer.
    """
    # Fallback user id for requested_by when no user with the login
    # 'autotest_system' exists.
    DEFAULT_REQUESTED_BY_USER_ID = 1


    # Timer whose decorate attribute wraps the dispatcher steps below.
    _timer = stats.Timer('scheduler')


    @_timer.decorate
    def tick(self):
        """Delegate to the parent tick(), wrapped by the stats timer."""
        super(SiteDispatcher, self).tick()

    @_timer.decorate
    def _garbage_collection(self):
        """Delegate to the parent step, wrapped by the stats timer."""
        super(SiteDispatcher, self)._garbage_collection()

    @_timer.decorate
    def _run_cleanup(self):
        """Delegate to the parent step, wrapped by the stats timer."""
        super(SiteDispatcher, self)._run_cleanup()

    @_timer.decorate
    def _find_aborting(self):
        """Delegate to the parent step, wrapped by the stats timer."""
        super(SiteDispatcher, self)._find_aborting()

    @_timer.decorate
    def _process_recurring_runs(self):
        """Delegate to the parent step, wrapped by the stats timer."""
        super(SiteDispatcher, self)._process_recurring_runs()

    @_timer.decorate
    def _schedule_delay_tasks(self):
        """Delegate to the parent step, wrapped by the stats timer."""
        super(SiteDispatcher, self)._schedule_delay_tasks()

    @_timer.decorate
    def _schedule_running_host_queue_entries(self):
        """Delegate to the parent step, wrapped by the stats timer."""
        super(SiteDispatcher, self)._schedule_running_host_queue_entries()

    @_timer.decorate
    def _schedule_special_tasks(self):
        """Delegate to the parent step, wrapped by the stats timer."""
        super(SiteDispatcher, self)._schedule_special_tasks()

    @_timer.decorate
    def _schedule_new_jobs(self):
        """Delegate to the parent step, wrapped by the stats timer."""
        super(SiteDispatcher, self)._schedule_new_jobs()

    @_timer.decorate
    def _handle_agents(self):
        """Delegate to the parent step, wrapped by the stats timer."""
        super(SiteDispatcher, self)._handle_agents()

    def _get_reset_requester(self):
        """
        Return the user recorded as the requester of Reset special tasks.

        Prefers the user whose login is 'autotest_system' and falls back to
        the user with id DEFAULT_REQUESTED_BY_USER_ID.

        @return: a models.User instance.
        """
        try:
            return models.User.objects.get(login='autotest_system')
        except models.User.DoesNotExist:
            return models.User.objects.get(
                    id=SiteDispatcher.DEFAULT_REQUESTED_BY_USER_ID)

    def _reverify_hosts_where(self, where,
                              print_message='Reverifying host %s'):
        """
        Create a Reset special task for each matching host.

        This is an altered version of _reverify_hosts_where: the call to
        models.SpecialTask.objects.create passes in an argument for
        requested_by, in order to allow the Reset task to be created
        properly.

        @param where: SQL condition selecting hosts; it is appended to
                      'locked = 0 AND invalid = 0 AND '.
        @param print_message: log format taking the hostname; a false value
                              suppresses the log line.
        """
        full_where = 'locked = 0 AND invalid = 0 AND ' + where
        # Resolved lazily on the first host that needs a task, then reused,
        # so the user lookup is not repeated for every host.
        requested_by = None
        for host in scheduler_models.Host.fetch(where=full_where):
            if self.host_has_agent(host):
                # host has already been recovered in some way
                continue
            if self._host_has_scheduled_special_task(host):
                # host will have a special task scheduled on the next cycle
                continue
            if print_message:
                logging.error(print_message, host.hostname)
            if requested_by is None:
                requested_by = self._get_reset_requester()
            models.SpecialTask.objects.create(
                    task=models.SpecialTask.Task.RESET,
                    host=models.Host.objects.get(id=host.id),
                    requested_by=requested_by)


    def _check_for_unrecovered_verifying_entries(self):
        """
        Requeue Resetting queue entries that have no pending special task.

        Fetches the host queue entries whose status is RESETTING and sets
        each one back to 'Queued' when it has no incomplete Cleanup, Verify
        or Reset special task.
        """
        # Verify is replaced by Reset.
        queue_entries = scheduler_models.HostQueueEntry.fetch(
                where='status = "%s"' % models.HostQueueEntry.Status.RESETTING)
        for queue_entry in queue_entries:
            special_tasks = models.SpecialTask.objects.filter(
                    task__in=(models.SpecialTask.Task.CLEANUP,
                              models.SpecialTask.Task.VERIFY,
                              models.SpecialTask.Task.RESET),
                    queue_entry__id=queue_entry.id,
                    is_complete=False)
            if special_tasks.count() == 0:
                logging.error('Unrecovered Resetting host queue entry: %s. '
                              'Setting status to Queued.', queue_entry)
                # Essentially this host queue entry was set to be Resetting
                # however no special task exists for entry. This occurs if
                # the scheduler dies between changing the status and creating
                # the special task. By setting it to queued, the job can
                # restart from the beginning and proceed correctly. This is
                # much more preferable than having monitor_db not launching.
                queue_entry.set_status('Queued')