blob: 109096a35c07b29542eea458f355c324b4a90d8c [file] [log] [blame]
"""
Autotest AFE Cleanup used by the scheduler
"""
import datetime, time, logging, random
from autotest_lib.database import database_connection
from autotest_lib.frontend.afe import models
from autotest_lib.scheduler import email_manager, scheduler_config
from autotest_lib.client.common_lib import host_protections
from autotest_lib.site_utils.graphite import stats
class PeriodicCleanup(object):
def __init__(self, db, clean_interval, run_at_initialize=False):
self._db = db
self.clean_interval = clean_interval
self._last_clean_time = time.time()
self._run_at_initialize = run_at_initialize
def initialize(self):
if self._run_at_initialize:
self._cleanup()
def run_cleanup_maybe(self):
should_cleanup = (self._last_clean_time + self.clean_interval * 60
< time.time())
if should_cleanup:
self._cleanup()
self._last_clean_time = time.time()
def _cleanup(self):
"""Abrstract cleanup method."""
raise NotImplementedError
class UserCleanup(PeriodicCleanup):
"""User cleanup that is controlled by the global config variable
clean_interval in the SCHEDULER section.
"""
timer = stats.Timer('monitor_db_cleanup.user_cleanup')
def __init__(self, db, clean_interval_minutes):
super(UserCleanup, self).__init__(db, clean_interval_minutes)
self._last_reverify_time = time.time()
@timer.decorate
def _cleanup(self):
logging.info('Running periodic cleanup')
self._abort_timed_out_jobs()
self._abort_jobs_past_max_runtime()
self._clear_inactive_blocks()
self._check_for_db_inconsistencies()
self._reverify_dead_hosts()
@timer.decorate
def _abort_timed_out_jobs(self):
msg = 'Aborting all jobs that have timed out and are not complete'
logging.info(msg)
query = models.Job.objects.filter(hostqueueentry__complete=False).extra(
where=['created_on + INTERVAL timeout_mins MINUTE < NOW()'])
for job in query.distinct():
logging.warning('Aborting job %d due to job timeout', job.id)
job.abort()
@timer.decorate
def _abort_jobs_past_max_runtime(self):
"""
Abort executions that have started and are past the job's max runtime.
"""
logging.info('Aborting all jobs that have passed maximum runtime')
rows = self._db.execute("""
SELECT hqe.id
FROM afe_host_queue_entries AS hqe
INNER JOIN afe_jobs ON (hqe.job_id = afe_jobs.id)
WHERE NOT hqe.complete AND NOT hqe.aborted AND
hqe.started_on + INTERVAL afe_jobs.max_runtime_mins MINUTE <
NOW()""")
query = models.HostQueueEntry.objects.filter(
id__in=[row[0] for row in rows])
for queue_entry in query.distinct():
logging.warning('Aborting entry %s due to max runtime', queue_entry)
queue_entry.abort()
@timer.decorate
def _check_for_db_inconsistencies(self):
logging.info('Cleaning db inconsistencies')
self._check_all_invalid_related_objects()
def _check_invalid_related_objects_one_way(self, first_model,
relation_field, second_model):
if 'invalid' not in first_model.get_field_dict():
return []
invalid_objects = list(first_model.objects.filter(invalid=True))
first_model.objects.populate_relationships(invalid_objects,
second_model,
'related_objects')
error_lines = []
for invalid_object in invalid_objects:
if invalid_object.related_objects:
related_list = ', '.join(str(related_object) for related_object
in invalid_object.related_objects)
error_lines.append('Invalid %s %s is related to %ss: %s'
% (first_model.__name__, invalid_object,
second_model.__name__, related_list))
related_manager = getattr(invalid_object, relation_field)
related_manager.clear()
return error_lines
def _check_invalid_related_objects(self, first_model, first_field,
second_model, second_field):
errors = self._check_invalid_related_objects_one_way(
first_model, first_field, second_model)
errors.extend(self._check_invalid_related_objects_one_way(
second_model, second_field, first_model))
return errors
def _check_all_invalid_related_objects(self):
model_pairs = ((models.Host, 'labels', models.Label, 'host_set'),
(models.AclGroup, 'hosts', models.Host, 'aclgroup_set'),
(models.AclGroup, 'users', models.User, 'aclgroup_set'),
(models.Test, 'dependency_labels', models.Label,
'test_set'))
errors = []
for first_model, first_field, second_model, second_field in model_pairs:
errors.extend(self._check_invalid_related_objects(
first_model, first_field, second_model, second_field))
if errors:
subject = ('%s relationships to invalid models, cleaned all' %
len(errors))
message = '\n'.join(errors)
logging.warning(subject)
logging.warning(message)
email_manager.manager.enqueue_notify_email(subject, message)
@timer.decorate
def _clear_inactive_blocks(self):
msg = 'Clear out blocks for all completed jobs.'
logging.info(msg)
# this would be simpler using NOT IN (subquery), but MySQL
# treats all IN subqueries as dependent, so this optimizes much
# better
self._db.execute("""
DELETE ihq FROM afe_ineligible_host_queues ihq
LEFT JOIN (SELECT DISTINCT job_id FROM afe_host_queue_entries
WHERE NOT complete) hqe
USING (job_id) WHERE hqe.job_id IS NULL""")
def _should_reverify_hosts_now(self):
reverify_period_sec = (scheduler_config.config.reverify_period_minutes
* 60)
if reverify_period_sec == 0:
return False
return (self._last_reverify_time + reverify_period_sec) <= time.time()
def _choose_subset_of_hosts_to_reverify(self, hosts):
"""Given hosts needing verification, return a subset to reverify."""
max_at_once = scheduler_config.config.reverify_max_hosts_at_once
if (max_at_once > 0 and len(hosts) > max_at_once):
return random.sample(hosts, max_at_once)
return sorted(hosts)
@timer.decorate
def _reverify_dead_hosts(self):
if not self._should_reverify_hosts_now():
return
self._last_reverify_time = time.time()
logging.info('Checking for dead hosts to reverify')
hosts = models.Host.objects.filter(
status=models.Host.Status.REPAIR_FAILED,
locked=False,
invalid=False)
hosts = hosts.exclude(
protection=host_protections.Protection.DO_NOT_VERIFY)
if not hosts:
return
hosts = list(hosts)
total_hosts = len(hosts)
hosts = self._choose_subset_of_hosts_to_reverify(hosts)
logging.info('Reverifying dead hosts (%d of %d) %s', len(hosts),
total_hosts, ', '.join(host.hostname for host in hosts))
for host in hosts:
models.SpecialTask.schedule_special_task(
host=host, task=models.SpecialTask.Task.VERIFY)
class TwentyFourHourUpkeep(PeriodicCleanup):
"""Cleanup that runs at the startup of monitor_db and every subsequent
twenty four hours.
"""
timer = stats.Timer('monitor_db_cleanup.twentyfourhour_cleanup')
def __init__(self, db, run_at_initialize=True):
clean_interval = 24 * 60 # 24 hours
super(TwentyFourHourUpkeep, self).__init__(
db, clean_interval, run_at_initialize=run_at_initialize)
@timer.decorate
def _cleanup(self):
logging.info('Running 24 hour clean up')
self._django_session_cleanup()
self._check_for_uncleanable_db_inconsistencies()
@timer.decorate
def _django_session_cleanup(self):
"""Clean up django_session since django doesn't for us.
http://www.djangoproject.com/documentation/0.96/sessions/
"""
logging.info('Deleting old sessions from django_session')
sql = 'TRUNCATE TABLE django_session'
self._db.execute(sql)
@timer.decorate
def _check_for_uncleanable_db_inconsistencies(self):
logging.info('Checking for uncleanable DB inconsistencies')
self._check_for_active_and_complete_queue_entries()
self._check_for_multiple_platform_hosts()
self._check_for_no_platform_hosts()
self._check_for_multiple_atomic_group_hosts()
@timer.decorate
def _check_for_active_and_complete_queue_entries(self):
query = models.HostQueueEntry.objects.filter(active=True, complete=True)
if query.count() != 0:
subject = ('%d queue entries found with active=complete=1'
% query.count())
lines = []
for entry in query:
lines.append(str(entry.get_object_dict()))
if entry.status == 'Aborted':
logging.error('Aborted entry: %s is both active and '
'complete. Setting active value to False.',
str(entry))
entry.active = False
entry.save()
self._send_inconsistency_message(subject, lines)
@timer.decorate
def _check_for_multiple_platform_hosts(self):
rows = self._db.execute("""
SELECT afe_hosts.id, hostname, COUNT(1) AS platform_count,
GROUP_CONCAT(afe_labels.name)
FROM afe_hosts
INNER JOIN afe_hosts_labels ON
afe_hosts.id = afe_hosts_labels.host_id
INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
WHERE afe_labels.platform
GROUP BY afe_hosts.id
HAVING platform_count > 1
ORDER BY hostname""")
if rows:
subject = '%s hosts with multiple platforms' % self._db.rowcount
lines = [' '.join(str(item) for item in row)
for row in rows]
self._send_inconsistency_message(subject, lines)
@timer.decorate
def _check_for_no_platform_hosts(self):
rows = self._db.execute("""
SELECT hostname
FROM afe_hosts
LEFT JOIN afe_hosts_labels
ON afe_hosts.id = afe_hosts_labels.host_id
AND afe_hosts_labels.label_id IN (SELECT id FROM afe_labels
WHERE platform)
WHERE NOT afe_hosts.invalid AND afe_hosts_labels.host_id IS NULL""")
if rows:
logging.warn('%s hosts with no platform\n%s', self._db.rowcount,
', '.join(row[0] for row in rows))
@timer.decorate
def _check_for_multiple_atomic_group_hosts(self):
rows = self._db.execute("""
SELECT afe_hosts.id, hostname,
COUNT(DISTINCT afe_atomic_groups.name) AS atomic_group_count,
GROUP_CONCAT(afe_labels.name),
GROUP_CONCAT(afe_atomic_groups.name)
FROM afe_hosts
INNER JOIN afe_hosts_labels ON
afe_hosts.id = afe_hosts_labels.host_id
INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
INNER JOIN afe_atomic_groups ON
afe_labels.atomic_group_id = afe_atomic_groups.id
WHERE NOT afe_hosts.invalid AND NOT afe_labels.invalid
GROUP BY afe_hosts.id
HAVING atomic_group_count > 1
ORDER BY hostname""")
if rows:
subject = '%s hosts with multiple atomic groups' % self._db.rowcount
lines = [' '.join(str(item) for item in row)
for row in rows]
self._send_inconsistency_message(subject, lines)
def _send_inconsistency_message(self, subject, lines):
logging.error(subject)
message = '\n'.join(lines)
if len(message) > 5000:
message = message[:5000] + '\n(truncated)\n'
email_manager.manager.enqueue_notify_email(subject, message)