server/cros/repair_utils.py - mirrors/cros/chromiumos/third_party/autotest - Git at Google

 # Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 # This file lets us test the repair supporting code.
 # We could not easily unit test it if it was in the repair file, as that file
 # makes a function call that is not protected by a __name__ == '__main__'
 # guard.

 import datetime, logging, operator, urllib2, xmlrpclib

 import common

 from autotest_lib.client.common_lib import global_config, logging_config
 from autotest_lib.server import frontend

 # Ignore any jobs that were run more than this many mins past the max job
 # timeout.
 _CUTOFF_AFTER_TIMEOUT_MINS = 60
 LOGFILE_NAME = 'machine_death.log'


 class MachineDeathLogger(logging_config.LoggingConfig):
     """
     Logging configuration for recording machines entering Repair Failed.

     Building on LoggingConfig means the log directory comes from
     get_server_log_dir() instead of being hard-coded here, and the
     dedicated 'machine_death' logger keeps these records apart from the
     other logs.

     """
     # Timestamp-and-message layout; presumably picked up by the file
     # handlers that LoggingConfig creates -- confirm against the base class.
     file_formatter = logging.Formatter(
             fmt='%(asctime)s | %(message)s', datefmt='%m/%d %H:%M:%S')

     def __init__(self):
         """Attach the machine death log file to the 'machine_death' logger."""
         parent = super(MachineDeathLogger, self)
         parent.__init__(False)
         self.logger = logging.getLogger('machine_death')

         parent.configure_logging(use_console=False)
         # Only records of level ERROR and above are written to the file.
         self.add_file_handler(LOGFILE_NAME, logging.ERROR,
                               log_dir=self.get_server_log_dir())


 def _find_problem_test(machine, rpc):
     """
     Look up the most recently started job on a machine.

     @param machine: The hostname (e.g. IP address) of the machine whose
         latest job is wanted.

     @param rpc: The rpc object used to query the server.

     @return the host queue entry dictionary with the greatest 'started_on'
         value among the recent entries, or None when there are none.
     """
     # latest() from the django QuerySet API is not reachable over RPC, so
     # fetch every entry started within the last
     # job_max_runtime_mins_default plus _CUTOFF_AFTER_TIMEOUT_MINS minutes
     # and choose the newest of those locally.
     max_runtime_mins = global_config.global_config.get_config_value(
         'AUTOTEST_WEB', 'job_max_runtime_mins_default', type=int)
     window = datetime.timedelta(
         minutes=max_runtime_mins + _CUTOFF_AFTER_TIMEOUT_MINS)
     earliest_start = datetime.datetime.today() - window

     entries = rpc.run('get_host_queue_entries', host__hostname=machine,
                       started_on__gte=str(earliest_start))
     if not entries:
         return None
     return max(entries, key=operator.itemgetter('started_on'))


 def flag_problem_test(machine):
     """
     Record the last job that ran on a machine in the machine death log.

     This code is run when a machine goes into the Repair Failed state, so
     there is a chance that the last job that ran on it killed it.

     The information goes to a special log file. We are doing this to
     check if keeping track of this information is actually useful.

     Failures to reach the RPC server (urllib2.URLError) and RPC protocol
     errors (xmlrpclib.ProtocolError) are written to the same log instead of
     being raised.

     @param machine: The hostname (e.g. IP address) of the machine whose
         last job should be recorded.

     """
     # We want the machine death information to be logged to a special file
     # but we do not want every other message to be logged to that file, so
     # only the dedicated logger is used below.
     death_log = MachineDeathLogger().logger
     rpc = frontend.AFE()

     # Only the lookup talks to the server, so only it needs guarding.
     try:
         problem_test = _find_problem_test(machine, rpc)
     except urllib2.URLError:
         death_log.error('%s | ERROR: Could not contact RPC server', machine)
         return
     except xmlrpclib.ProtocolError as e:
         death_log.error('%s | ERROR: RPC Protocol Error: %s',
                         machine, e.errmsg)
         return

     # Arguments are handed to the logger rather than %-formatted here, so
     # the logging module builds the final message itself.
     if problem_test:
         job = problem_test['job']
         death_log.error('%s | %d | %s', machine, job['id'], job['name'])
     else:
         death_log.error('%s | No job detected', machine)
	# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	# This file lets us test the repair supporting code.
	# We could not easily unit test it if it was in the repair file, as that file
	# makes a function call that is not protected by a __name__ == '__main__'
	# guard.

	import datetime, logging, operator, urllib2, xmlrpclib

	import common

	from autotest_lib.client.common_lib import global_config, logging_config
	from autotest_lib.server import frontend

	# Ignore any jobs that were run more than this many mins past the max job
	# timeout.
	_CUTOFF_AFTER_TIMEOUT_MINS = 60
	LOGFILE_NAME = 'machine_death.log'


	class MachineDeathLogger(logging_config.LoggingConfig):
	    """
	    Used to log information about a machine going into the Repair Failed
	    state.

	    We use this so that if the default log location ever changes it will
	    also change for this logger and to keep this information separate from
	    the other logs.

	    """
	    # Timestamp-and-message layout; presumably picked up by the file
	    # handlers that LoggingConfig creates -- confirm against the base class.
	    file_formatter = logging.Formatter(fmt='%(asctime)s | %(message)s',
	                                       datefmt='%m/%d %H:%M:%S')

	    def __init__(self):
	        """Attach the machine death log file to the 'machine_death' logger."""
	        super(MachineDeathLogger, self).__init__(False)
	        self.logger = logging.getLogger('machine_death')

	        super(MachineDeathLogger, self).configure_logging(use_console=False)
	        log_dir = self.get_server_log_dir()
	        # Only records of level ERROR and above are written to the file.
	        self.add_file_handler(LOGFILE_NAME, logging.ERROR, log_dir=log_dir)


	def _find_problem_test(machine, rpc):
	    """
	    Finds the last job that ran on a machine.

	    @param machine: The hostname (e.g. IP address) of the machine to find
	        the last job that ran on it.

	    @param rpc: The rpc object to contact the server with.

	    @return the job status dictionary for the job that last ran on the
	        machine or None if there is no such job.
	    """
	    # Going through the RPC interface means we cannot use the latest()
	    # django QuerySet function. So we will instead look at the past
	    # job_max_runtime_mins_default plus _CUTOFF_AFTER_TIMEOUT_MINS
	    # and pick the most recent run from there.
	    default_timeout_mins = global_config.global_config.get_config_value(
	        'AUTOTEST_WEB', 'job_max_runtime_mins_default', type=int)
	    cutoff = (datetime.datetime.today() -
	              datetime.timedelta(minutes=default_timeout_mins) -
	              datetime.timedelta(minutes=_CUTOFF_AFTER_TIMEOUT_MINS))

	    results = rpc.run('get_host_queue_entries', host__hostname=machine,
	                      started_on__gte=str(cutoff))

	    if results:
	        return max(results, key=operator.itemgetter('started_on'))
	    else:
	        return None


	def flag_problem_test(machine):
	    """
	    Record the last job that ran on a machine in the machine death log.

	    This code is run when a machine goes into the Repair Failed state, so
	    there is a chance that the last job that ran on it killed it.

	    This logs information to a special log file. We are doing this to
	    check if keeping track of this information is actually useful.

	    Failures to reach the RPC server (urllib2.URLError) and RPC protocol
	    errors (xmlrpclib.ProtocolError) are written to the same log instead of
	    being raised.

	    @param machine: The hostname (e.g. IP address) of the machine to find
	        the last job that ran on it.

	    """
	    logger = MachineDeathLogger()
	    rpc = frontend.AFE()

	    # Only the lookup talks to the server, so only it needs guarding.
	    try:
	        problem_test = _find_problem_test(machine, rpc)
	    except urllib2.URLError:
	        logger.logger.error('%s | ERROR: Could not contact RPC server',
	                            machine)
	        return
	    except xmlrpclib.ProtocolError as e:
	        logger.logger.error('%s | ERROR: RPC Protocol Error: %s',
	                            machine, e.errmsg)
	        return

	    if problem_test:
	        # We want the machine death information to be logged to a special
	        # file but we do not want every other message to be logged to
	        # that file. Arguments are passed to the logger so the logging
	        # module builds the final message itself.
	        logger.logger.error('%s | %d | %s', machine,
	                            problem_test['job']['id'],
	                            problem_test['job']['name'])
	    else:
	        logger.logger.error('%s | No job detected', machine)