blob: a34d14c97a41aa88a3a90e4e43beba6813460e4a [file] [log] [blame]
#! /usr/bin/python
# Copyright 2017 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""
Swarming bot manager running on servers that hold swarming bots.
This manages running swarming bots and routinely recovers any that die.
"""
import argparse
import logging
import signal
import socket
import sys
import time
import urllib2
import common
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.site_utils.chromeos_proxy import swarming_bots
from chromite.lib import metrics
from chromite.lib import ts_mon_config
# Seconds to sleep between two consecutive bot checks in the main loop.
CHECK_INTERVAL = 180
# Shutdown flag polled by the main loop; the loop exits once this is true.
_shut_down = False
# Format string for the metric names used in this file; '%s' is replaced
# with the metric-specific suffix (e.g. 'tick').
metrics_template = 'chromeos/autotest/swarming/bot_manager/%s'
def _parse_args(args):
    """Build the command-line parser and parse the given arguments.

    @param args: A list of command-line argument strings (without the
                 program name).

    @return: The namespace produced by argparse for the parsed arguments.
    """
    arg_parser = argparse.ArgumentParser(
            description='Manage the set of swarming bots running on a server')

    # Each entry is (names_or_flags, add_argument keyword options). Entries
    # are registered in order, which fixes the order of the positionals.
    # TODO(xixuan): refactor together with swarming_bots.
    arg_specs = (
            (('afe',),
             dict(type=str,
                  help='AFE to get server role and status.')),
            (('id_range',),
             dict(type=str,
                  help='A range of integer, each bot created will be labeled '
                       'with an id from this range. E.g. "1-200"')),
            (('working_dir',),
             dict(type=str,
                  help='A working directory where bots will store files '
                       'generated at runtime')),
            (('-p', '--swarming_proxy'),
             dict(type=str, dest='swarming_proxy',
                  default=swarming_bots.DEFAULT_SWARMING_PROXY,
                  help='The URL of the swarming instance to talk to, '
                       'Default to the one specified in global config')),
            (('-f', '--log_file'),
             dict(dest='log_file',
                  help='Path to the log file.')),
            (('-v', '--verbose'),
             dict(dest='verbose', action='store_true',
                  help='Verbose mode')),
            (('--specify_bot_id',),
             dict(action='store_true',
                  help='Specify bot id in retrieving bot codes & staring bots')),
    )
    for names, options in arg_specs:
        arg_parser.add_argument(*names, **options)

    return arg_parser.parse_args(args)
def handle_signal(signum, frame):
    """Signal handler that requests shutdown of the manager loop.

    Sets the module-level _shut_down flag to True.

    @param signum: The signal received. Ignored.
    @param frame: The current stack frame. Ignored.
    """
    # Without this declaration the assignment below would only bind a
    # function-local name and the module-level flag would never change.
    global _shut_down
    del signum, frame  # Unused; required by the signal handler signature.
    _shut_down = True
def is_server_in_prod(server_name, afe):
    """Check whether a server is a primary golo_proxy according to the AFE.

    Issues a 'get_servers' RPC filtered by hostname, status='primary' and
    role='golo_proxy', and records the outcome in the
    'server_in_prod_check' counter metric.

    @param server_name: the server name to be validated.
    @param afe: the afe server to get role & status info in server_db.

    @return: True when the RPC returns a non-empty result for server_name,
             False when it returns an empty result or fails with a
             urllib2.URLError.
    """
    logging.info('Validating server: %s', server_name)
    afe_client = frontend_wrappers.RetryingAFE(
            timeout_min=5, delay_sec=10, server=afe)

    in_prod = False
    try:
        matching_servers = afe_client.run(
                'get_servers', hostname=server_name,
                status='primary', role='golo_proxy')
        in_prod = bool(matching_servers)
    except urllib2.URLError as e:
        logging.warning('RPC get_servers failed on afe %s: %s',
                        afe_client, str(e))
    finally:
        # Runs on every path, including when an unexpected exception
        # propagates, so each check is counted.
        check_counter = metrics.Counter(
                metrics_template % 'server_in_prod_check')
        check_counter.increment(fields={'success': in_prod})

    return in_prod
@metrics.SecondsTimerDecorator(metrics_template % 'tick')
def tick(afe, bot_manager):
    """Run one iteration of the swarming bot manager.

    Calls bot_manager.check() unless an afe is given and this host (looked
    up by its fully qualified domain name) is not reported as being in prod.

    @param afe: the afe to check server role. If afe is empty, the role
                check is skipped and the bots are always checked.
    @param bot_manager: a swarming_bots.BotManager instance.
    """
    if afe and not is_server_in_prod(socket.getfqdn(), afe):
        # An AFE was given and this host is not a prod server: do nothing.
        return
    bot_manager.check()
def main(args):
    """Entry point: parse arguments, then check bots until shut down.

    Installs handle_signal for SIGINT and SIGTERM, builds a
    swarming_bots.BotManager and calls tick() every CHECK_INTERVAL seconds
    for as long as the module-level _shut_down flag is false.

    @param args: A list of system arguments (without the program name).

    @return: 1 if no swarming proxy is configured, 0 once the loop exits.
    """
    global _shut_down

    args = _parse_args(args)
    swarming_bots.setup_logging(args.verbose, args.log_file)

    if not args.swarming_proxy:
        logging.error(
                'No swarming proxy instance specified. '
                'Specify swarming_proxy in [CROS] in shadow_config, '
                'or use --swarming_proxy')
        return 1

    # Make sure the proxy URL carries an https:// scheme.
    swarming_proxy = args.swarming_proxy
    if not swarming_proxy.startswith('https://'):
        swarming_proxy = 'https://' + swarming_proxy

    logging.info("Setting signal handler.")
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    # Hand the normalized URL (not the raw flag value) to the bot manager so
    # the scheme handling above actually takes effect.
    bot_manager = swarming_bots.BotManager(
            swarming_bots.parse_range(args.id_range),
            args.working_dir,
            swarming_proxy,
            specify_bot_id=args.specify_bot_id)

    with ts_mon_config.SetupTsMonGlobalState('swarming_bots', indirect=True):
        while not _shut_down:
            tick(args.afe, bot_manager)
            time.sleep(CHECK_INTERVAL)

    return 0
# Run the manager only when executed as a script; the value returned by
# main() becomes the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)