# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
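
"""Container pool service.

Listens on a Unix domain socket for container pool requests (see the Service
class below) and serves containers from an underlying lxc container pool.
"""
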
import logging
import os
import threading
import time

import common
from autotest_lib.client.common_lib import utils
from autotest_lib.site_utils.lxc import base_image
from autotest_lib.site_utils.lxc import constants
from autotest_lib.site_utils.lxc import container_factory
from autotest_lib.site_utils.lxc import zygote
from autotest_lib.site_utils.lxc.constants import \
CONTAINER_POOL_METRICS_PREFIX as METRICS_PREFIX
from autotest_lib.site_utils.lxc.container_pool import async_listener
from autotest_lib.site_utils.lxc.container_pool import error
from autotest_lib.site_utils.lxc.container_pool import message
from autotest_lib.site_utils.lxc.container_pool import pool

try:
    import cPickle as pickle
except ImportError:
    import pickle

try:
from chromite.lib import metrics
from infra_libs import ts_mon
except ImportError:
import mock
metrics = utils.metrics_mock
ts_mon = mock.Mock()


# The minimum period between polling for new connections, in seconds.
_MIN_POLLING_PERIOD = 0.1


class Service(object):
"""A manager for a pool of LXC containers.

    The Service class manages client communication with an underlying container
pool. It listens for incoming client connections, then spawns threads to
deal with communication with each client.
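
    A rough usage sketch (illustrative only; the exact SharedHostDir
    construction details are an assumption, not part of this module):

        host_dir = lxc.SharedHostDir()
        service = Service(host_dir)
        service.start()  # Blocks until service.stop() is called.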
"""

    def __init__(self, host_dir, pool=None):
"""Sets up a new container pool service.

        @param host_dir: A SharedHostDir. This will be used for Zygote
                         configuration as well as for general pool operation
                         (e.g. opening Unix domain sockets for communication).
        @param pool: A container pool that the service will maintain. This
                     parameter exists for dependency injection during testing;
                     under normal circumstances the service instantiates the
                     container pool internally.
"""
# Create socket for receiving container pool requests. This also acts
# as a mutex, preventing multiple container pools from being
# instantiated.
self._socket_path = os.path.join(
host_dir.path, constants.DEFAULT_CONTAINER_POOL_SOCKET)
self._connection_listener = async_listener.AsyncListener(
self._socket_path)
self._client_threads = []
self._stop_event = None
self._running = False
self._pool = pool

    def start(self, pool_size=constants.DEFAULT_CONTAINER_POOL_SIZE):
        """Starts the service.

        @param pool_size: The desired size of the container pool. This
                          parameter has no effect if a pre-created pool was
                          injected into the Service constructor.
"""
self._running = True
# Start the container pool.
if self._pool is None:
factory = container_factory.ContainerFactory(
base_container=base_image.BaseImage().get(),
container_class=zygote.Zygote)
self._pool = pool.Pool(factory=factory, size=pool_size)
# Start listening asynchronously for incoming connections.
self._connection_listener.start()
# Poll for incoming connections, and spawn threads to handle them.
logging.debug('Start event loop.')
while self._stop_event is None:
self._handle_incoming_connections()
self._cleanup_closed_connections()
# TODO(kenobi): Poll for and log errors from pool.
metrics.Counter(METRICS_PREFIX + '/tick').increment()
time.sleep(_MIN_POLLING_PERIOD)
logging.debug('Exit event loop.')
# Stopped - stop all the client threads, stop listening, then signal
# that shutdown is complete.
for thread in self._client_threads:
thread.stop()
try:
self._connection_listener.close()
except Exception as e:
logging.error('Error stopping pool service: %r', e)
raise
finally:
# Clean up the container pool.
self._pool.cleanup()
# Make sure state is consistent.
self._stop_event.set()
self._stop_event = None
self._running = False
metrics.Counter(METRICS_PREFIX + '/service_stopped').increment()
logging.debug('Container pool stopped.')

    def stop(self):
        """Requests that the service stop.

        @return: A threading.Event that is set once the main service loop has
                 exited and cleanup is complete.
        """
        self._stop_event = threading.Event()
        return self._stop_event

    def is_running(self):
"""Returns whether or not the service is currently running."""
return self._running

    def get_status(self):
"""Returns a dictionary of values describing the current status."""
status = {}
status['running'] = self._running
status['socket_path'] = self._socket_path
if self._running:
status['pool capacity'] = self._pool.capacity
status['pool size'] = self._pool.size
status['pool worker count'] = self._pool.worker_count
status['pool errors'] = self._pool.errors.qsize()
status['client thread count'] = len(self._client_threads)
return status

    def _handle_incoming_connections(self):
        """Checks for connections and spawns sub-threads to handle requests."""
connection = self._connection_listener.get_connection()
if connection is not None:
# Spawn a thread to deal with the new connection.
thread = _ClientThread(self, self._pool, connection)
thread.start()
self._client_threads.append(thread)
thread_count = len(self._client_threads)
metrics.Counter(METRICS_PREFIX + '/client_threads'
).increment_by(thread_count)
logging.debug('Client thread count: %d', thread_count)

    def _cleanup_closed_connections(self):
"""Cleans up dead client threads."""
# We don't need to lock because all operations on self._client_threads
# take place on the main thread.
self._client_threads = [t for t in self._client_threads if t.is_alive()]


class _ClientThread(threading.Thread):
    """A class that handles communication with a pool client.

    Use a thread-per-connection model instead of select()/poll() for a few
reasons:
- the number of simultaneous clients is not expected to be high enough for
select or poll to really pay off.
- one thread per connection is more robust - if a single client somehow
crashes its communication thread, that will not affect the other
communication threads or the main pool service.
"""

    def __init__(self, service, pool, connection):
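        """Initializes the client thread.

        @param service: The parent Service; used to shut the service down when
                        a SHUTDOWN message is received.
        @param pool: The container pool from which GET requests are served.
        @param connection: The client connection; it is expected to support
                           poll(), recv(), send() and close().
        """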
self._service = service
self._pool = pool
self._connection = connection
self._running = False
super(_ClientThread, self).__init__(name='client_thread')

    def run(self):
"""Handles messages coming in from clients.

        The thread main loop monitors the connection and handles incoming
messages. Polling is used so that the loop condition can be checked
regularly - this enables the thread to exit cleanly if required.
Any kind of error will exit the event loop and close the connection.
"""
logging.debug('Start event loop.')
try:
self._running = True
while self._running:
# Poll and deal with messages every second. The timeout enables
# the thread to exit cleanly when stop() is called.
if self._connection.poll(1):
try:
msg = self._connection.recv()
except (AttributeError,
ImportError,
IndexError,
pickle.UnpicklingError) as e:
# All of these can occur while unpickling data.
logging.error('Error while receiving message: %r', e)
# Exit if an error occurs
break
except EOFError:
# EOFError means the client closed the connection. This
# is not an error - just exit.
break
try:
response = self._handle_message(msg)
# Always send the response, even if it's None. This
# provides more consistent client-side behaviour.
self._connection.send(response)
except error.UnknownMessageTypeError as e:
# The message received was a valid python object, but
# not a valid Message.
logging.error('Message error: %s', e)
# Exit if an error occurs
break
except EOFError:
# EOFError means the client closed the connection early.
# TODO(chromium:794685): Return container to pool.
logging.error('Client closed connection before return.')
break
finally:
# Always close the connection.
logging.debug('Exit event loop.')
self._connection.close()

    def stop(self):
"""Stops the client thread."""
self._running = False

    def _handle_message(self, msg):
"""Handles incoming messages.

        @param msg: The incoming message to be handled.
@return: A pickleable object (or None) that should be sent back to the
client.
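        @raises error.UnknownMessageTypeError: If msg is not a Message
                instance, or if its message type has no registered handler.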
"""
# Only handle Message objects.
if not isinstance(msg, message.Message):
raise error.UnknownMessageTypeError(
'Invalid message class %s' % type(msg))
# Use a dispatch table to simulate switch/case.
handlers = {
message.ECHO: self._echo,
message.GET: self._get,
message.SHUTDOWN: self._shutdown,
message.STATUS: self._status,
}
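        # For illustration: a GET message whose args are
        # {'id': some_id, 'timeout': 0} is dispatched below as
        # self._get(id=some_id, timeout=0).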
try:
return handlers[msg.type](**msg.args)
except KeyError:
raise error.UnknownMessageTypeError(
'Invalid message type %s' % msg.type)

    def _echo(self, msg):
"""Handles ECHO messages.

        @param msg: A string that will be echoed back to the client.
@return: The message, for echoing back to the client.
"""
# Just echo the message back, for testing aliveness.
logging.debug('Echo: %r', msg)
return msg

    def _shutdown(self):
"""Handles SHUTDOWN messages.

        @return: An ACK message. This function is synchronous (i.e. it blocks,
and only returns the ACK when shutdown is complete).
"""
logging.debug('Received shutdown request.')
# Request shutdown. Wait for the service to actually stop before
# sending the response.
self._service.stop().wait()
logging.debug('Service shutdown complete.')
return message.ack()

    def _status(self):
"""Handles STATUS messages.

        @return: The result of the service status call.
"""
logging.debug('Received status request.')
return self._service.get_status()

    def _get(self, id, timeout):
"""Gets a container from the pool.

        @param id: A ContainerId to assign to the new container.
@param timeout: A timeout (in seconds) to wait for the pool. If a
container is not available from the pool within the
given period, None will be returned.
        @return: A container from the pool, or None if no container became
                 available within the given timeout.
"""
logging.debug('Received get request (id=%s)', id)
container = self._pool.get(timeout)
# Assign an ID to the container as soon as it is removed from the pool.
# This associates the container with the process to which it will be
# handed off.
if container is not None:
logging.debug(
'Assigning container (name=%s, id=%s)', container.name, id)
container.id = id
else:
logging.debug('No container (id=%s)', id)
metrics.Counter(METRICS_PREFIX + '/container_requests',
field_spec=[ts_mon.BooleanField('success')]
).increment(fields={'success': (container is not None)})
return container