#!/usr/bin/python
#
# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""System Monitor.
This program monitors the health of Chrome OS devices in the AutoTest testbed.
Classes:
Monitor - The Monitor is responsible for managing the overall process of
keeping an updated status of each host available to AutoTest.
RemoteWorker - responsible for SSHing to remote hosts to gather resources.
Resource - maintains all of the resources that are monitored, and methods to
parse their data for consumption by RRDTool.
RRD - maintains all interfaces to RRDTool, including graph definitions, and
methods to create, update, and graph resources.
TestBed - a global class used to hold configuration data and data collected
from each remote host. Additionally, formatted data for RRD will be kept
associated with each host, and some general information about the update
process of each remote host.
Usage:
The following options are supported:
--webdir: Systemhealth web directory.
--url: URL for landing page.
--datadir: Non-NFS directory for RRD files.
--graph: Create 1, 4, and 24 hour graphs for each host.
--all_graphs: Create all graphs for each host.
--html: Build HTML pages for hosts.
--update: Collect data from hosts.
--skip_at_status: Don't collect data about hosts from autotest CLI.
--timeout: Seconds to wait for remote commands to complete.
--log_file: Write log messages to specified log file.
--skip_console: Do not write log messages to the console.
--verbose: Set the logging level to debug.
--cli: Autotest CLI executable location.
--acl: Autotest ACL Group to query for host machines.
--label: Only run on hosts with the specified label.
--status: Only run on hosts with the specified status.
--user: Only run on hosts with the specified user.
Arguments should be space separated.
"""
__author__ = ('kdlucas@gmail.com (Kelly Lucas) & '
'pauldean@google.com (Paul Pendlebury)')
__version__ = '3.10'
import cPickle
import datetime
import json
import logging
import optparse
import os
import shutil
import sys
import time
import traceback
import chromeos_test_common
from chromeos_test import autotest_util
from chromeos_test import common_util
from chromeos_test import mp_log_util
from chromeos_test import mp_thread_pool as tp
import IPy
class RemoteWorker(object):
"""Obtain resource data from remote hosts using monitor_remote_worker.py."""
def __init__(self, hostname, platform, testbed):
"""Inits RemoteWorker with hostname and test configuration.
Args:
hostname: string, hostname of AutoTest host.
platform: string, platform of hostname.
testbed: testbed object for this run.
"""
self.h = hostname
self.platform = platform
self.tb = testbed
# Set up some dictionaries for each host.
self.host_data = {}
self.host_data['rrddata'] = {} # Formatted data.
self.host_data['status'] = False
self.host_data['time'] = None
for v in self.tb.version:
self.host_data[v] = {}
self.host_data[v]['PTR'] = None
def Run(self, logger):
"""Method called into by thread pool."""
logger.debug('Starting host %s.', self.h)
updated_html_needed = False
data_file = os.path.join(self.tb.datadir, 'hosts', self.h, 'data.pkl')
local_script = os.path.join(chromeos_test_common.CURRENT_DIR,
'monitor_remote_worker.py')
remote_script = '/tmp/monitor_remote_worker.py'
try:
if self.tb.update:
if not os.path.isfile(local_script):
logger.error('Script file %s missing.', local_script)
return
# Copy script
try:
common_util.RemoteCopy(self.h, 'root', local_script, remote_script)
except common_util.ChromeOSTestError:
logger.error('Skipping unreachable host %s.', self.h)
return
# Run Script
try:
output = common_util.RemoteCommand(self.h, 'root', remote_script,
output=True)
self.host_data = cPickle.loads(output)
except common_util.ChromeOSTestError:
logger.exception('Error running script on host %s.', self.h)
self.host_data['status'] = 'CollectionError'
else:
# If it exists, load saved host_data.
if os.path.isfile(data_file):
with open(data_file, 'rb') as in_file:
self.host_data = cPickle.load(in_file)
advisor = Resource()
if ((self.tb.update or self.tb.graph) and
self.host_data['status'] != 'CollectionError'):
updated_html_needed = self.UpdateRelease(logger)
advisor.ProcessHostRRD(self.h, self.host_data, self.tb, logger)
if self.tb.html:
advisor.BuildHTML(self.h, self.platform, self.host_data, self.tb,
updated_html_needed)
# Save successful host data so it can be loaded later.
if self.tb.update and self.host_data['status'] == 'True':
# rrd data is no longer needed, so don't save it.
del self.host_data['rrddata']
self.host_data['rrddata'] = {}
with open(data_file, 'wb') as out_file:
cPickle.dump(self.host_data, out_file, cPickle.HIGHEST_PROTOCOL)
# Broad exception handling happens here because this is the entry point
# for this thread/process; if we let an exception go unhandled we
# wouldn't see it from the main thread and would miss any notification
# of problems.
except (KeyboardInterrupt, SystemExit):
logging.exception('Shutdown requested.')
sys.exit(1)
except Exception:
logging.exception('Unexpected Exception on %s', self.h)
raise
logger.debug('Finished host %s.', self.h)
def UpdateRelease(self, logger):
"""Update Release info with most current release versions.
The PTR key points to the most recent released version. This will also
preserve the last known release version in case the host is down.
Args:
logger: multiprocess logger
Returns:
True/False if new HTML files are needed for this host.
"""
rrd_dir = os.path.join(self.tb.datadir, 'hosts', self.h, 'rrd')
# Check if the host directory exists, if not create it.
common_util.MakedirsExisting(rrd_dir)
update_html = False
for v in self.tb.version:
update_file = False
relfile = os.path.join(rrd_dir, v)
tmpfile = os.path.join(rrd_dir, v + '.tmp')
if os.path.isfile(relfile):
lines = []
try:
# Use a context manager so the file is closed even if the read fails;
# otherwise rf would be unbound in a finally clause when open() raises.
with open(relfile, 'r') as rf:
lines = rf.readlines()
except IOError, e:
logger.error('Parsing release file %s\n%s', relfile, e)
# Iterate over a copy because entries may be removed from lines below.
for line in lines[:]:
fields = line.split('=')
# The correct format will have two strings separated by =.
if len(fields) == 2:
if fields[0] == 'PTR':
if self.host_data[v]['PTR']:
if self.host_data[v]['PTR'] != fields[1]:
# Most recent version has changed.
update_file = True
lines.pop(lines.index(line))
self.host_data[v][self.tb.time] = (self.host_data[v]['PTR'])
else:
# Host is down so use last known value.
self.host_data[v]['PTR'] = (fields[1].strip())
else:
self.host_data[v][fields[0]] = (fields[1].strip())
elif len(line) > 3:
# This means the release file has the wrong format, so
# we'll just write a new one with current values.
update_file = True
lines.pop(lines.index(line))
else:
# If we get here then it's probably a blank line.
update_file = True
lines.pop(lines.index(line))
if update_file:
update_html = True
logger.debug('Updating %s', relfile)
shutil.move(relfile, tmpfile)
# Put the most recent update in the new file, and point the
# PTR key at it.
lines.append('%s=%s\n' % (self.tb.time, self.host_data[v]['PTR']))
lines.append('PTR=%s' % self.host_data[v]['PTR'])
try:
with open(relfile, 'w') as rf:
for line in lines:
rf.write(line)
except IOError, e:
logger.error('Writing %s\n%s', relfile, e)
else:
# Create a new release file, as it does not exist.
if self.host_data[v]['PTR']:
update_html = True
logger.info('Creating new %s', relfile)
try:
with open(relfile, 'w') as rf:
rf.write('%s=%s\n' % (self.tb.time, self.host_data[v]['PTR']))
rf.write('PTR=%s' % self.host_data[v]['PTR'])
except IOError, e:
logger.error('Writing %s\n%s', relfile, e)
self.host_data[v][self.tb.time] = (self.host_data[v]['PTR'])
return update_html
class TestBed(object):
"""Used to hold all of the global variables."""
def __init__(self, options):
"""Inits TestBed with run options.
Args:
options: Command line args for this run.
"""
# Save run start time.
self.time = int(time.time())
# Setup logging.
self.options = options
self.logfile = options.log_file
logger = logging.getLogger()
mp_log_util.InitializeLogging(logger, **vars(options))
# Warn and exit if SSH is not in the environment.
if 'SSH_AGENT_PID' not in os.environ:
logger.error('SSH_AGENT_PID not in environment, ssh commands will fail '
'to execute.')
sys.exit(1)
# Verify RRD is installed where we expect it.
if not os.path.exists('/usr/bin/rrdtool'):
logger.error('RRD is not installed to /usr/bin/rrdtool. Run \'sudo '
'apt-get install rrdtool\'.')
sys.exit(1)
# Assign TestBed values used for RRD and HTML pages.
self.version = ['ec_firmware', 'firmware', 'release']
self.rrdtimes = ['-1hours', '-4hours', '-24hours', '-1week', '-1month',
'-1year']
# Make sure directories exist to hold status and data files.
run_dir = os.path.normpath('/tmp/systemhealth')
common_util.MakedirsExisting(run_dir)
# Default status files. Used to prevent more than one instance from
# running at the same time.
self.update_runfile = os.path.join(run_dir, 'update.running')
self.graph_runfile = os.path.join(run_dir, 'graph.running')
# Requested run actions.
self.graph = options.graph
self.all_graphs = options.all_graphs
self.html = options.html
self.update = options.update
self.timeout = options.timeout
self.skip_at_status = options.skip_at_status
# Machine setup.
self.webdir = options.webdir
self.url = options.url
self.datadir = options.datadir
# Output some debug info.
self.run_description = str(os.getpid()) + ':'
if self.update:
self.run_description += ' Update'
if self.graph:
self.run_description += ' Graph'
if self.all_graphs:
self.run_description += '_All'
if self.html:
self.run_description += ' HTML'
if not self.skip_at_status:
self.run_description += ' Status'
mp_log_util.LogWithHeader('Start ' + self.run_description, logger)
class Monitor(object):
"""Main class used to manage the monitoring of remote hosts.
This class is used to determine the current status of hosts in the AutoTest
testbed. AutoTest will be queried to populate self.afe_hosts. A list of
RemoteWorker objects is then submitted to MultiProcWorkPool, which queries
each host to gather resource data.
"""
def __init__(self, testbed, options):
"""Monitor will use config data from TestBed."""
self.tb = testbed
self.options = options
self.mp_wp = tp.MultiProcWorkPool()
self.afe_hosts = autotest_util.GetHostData(self.tb.options.cli,
self.tb.options.acl,
self.tb.options.label,
self.tb.options.user,
self.tb.options.status)
self.host_status = []
def UpdateStatus(self):
"""Update data from all monitored hosts."""
# Don't attempt work when no hosts are known.
if not self.afe_hosts:
return
# Record known host status from Autotest
if not self.options.skip_at_status:
self.RecordAutotestHostStatus(self.afe_hosts)
# Create instance of RemoteWorker class for every host from atest.
self.host_status = [RemoteWorker(host, self.afe_hosts[host]['platform'],
self.tb) for host in self.afe_hosts.keys()]
# Submit RemoteWorker items to thread pool.
self.host_status = self.mp_wp.ExecuteWorkItems(
self.host_status, 'Run', provide_logger=True,
logger_init_callback=mp_log_util.InitializeLogging,
**vars(self.options))
loglevel = logging.getLogger().getEffectiveLevel()
if loglevel == logging.DEBUG:
for worker in self.host_status:
logging.debug('%s status is %s/%s', worker.h,
worker.host_data['status'],
self.afe_hosts[worker.h]['status'])
def RecordAutotestHostStatus(self, hosts):
"""Record Autotest status of all hosts in rrd files.
Args:
hosts: Dictionary of host information from autotest cli.
"""
# Maps a host status string to an index in an array.
status_key = {'Repairing': 0, 'Verifying': 1, 'Repair_Failed': 2,
'Running': 3, 'Cleaning': 4, 'Ready': 5, 'Pending': 6}
# lab_status holds the lab data in the format rrd needs. The special
# netbook_ALL platform is the sum of all the platforms.
lab_status = {'netbook_ALL': [0] * len(status_key)}
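# After the loop below, lab_status maps each platform to per-status counts
# indexed by status_key, e.g. (illustrative platform names and counts only):
#   {'netbook_ALL': [1, 0, 2, 5, 1, 12, 3],
#    'netbook_MARIO': [0, 0, 1, 2, 0, 6, 1], ...}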
# Loop through all the hosts recording their status in lab_status
for host in hosts:
status = hosts[host]['status'].replace(' ', '_')
platform = hosts[host]['platform']
if platform not in lab_status:
lab_status[platform] = [0] * len(status_key)
if status in status_key:
lab_status[platform][status_key[status]] += 1
lab_status['netbook_ALL'][status_key[status]] += 1
else:
logging.error('Status=%s is not a known status in %s', status, status_key)
Resource().ProcessAutotestRRD(lab_status, self.tb, logging.getLogger())
# Save data for later analysis in a pickled data file.
for platform in lab_status:
data_folder = os.path.join(self.tb.datadir, 'hosts', platform)
common_util.MakedirsExisting(data_folder)
data_file = os.path.join(data_folder, 'utilization.pkl')
platform_data = {}
if os.path.isfile(data_file):
with open(data_file, 'rb') as in_file:
platform_data = cPickle.load(in_file)
date_entry = datetime.datetime.strftime(datetime.datetime.now(),
'%Y_%m_%d_%H_%M_%S')
platform_data[date_entry] = lab_status[platform]
with open(data_file, 'wb') as out_file:
cPickle.dump(platform_data, out_file, cPickle.HIGHEST_PROTOCOL)
@staticmethod
def ValidIP(address):
"""Verify address is a valid IP address.
Args:
address: string.
Returns:
boolean: True = valid IP address, False = not valid IP address.
"""
octets = address.split('.')
if len(octets) != 4:
return False
for octet in octets:
# Check isdigit() first so non-numeric octets (e.g. hostnames containing
# three dots) don't raise ValueError from int().
if not octet.isdigit() or not 0 <= int(octet) <= 255:
return False
return True
def SortAFEHosts(self, afelist):
"""Sort AFE host list by IP address.
Args:
afelist: list of AFE host objects.
Returns:
newlist: list of sorted AFE host objects.
"""
iplist = []
hostlist = []
for h in afelist:
if self.ValidIP(h):
iplist.append(h)
else:
hostlist.append(h)
templist = [(IPy.IP(h).int(), h) for h in iplist]
templist.sort()
newlist = [h[1] for h in templist]
hostlist.sort()
newlist.extend(hostlist)
return newlist
def BuildLandingPage(self):
"""Build the initial HTML landing page with links to all hosts."""
logging.debug('Building Landing Page')
sorted_hosts = []
downhosts = 0
down_repair = 0
down_running = 0
down_ready = 0
down_other = 0
readyhosts = 0
ready_repair = 0
ready_running = 0
ready_ready = 0
ready_other = 0
scripthosts = 0
script_repair = 0
script_running = 0
script_ready = 0
script_other = 0
hostlist = self.afe_hosts.keys()
sorted_ip = self.SortAFEHosts(hostlist)
# Create a dictionary to easily map host name to host result.
host_results = {}
for host in self.host_status:
host_results[host.h] = host
# Put hosts that are down first.
for h in sorted_ip:
insert_offset = 0
# Up hosts.
if host_results[h].host_data['status'] == 'True':
readyhosts += 1
insert_offset += downhosts + scripthosts
if self.afe_hosts[h]['status'] == 'Repair':
insert_offset += ready_repair
ready_repair += 1
self.afe_hosts[h]['color'] = '#96BAC6'
self.afe_hosts[h]['status_string'] = 'Repair'
elif self.afe_hosts[h]['status'] == 'Running':
insert_offset += ready_repair + ready_running
ready_running += 1
self.afe_hosts[h]['color'] = '#BBD9EE'
self.afe_hosts[h]['status_string'] = 'Running'
elif self.afe_hosts[h]['status'] == 'Ready':
insert_offset += ready_repair + ready_running + ready_ready
ready_ready += 1
self.afe_hosts[h]['color'] = '#FFFFFF'
self.afe_hosts[h]['status_string'] = 'Ready'
else:
insert_offset += (ready_repair + ready_running + ready_ready +
ready_other)
ready_other += 1
self.afe_hosts[h]['color'] = '#788D9A'
status_str = self.afe_hosts[h]['status']
self.afe_hosts[h]['status_string'] = status_str
# Up hosts with python problems.
elif host_results[h].host_data['status'] == 'CollectionError':
scripthosts += 1
insert_offset += downhosts
if self.afe_hosts[h]['status'] == 'Repair':
insert_offset += script_repair
script_repair += 1
self.afe_hosts[h]['color'] = '#245403'
self.afe_hosts[h]['status_string'] = 'ScriptError/Repair'
elif self.afe_hosts[h]['status'] == 'Running':
insert_offset += script_repair + script_running
script_running += 1
self.afe_hosts[h]['color'] = '#406331'
self.afe_hosts[h]['status_string'] = 'ScriptError/Running'
elif self.afe_hosts[h]['status'] == 'Ready':
insert_offset += (script_repair + script_running + script_ready)
script_ready += 1
self.afe_hosts[h]['color'] = '#5E924E'
self.afe_hosts[h]['status_string'] = 'ScriptError/Ready'
else:
insert_offset += (script_repair + script_running + script_ready +
script_other)
script_other += 1
self.afe_hosts[h]['color'] = '#183503'
status_str = 'ScriptError/' + self.afe_hosts[h]['status']
self.afe_hosts[h]['status_string'] = status_str
# Down hosts.
else:
downhosts += 1
if self.afe_hosts[h]['status'] == 'Repair':
insert_offset += down_repair
down_repair += 1
self.afe_hosts[h]['color'] = '#867146'
self.afe_hosts[h]['status_string'] = 'Down/Repair'
elif self.afe_hosts[h]['status'] == 'Running':
insert_offset += down_repair + down_running
down_running += 1
self.afe_hosts[h]['color'] = '#E5DCBD'
self.afe_hosts[h]['status_string'] = 'Down/Running'
elif self.afe_hosts[h]['status'] == 'Ready':
insert_offset += down_repair + down_running + down_ready
down_ready += 1
self.afe_hosts[h]['color'] = '#D6C085'
self.afe_hosts[h]['status_string'] = 'Down/Ready'
else:
insert_offset += (down_repair + down_running + down_ready +
down_other)
down_other += 1
self.afe_hosts[h]['color'] = '#4F4126'
status_str = 'Down/' + self.afe_hosts[h]['status']
self.afe_hosts[h]['status_string'] = status_str
sorted_hosts.insert(insert_offset, h)
# If we didn't connect to the host this run, load data from
# the last successful run.
if host_results[h].host_data['status'] != 'True':
data_file = os.path.join(self.tb.datadir, 'hosts', h, 'data.pkl')
if os.path.isfile(data_file):
with open(data_file, 'rb') as in_file:
host_results[h].host_data = cPickle.load(in_file)
# Create symlink to the log file if it does not exist.
log_filename = os.path.join(self.tb.webdir, 'monitor.log')
if not os.path.isfile(log_filename):
try:
os.symlink(self.tb.logfile, log_filename)
except OSError, e:
logging.error('Linking to logfile\n%s', e)
land_page_file = os.path.join(self.tb.webdir, 'index.html')
# The temp file is used so that there is always a viewable html page
# while the new page is being built.
land_page_temp = os.path.join(self.tb.webdir, 'temp.html')
f = open(land_page_temp, 'w')
f.write('<HTML><HEAD>')
f.write('<LINK REL="stylesheet" TYPE="text/css" HREF="table.css">')
f.write('<TITLE>AutoTest System Health Check</TITLE></HEAD>')
f.write('<BODY>')
f.write('<img src="chrome.png" style="float:left;"/>')
f.write('<table style="float: right">')
f.write(('<TR><TD><a href=%s>%s</a><TD>Hosts<TD>Ready<TD>Repair<TD>'
'Running<TD>Other') % ('monitor.log', 'Log File'))
f.write('<TR><TD>Total')
f.write('<TD>%d<TD>%d<TD>%d<TD>%d<TD>%d' % (
downhosts + readyhosts + scripthosts,
down_ready + ready_ready + script_ready,
down_repair + ready_repair + script_repair,
down_running + ready_running + script_running,
down_other + ready_other + script_other))
f.write('<TR><TD>Inaccessible')
f.write('<TD>%d<TD>%d<TD>%d<TD>%d<TD>%d' % (downhosts, down_ready,
down_repair, down_running,
down_other))
f.write('<TR><TD>Script Error')
f.write('<TD>%d<TD>%d<TD>%d<TD>%d<TD>%d' % (scripthosts, script_ready,
script_repair, script_running,
script_other))
f.write('<TR><TD>Accessible')
f.write('<TD>%d<TD>%d<TD>%d<TD>%d<TD>%d' % (readyhosts, ready_ready,
ready_repair, ready_running,
ready_other))
f.write('</table>')
f.write('<center><H1>CAUTOTEST Testbed</H1>')
f.write('<H2>System Health</H2>')
plat_graph = os.path.join(self.tb.url, 'hosts', 'netbook_ALL',
'utilization-24hours.png')
f.write('<BR><BR><img src=%s ><BR><BR>' % plat_graph)
f.write('<table>')
f.write('<CAPTION>Hosts last updated: %s</CAPTION>' % time.strftime(
'%d %b %Y - %I:%M:%S %p %Z', time.localtime()))
f.write('<TR><TH>Hostname<TH>Status<TH>Labels<TH>Last Update')
f.write('<TH>Release<TH>Health</TR>')
for h in sorted_hosts:
link_dir = 'hosts/' + h
web_dir = os.path.join(self.tb.webdir, 'hosts', h)
common_util.MakedirsExisting(web_dir, 0755)
fqn = 'http://cautotest.corp.google.com/'
view_host = 'afe/#tab_id=view_host&object_id=%s' % h
hlink = fqn + view_host
f.write('<tr bgcolor=%s><th>' % self.afe_hosts[h]['color'])
f.write('<a href=%s>%s</a></th>' % (hlink, h))
f.write('<td><em>%s</em>' % self.afe_hosts[h]['status_string'])
f.write('<td>')
f.write('<em><b>%s</b></em><br>' % self.afe_hosts[h]['platform'])
for label in self.afe_hosts[h]['labels']:
f.write('%s<br>' % label)
f.write('<td>%s' % host_results[h].host_data['time'])
if host_results[h].host_data['release']['PTR']:
f.write('<td>%s' % host_results[h].host_data['release']['PTR'])
else:
f.write('<td>Unknown')
index_file = os.path.join(web_dir, 'index.html')
if os.path.isfile(index_file):
f.write('<td><a href=%s' % self.tb.url)
f.write('%s/index.html target="_blank">' % link_dir)
f.write('health</a></td>')
else:
f.write('<td>None</td>')
f.write('</table><p>\n</center>\n</BODY></HTML>')
f.close()
shutil.copyfile(land_page_temp, land_page_file)
os.chmod(land_page_file, 0644)
class Resource(object):
"""Contains structures and methods to collect health data on hosts.
For each resource in self.resources, there must also be a corresponding
method to format the data into what RRDTool expects.
"""
def __init__(self):
self.resources = [
'battery',
'boot',
'cpu',
'load',
'memory',
'network',
'power',
'temp',
'uptime'
]
self.fs = [
'rootfsA_space',
'rootfsA_inodes',
'rootfsA_stats',
'rootfsB_space',
'rootfsB_inodes',
'rootfsB_stats',
'stateful_space',
'stateful_inodes',
'stateful_stats'
]
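# Each file system entry above is graphed with one of the generic
# fs_space/fs_inode/fs_stat data definitions (see ProcessHostRRD).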
self.resources += self.fs
@staticmethod
def ProcessAutotestRRD(hosts, testbed, logger):
"""Process formatted data into RRD files for each host in hosts.
Args:
hosts: dictionary of platforms and their data for rrd.
testbed: configuration data for this run.
logger: logger for this process/thread.
"""
for platform in hosts:
rrd_dir = os.path.join(testbed.datadir, 'hosts', platform, 'rrd')
web_dir = os.path.join(testbed.webdir, 'hosts', platform)
common_util.MakedirsExisting(rrd_dir)
common_util.MakedirsExisting(web_dir, 0755)
rrd_list = []
for v in hosts[platform]:
rrd_list += [str(v)]
rrd_dict = {'rrddata': {'utilization': rrd_list}}
rrd = RRD('utilization', platform, rrd_dir, web_dir, testbed)
if not os.path.exists(rrd.rrdfile):
rrd.Create(logger, 600)
rrd.Update(rrd_dict, logger)
rrd.Graph(rrd_dict, logger, False)
def ProcessHostRRD(self, hostname, hostdata, testbed, logger):
"""Process formatted data into RRD files for host hostname.
Args:
hostname: string, hostname of AutoTest host.
hostdata: raw data from the host.
testbed: configuration data for this run.
logger: logger for this process/thread.
"""
rrd_dir = os.path.join(testbed.datadir, 'hosts', hostname, 'rrd')
web_dir = os.path.join(testbed.webdir, 'hosts', hostname)
common_util.MakedirsExisting(rrd_dir)
common_util.MakedirsExisting(web_dir, 0755)
for r in self.resources:
dk = None # datakey only needs to be set if it's a file system.
if r in self.fs:
if '_space' in r:
dk = 'fs_space'
elif '_inode' in r:
dk = 'fs_inode'
elif '_stat' in r:
dk = 'fs_stat'
rrd = RRD(r, hostname, rrd_dir, web_dir, testbed, dk)
if not os.path.exists(rrd.rrdfile):
rrd.Create(logger)
if testbed.update:
logger.debug('Updating %s for host %s', r, hostname)
rrd.Update(hostdata, logger)
if testbed.graph:
logger.debug('Building %s graphs for %s', r, hostname)
rrd.Graph(hostdata, logger)
def BuildHTML(self, hostname, platform, hostdata, testbed,
update_needed=False):
"""Create HTML pages for to display the graphs.
Args:
hostname: string, hostname of AutoTest host.
platform: string, platform of hostname.
hostdata: raw data from the host.
testbed: configuration data for this run.
update_needed: boolean, True if new HTML must be written because the
existing pages contain stale release info.
"""
web_dir = os.path.join(testbed.webdir, 'hosts', hostname)
plat_dir = os.path.join(testbed.url, 'hosts', platform)
index_file = os.path.join(web_dir, 'index.html')
# If the index file exists, and the release info hasn't changed, skip.
if os.path.isfile(index_file) and not update_needed:
return
mainindex = testbed.url + 'index.html'
resource_list = sorted(self.resources)
html_file = {}
for t in testbed.rrdtimes:
html_file[t] = hostname + t + '.html'
pathname = {}
for name in html_file:
pathname[name] = os.path.join(web_dir, html_file[name])
# Create directory for html/graphs.
common_util.MakedirsExisting(web_dir, 0755)
# Create HTML files for each time period we are graphing.
for path in pathname:
f = open(pathname[path], 'w')
f.write('<HTML><HEAD>')
f.write('<center><TITLE>%s System Health</TITLE></HEAD>' % hostname)
f.write('<BODY><H1>%s System Health</H1>' % hostname)
for v in testbed.version:
f.write('<H4>%s: %s</H4>' % (v, hostdata[v]['PTR']))
for t in testbed.rrdtimes:
f.write('<a href="%s">%s</a>&nbsp;<b>|</b>' % (html_file[t], t))
f.write('<a href="%s">SystemHealth Home</a>' % mainindex)
f.write('<p><HR>')
plat_graph = os.path.join(plat_dir, 'utilization' + path + '.png')
f.write('<img src=%s ><BR><BR>' % plat_graph)
f.write('<table border=1 bgcolor=#EEEEEE>')
newrow = True
for r in resource_list:
if newrow:
f.write('<tr>')
f.write('<td>%s<br><a href=%s.html>' % (r, r))
f.write('<img src=%s%s.png width=475 height=250></a></td>' % (r, path))
if newrow:
newrow = False
else:
f.write('</tr>\n')
newrow = True
f.write('</table><p>\n')
f.write('</center>\n')
f.write('<H5>Last Update: %s</H5>' % hostdata['time'])
f.write('</BODY></HTML>')
f.close()
os.chmod(pathname[path], 0644)
# Set default landing page to 24-hour graphs
if not os.path.isfile(index_file):
os.symlink(pathname[testbed.rrdtimes[2]], index_file)
# Create HTML files for each resource for all time periods.
for r in resource_list:
rrdfile = os.path.join(web_dir, r + '.html')
f = open(rrdfile, 'w')
f.write('<HTML><HEAD>')
f.write('<center><TITLE>%s %s Resources</TITLE></HEAD>' % (hostname, r))
f.write('<BODY><H1>%s %s Resources</H1>' % (hostname, r))
for v in testbed.version:
f.write('<H4>%s: %s</H4>' % (v, hostdata[v]['PTR']))
f.write('<table border=5 bgcolor=#B5B5B5>')
f.write('<tr>')
for t in testbed.rrdtimes:
f.write('<td><a href="#%s"><b>%s</b></a>' % (t, t))
f.write('</table>')
f.write('<HR>')
f.write('<table border=1 bgcolor=#EEEEEE>')
for t in testbed.rrdtimes:
f.write('<tr><td><a name="%s"><img src=%s%s.png>' % (t, r, t))
f.write('</a></td></tr>\n')
f.write('</table><p>\n')
f.write('</center>\n')
f.write('<H5>Last Update: %s</H5>' % hostdata['time'])
f.write('</BODY></HTML>')
f.close()
os.chmod(rrdfile, 0644)
class RRD(object):
"""The class to create and update RRD data stores and graph them.
This class should be used to access all of the functions of RRDTool. It will
create the data files, update them, and create graphs/charts based on that
data. Datakey is needed when we are using the same data definitions for many
items of the same type, like file systems.
"""
def __init__(self, rrdname, hostname, rrd_dir, web_dir, tb, datakey=None):
"""Inits RRD class.
Args:
rrdname: string, item name (should match a key from Resource.resources).
hostname: string, hostname of the machine.
rrd_dir: string, directory for rrd files.
web_dir: string, directory for generated graphs.
tb: TestBed object for this run.
datakey: string, overrides which data definition to use.
"""
self.tb = tb
self.rrdtool = '/usr/bin/rrdtool'
self.rrd_dir = rrd_dir
self.web_dir = web_dir
self.rrdname = rrdname
self.hostname = hostname
rrd_filename = rrdname + '.rrd'
self.rrdfile = os.path.join(self.rrd_dir, rrd_filename)
file_system = 'Unknown'
if not datakey:
datakey = rrdname
else:
fields = rrdname.split('_')
if fields[0]:
file_system = fields[0]
self.dd = json.load(open(os.path.join(sys.path[0], 'rrd.json')))[datakey]
self.dd['title'] %= {'host': self.hostname, 'file_system': file_system}
def Create(self, logger, step=600):
"""Create an empty RRD file.
Args:
logger: Multiprocess logger.
step: Default rrdtool step.
Returns:
boolean: True = Success, False = failure.
"""
stime = int(time.time()) - 5 * 86400
rrd_suffix = ['RRA:AVERAGE:0.5:1:576', 'RRA:AVERAGE:0.5:6:672',
'RRA:AVERAGE:0.5:24:732', 'RRA:AVERAGE:0.5:144:1460']
rrd_cmd = [self.rrdtool, 'create', self.rrdfile, '--start', str(stime),
'--step', str(step)]
for ds in self.dd['items']:
ds_str = 'DS:%s:%s:%s:%s:%s' % (ds, self.dd['type'], self.dd['heartbeat'],
self.dd['min'], self.dd['max'])
rrd_cmd.append(ds_str)
rrd_cmd += rrd_suffix
# Convert the rrd_cmd to a string with space separated commands.
exec_str = ' '.join(rrd_cmd)
try:
common_util.RunCommand(exec_str)
except common_util.ChromeOSTestError:
logger.error('Executing: "%s".', exec_str)
return False
return True
def Update(self, hostdata, logger):
"""Update an existing RRD file.
Args:
hostdata: dictionary of raw data from this host.
logger: logger for this process/thread
Returns:
boolean: True = Success, False = errors.
"""
if self.rrdname in hostdata['rrddata']:
data_count = len(hostdata['rrddata'][self.rrdname])
if data_count == 0:
logger.debug('Key "%s" empty in hostdata for host %s.', self.rrdname,
self.hostname)
return False
if data_count < 2:
data = 'N:' + hostdata['rrddata'][self.rrdname][0]
else:
data = 'N:' + ':'.join(hostdata['rrddata'][self.rrdname])
rrd_cmd = [self.rrdtool, 'update', self.rrdfile, data]
exec_str = ' '.join(rrd_cmd)
try:
common_util.RunCommand(exec_str)
except common_util.ChromeOSTestError:
logger.error('Executing: "%s".', exec_str)
return False
return True
else:
logger.debug('Key "%s" not found in hostdata for host %s.', self.rrdname,
self.hostname)
return False
def Graph(self, hostdata, logger, include_updates=True, file_prefix=''):
"""Create a graph of a tracked resource.
Args:
hostdata: Dictionary of raw data from this host.
logger: Logger for this process/thread.
include_updates: Include firmware update history in graphs.
file_prefix: String to append to front of graph file names.
"""
width = '850'
height = '300'
end = 'now'
rcolor = {'release': '#9966FF', 'firmware': '#990033',
'ec_firmware': '#009933'}
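# When update history is included, each version change becomes a VRULE in
# the graph command, e.g. (illustrative values):
#   VRULE:1302036000#9966FF:"04/05/11 14\:40 release=0.12.433.57 \n"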
if self.tb.all_graphs:
rrdtimes = self.tb.rrdtimes
else:
rrdtimes = self.tb.rrdtimes[:3]
for rrdtime in rrdtimes:
png_filename = file_prefix + self.rrdname + rrdtime + '.png'
png_file = os.path.join(self.web_dir, png_filename)
title = self.dd['title'] + ' ' + rrdtime + '"'
rrd_cmd = [self.rrdtool, 'graph', png_file, '--imgformat PNG', '-s',
rrdtime, '--end', end, '--width', width, '--height', height,
'--vertical-label', self.dd['units'], '--title', title]
for ds in self.dd['items']:
rrd_cmd.append('DEF:%s=%s:%s:AVERAGE' % (ds, self.rrdfile, ds))
rrd_cmd += self.dd['graph']
if include_updates:
rrd_cmd.append('COMMENT:"Release History \\s"')
rrd_cmd.append('COMMENT:"=============== \\n"')
for v in self.tb.version:
sorted_items = []
for k in hostdata[v]:
if k != 'PTR':
sorted_items.append(k)
sorted_items.sort()
for i in sorted_items:
# Get a date/time string to display, localtime requires
# a float, so convert i to float.
fw_datetime = time.strftime('%D %H\\:%M', time.localtime(float(i)))
# Need to escape any ':' for RRDTool.
filter_val = (hostdata[v][i].replace(':', '\\:'))
if not self.tb.all_graphs:
# Insert vertical lines for release and firmware updates.
vrule = 'VRULE:%s%s:"%s %s=%s \\n"' % (i, rcolor[v], fw_datetime,
v, filter_val)
else:
# On week-or-longer graphs, only insert a release comment. There are
# too many vertical lines on the longer graphs to see anything else.
vrule = 'COMMENT:"%s %s=%s \\n"' % (fw_datetime, v, filter_val)
rrd_cmd.append(vrule)
exec_str = ' '.join(rrd_cmd)
try:
common_util.RunCommand(exec_str)
except common_util.ChromeOSTestError:
logger.error('Executing: "%s".', exec_str)
if os.path.isfile(png_file):
os.chmod(png_file, 0644)
def ParseArgs():
"""Parse all command line options."""
homedir = os.environ['HOME']
datadir = os.path.normpath('/usr/local/google/%s/systemhealth' % homedir)
systemhealth_webdir = os.path.join(homedir, 'www', 'systemhealth')
logfile = os.path.join(systemhealth_webdir, 'monitor.log')
default_url = 'http://www/~%s/systemhealth/' % os.environ['USER']
parser = optparse.OptionParser(version=__version__)
# Args for describing the environment of the server machine.
group = optparse.OptionGroup(
parser, title='Server Configuration',
description=('Options specifying the layout of this machine.'))
group.add_option(
'-w', '--webdir',
help='Systemhealth web directory [default: %default]',
default=systemhealth_webdir,
dest='webdir')
group.add_option(
'-u', '--url',
help='URL for landing page [default: %default]',
default=default_url,
dest='url')
group.add_option(
'-d', '--datadir',
help='Non-NFS directory for RRD. [default: %default]',
default=datadir,
dest='datadir')
parser.add_option_group(group)
# Args for describing logging.
mp_log_util.AddOptions(parser)
# Args for selecting hosts from Autotest.
autotest_util.AddOptions(parser)
# Args for describing what work to perform.
group = optparse.OptionGroup(
parser, title='Run Configuration',
description=('Options specifying what actions the script will perform.'))
group.add_option(
'--graph',
help=('Create 1, 4, & 24 hour graphs for each host [default: %default]'),
default=False, action='store_true', dest='graph')
group.add_option(
'--all_graphs',
help='Create all graphs for each host [default: %default]',
default=False, action='store_true', dest='all_graphs')
group.add_option(
'--html',
help='Build HTML pages for hosts [default: %default]',
default=False, action='store_true', dest='html')
group.add_option(
'--update',
help='Collect data from hosts [default: %default]',
default=False, action='store_true', dest='update')
group.add_option(
'--timeout',
help=('Timeout for remote commands to complete [default: %default]'),
default=30, dest='timeout')
group.add_option(
'--skip_at_status',
help=('Skip collecting host status from the autotest CLI '
'[default: %default]'),
default=False, action='store_true', dest='skip_at_status')
parser.add_option_group(group)
options = parser.parse_args()[0]
if not options.log_file:
options.log_file = logfile
if options.all_graphs:
options.graph = True
if not (options.graph or options.html or options.update):
parser.error('Must specify at least one of the --graph, --html, or '
'--update options.')
# Create required directories if they don't exist.
common_util.MakedirsExisting(options.datadir)
common_util.MakedirsExisting(options.webdir, 0755)
common_util.MakedirsExisting(os.path.join(options.webdir, 'hosts'), 0755)
return options
def CheckRun(action, tb):
"""Check the run status of monitor.py, and add/remove run files.
This function ensures that only one instance is running with either the
graph or update option.
Args:
action: string, indicates if monitor.py is starting or stopping.
tb: TestBed object for this run.
"""
if action == 'start':
if tb.update:
if os.path.isfile(tb.update_runfile):
logging.info('Exiting, already running with update option')
sys.exit(1)
else:
try:
open(tb.update_runfile, 'w').close()
except IOError, e:
logging.error('Opening %s\n%s', tb.update_runfile, e)
if tb.graph:
if os.path.isfile(tb.graph_runfile):
logging.info('Exiting, already running with graph option')
sys.exit(1)
else:
try:
open(tb.graph_runfile, 'w').close()
except IOError, e:
logging.error('Opening %s\n%s', tb.graph_runfile, e)
elif action == 'stop':
if tb.update:
if os.path.isfile(tb.update_runfile):
try:
os.remove(tb.update_runfile)
except IOError, e:
logging.error('Removing %s\n%s', tb.update_runfile, e)
if tb.graph:
if os.path.isfile(tb.graph_runfile):
try:
os.remove(tb.graph_runfile)
except IOError, e:
logging.error('Removing %s\n%s', tb.graph_runfile, e)
else:
logging.error('Unknown option passed to CheckRun(): %s', action)
sys.exit(1)
def main():
start_time = time.time()
options = ParseArgs()
test_bed = TestBed(options)
CheckRun('start', test_bed)
try:
sysmon = Monitor(test_bed, options)
if not sysmon.afe_hosts:
logging.error('No hosts found, nothing to do, exiting.')
sys.exit(1)
sysmon.UpdateStatus()
if test_bed.update:
sysmon.BuildLandingPage()
runtime = time.time() - start_time
msg = 'End [ %s ] Runtime %d seconds' % (test_bed.run_description, runtime)
mp_log_util.LogWithHeader(msg, symbol='-')
except (KeyboardInterrupt, SystemExit):
logging.error('Shutdown requested.')
sys.exit(1)
except Exception, e:
logging.error('Exception: %s\n%s', e, traceback.format_exc())
raise
finally:
CheckRun('stop', test_bed)
os.chmod(options.log_file, 0755)
if __name__ == '__main__':
main()