| #!/usr/bin/python |
| # |
| # Copyright (c) 2011 The Chromium OS Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """System Monitor. |
| |
| This program monitors the health of Chrome OS devices in the AutoTest testbed. |
| |
| Classes: |
| |
| Monitor - The Monitor is responsible for managing the overall process of |
| keeping an updated status of each host available to AutoTest. |
| |
| RemoteWorker - responsible for SSHing to remote hosts to gather resources. |
| |
| Resource - maintains all of the resources that are monitored, and methods to |
| parse their data for consumption by RRDTool. |
| |
| RRD - maintains all interfaces to RRDTool, including graph definitions, and |
| methods to create, update, and graph resources. |
| |
| TestBed - a global class used to hold configuration data and data collected |
| from each remote host. Additionally, formatted data for RRD will be kept |
| associated with each host, and some general information about the update |
| process of each remote host. |
| |
| |
| Usage: |
| The following options are supported: |
| --webdir: Systemhealth web directory. |
| --url: URL for landing page. |
| --datadir: Non-NFS directory for RRD files. |
| |
| --graph: Create 1, 4, and 24 hour graphs for each host. |
| --all_graphs: Create all graphs for each host. |
| --html: Build HTML pages for hosts. |
| --update: Collect data from hosts. |
| --skip_at_status: Don't collect data about hosts from autotest CLI. |
  --timeout: Seconds to wait for remote commands to complete.
| |
| --log_file: Write log messages to specified log file. |
| --skip_console: Do not write log messages to the console. |
| --verbose: Set the logging level to debug. |
| |
| --cli: Autotest CLI executable location. |
| --acl: Autotest ACL Group to query for host machines. |
| --label: Only run on hosts with the specified label. |
| --status: Only run on hosts with the specified status. |
| --user: Only run on hosts with the specified user. |
| |
| Arguments should be space separated. |
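
  Example invocation (the combination of flags is illustrative):
    ./monitor.py --update --graph --html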
| """ |
| |
| __author__ = ('kdlucas@gmail.com (Kelly Lucas) & ' |
| 'pauldean@google.com (Paul Pendlebury)') |
| __version__ = '3.10' |
| |
| |
| import cPickle |
| import datetime |
| import json |
| import logging |
| import optparse |
| import os |
| import shutil |
| import sys |
| import time |
| import traceback |
| |
| import chromeos_test_common |
| from chromeos_test import autotest_util |
| from chromeos_test import common_util |
| from chromeos_test import mp_log_util |
| from chromeos_test import mp_thread_pool as tp |
| import IPy |
| |
| |
| class RemoteWorker(object): |
| """Obtain resource data from remote hosts using monitor_remote_worker.py.""" |
| |
| def __init__(self, hostname, platform, testbed): |
| """Inits RemoteWorker with hostname and test configuration. |
| |
| Args: |
| hostname: string, hostname of AutoTest host. |
| platform: string, platform of hostname. |
| testbed: testbed object for this run. |
| """ |
| self.h = hostname |
| self.platform = platform |
| self.tb = testbed |
| |
| # Set up some dictionaries for each host. |
| self.host_data = {} |
| self.host_data['rrddata'] = {} # Formatted data. |
| self.host_data['status'] = False |
| self.host_data['time'] = None |
| for v in self.tb.version: |
| self.host_data[v] = {} |
| self.host_data[v]['PTR'] = None |
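
    # monitor_remote_worker.py returns a pickled dict of the same shape; a
    # rough sketch (values illustrative):
    #   {'status': 'True', 'time': '2011-06-01 12:00:00',
    #    'rrddata': {'cpu': [...], ...},
    #    'release': {'PTR': '0.12.433.269'}, ...}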
| |
| def Run(self, logger): |
| """Method called into by thread pool.""" |
| |
| logger.debug('Starting host %s.', self.h) |
| updated_html_needed = False |
| data_file = os.path.join(self.tb.datadir, 'hosts', self.h, 'data.pkl') |
| local_script = os.path.join(chromeos_test_common.CURRENT_DIR, |
| 'monitor_remote_worker.py') |
| remote_script = '/tmp/monitor_remote_worker.py' |
| |
| try: |
| if self.tb.update: |
| if not os.path.isfile(local_script): |
| logger.error('Script file %s missing.', local_script) |
| return |
| |
| # Copy script |
| try: |
| common_util.RemoteCopy(self.h, 'root', local_script, remote_script) |
| except common_util.ChromeOSTestError: |
| logger.error('Skipping unreachable host %s.', self.h) |
| return |
| # Run Script |
| try: |
| output = common_util.RemoteCommand(self.h, 'root', remote_script, |
| output=True) |
| self.host_data = cPickle.loads(output) |
| except common_util.ChromeOSTestError: |
| logger.exception('Error running script on host %s.', self.h) |
| self.host_data['status'] = 'CollectionError' |
| else: |
| # If it exists, load saved host_data. |
| if os.path.isfile(data_file): |
| with open(data_file, 'rb') as in_file: |
| self.host_data = cPickle.load(in_file) |
| |
| advisor = Resource() |
| if ((self.tb.update or self.tb.graph) and |
| self.host_data['status'] != 'CollectionError'): |
| updated_html_needed = self.UpdateRelease(logger) |
| advisor.ProcessHostRRD(self.h, self.host_data, self.tb, logger) |
| if self.tb.html: |
| advisor.BuildHTML(self.h, self.platform, self.host_data, self.tb, |
| updated_html_needed) |
| |
| # Save successful host data so it can be loaded later. |
| if self.tb.update and self.host_data['status'] == 'True': |
| # rrd data is no longer needed, so don't save it. |
        self.host_data['rrddata'] = {}
| with open(data_file, 'wb') as out_file: |
| cPickle.dump(self.host_data, out_file, cPickle.HIGHEST_PROTOCOL) |
| |
    # Lots of exception handling happens here. This is the entry point for
    # this thread/process, and if we let an exception go unhandled we would
    # never see it from the main thread and would miss any notification of
    # problems.
| except (KeyboardInterrupt, SystemExit): |
      logger.exception('Shutdown requested.')
| sys.exit(1) |
| except Exception: |
      logger.exception('Unexpected Exception on %s', self.h)
| raise |
| logger.debug('Finished host %s.', self.h) |
| |
| def UpdateRelease(self, logger): |
| """Update Release info with most current release versions. |
| |
| The PTR key points to the most recent released version. This will also |
| preserve the last known release version in case the host is down. |
| |
| Args: |
| logger: multiprocess logger |
| |
| Returns: |
| True/False if new HTML files are needed for this host. |
| """ |
| rrd_dir = os.path.join(self.tb.datadir, 'hosts', self.h, 'rrd') |
| # Check if the host directory exists, if not create it. |
| common_util.MakedirsExisting(rrd_dir) |
| |
| update_html = False |
| for v in self.tb.version: |
| update_file = False |
| relfile = os.path.join(rrd_dir, v) |
| tmpfile = os.path.join(rrd_dir, v + '.tmp') |
| if os.path.isfile(relfile): |
        lines = []
        try:
          with open(relfile, 'r') as rf:
            lines = rf.readlines()
        except IOError, e:
          logger.error('Parsing release file %s\n%s', relfile, e)
| |
        # Iterate over a copy since entries may be removed from lines below.
        for line in lines[:]:
| fields = line.split('=') |
| # The correct format will have two strings separated by =. |
| if len(fields) == 2: |
| if fields[0] == 'PTR': |
| if self.host_data[v]['PTR']: |
| if self.host_data[v]['PTR'] != fields[1]: |
| # Most recent version has changed. |
| update_file = True |
| lines.pop(lines.index(line)) |
| self.host_data[v][self.tb.time] = (self.host_data[v]['PTR']) |
| else: |
| # Host is down so use last known value. |
| self.host_data[v]['PTR'] = (fields[1].strip()) |
| else: |
| self.host_data[v][fields[0]] = (fields[1].strip()) |
| elif len(line) > 3: |
| # This means the release file has the wrong format, so |
| # we'll just write a new one with current values. |
| update_file = True |
| lines.pop(lines.index(line)) |
| else: |
            # If we get here, then it's probably a blank line.
| update_file = True |
| lines.pop(lines.index(line)) |
| |
| if update_file: |
| update_html = True |
| logger.debug('Updating %s', relfile) |
| shutil.move(relfile, tmpfile) |
          # Put the most recent update in the new file, and make the PTR key
          # point to it.
| lines.append('%s=%s\n' % (self.tb.time, self.host_data[v]['PTR'])) |
| lines.append('PTR=%s' % self.host_data[v]['PTR']) |
          try:
            with open(relfile, 'w') as rf:
              for line in lines:
                rf.write(line)
          except IOError, e:
            logger.error('Writing %s\n%s', relfile, e)
| else: |
| # Create a new release file, as it does not exist. |
| if self.host_data[v]['PTR']: |
| update_html = True |
| logger.info('Creating new %s', relfile) |
          try:
            with open(relfile, 'w') as rf:
              rf.write('%s=%s\n' % (self.tb.time, self.host_data[v]['PTR']))
              rf.write('PTR=%s' % self.host_data[v]['PTR'])
          except IOError, e:
            logger.error('Writing %s\n%s', relfile, e)
| |
| self.host_data[v][self.tb.time] = (self.host_data[v]['PTR']) |
| return update_html |
| |
| |
| class TestBed(object): |
| """Used to hold all of the global variables.""" |
| |
| def __init__(self, options): |
| """Inits TestBed with run options. |
| |
| Args: |
| options: Command line args for this run. |
| """ |
| # Save run start time. |
| self.time = int(time.time()) |
| |
| # Setup logging. |
| self.options = options |
| self.logfile = options.log_file |
| |
| logger = logging.getLogger() |
| mp_log_util.InitializeLogging(logger, **vars(options)) |
| |
| # Warn and exit if SSH is not in the environment. |
    if 'SSH_AGENT_PID' not in os.environ:
| logger.error('SSH_AGENT_PID not in environment, ssh commands will fail ' |
| 'to execute.') |
| sys.exit(1) |
| |
| # Verify RRD is installed where we expect it. |
| if not os.path.exists('/usr/bin/rrdtool'): |
| logger.error('RRD is not installed to /usr/bin/rrdtool. Run \'sudo ' |
| 'apt-get install rrdtool\'.') |
| sys.exit(1) |
| |
| # Assign TestBed values used for RRD and HTML pages. |
| self.version = ['ec_firmware', 'firmware', 'release'] |
| self.rrdtimes = ['-1hours', '-4hours', '-24hours', '-1week', '-1month', |
| '-1year'] |
| |
| # Make sure directories exist to hold status and data files. |
| run_dir = os.path.normpath('/tmp/systemhealth') |
| common_util.MakedirsExisting(run_dir) |
| |
| # Default status files. Used to prevent more than one instance from |
| # running at the same time. |
| self.update_runfile = os.path.join(run_dir, 'update.running') |
| self.graph_runfile = os.path.join(run_dir, 'graph.running') |
| |
| # Requested run actions. |
| self.graph = options.graph |
| self.all_graphs = options.all_graphs |
| self.html = options.html |
| self.update = options.update |
| self.timeout = options.timeout |
| self.skip_at_status = options.skip_at_status |
| |
| # Machine setup. |
| self.webdir = options.webdir |
| self.url = options.url |
| self.datadir = options.datadir |
| |
| # Output some debug info. |
| self.run_description = str(os.getpid()) + ':' |
| if self.update: |
| self.run_description += ' Update' |
| if self.graph: |
| self.run_description += ' Graph' |
| if self.all_graphs: |
| self.run_description += '_All' |
| if self.html: |
| self.run_description += ' HTML' |
| if not self.skip_at_status: |
| self.run_description += ' Status' |
| mp_log_util.LogWithHeader('Start ' + self.run_description, logger) |
| |
| |
| class Monitor(object): |
| """Main class used to manage the monitoring of remote hosts. |
| |
| This class is used to determine the current status of hosts in the AutoTest |
  testbed. AutoTest is queried to populate self.afe_hosts. A list of
  RemoteWorker objects is then built and submitted to MultiProcWorkPool,
  which queries each host to gather resource data.
| """ |
| |
| def __init__(self, testbed, options): |
| """Monitor will use config data from TestBed.""" |
| self.tb = testbed |
| self.options = options |
| self.mp_wp = tp.MultiProcWorkPool() |
| self.afe_hosts = autotest_util.GetHostData(self.tb.options.cli, |
| self.tb.options.acl, |
| self.tb.options.label, |
| self.tb.options.user, |
| self.tb.options.status) |
| self.host_status = [] |
| |
| def UpdateStatus(self): |
| """Update data from all monitored hosts.""" |
| |
| # Don't attempt work when no hosts are known. |
| if not self.afe_hosts: |
| return |
| |
| # Record known host status from Autotest |
| if not self.options.skip_at_status: |
| self.RecordAutotestHostStatus(self.afe_hosts) |
| |
| # Create instance of RemoteWorker class for every host from atest. |
| self.host_status = [RemoteWorker(host, self.afe_hosts[host]['platform'], |
| self.tb) for host in self.afe_hosts.keys()] |
| |
| # Submit RemoteWorker items to thread pool. |
| self.host_status = self.mp_wp.ExecuteWorkItems( |
| self.host_status, 'Run', provide_logger=True, |
| logger_init_callback=mp_log_util.InitializeLogging, |
| **vars(self.options)) |
| |
| loglevel = logging.getLogger().getEffectiveLevel() |
| if loglevel == logging.DEBUG: |
| for worker in self.host_status: |
| logging.debug('%s status is %s/%s', worker.h, |
| worker.host_data['status'], |
| self.afe_hosts[worker.h]['status']) |
| |
| def RecordAutotestHostStatus(self, hosts): |
| """Record Autotest status of all hosts in rrd files. |
| |
| Args: |
| hosts: Dictionary of host information from autotest cli. |
| """ |
| |
| # Maps a host status string to an index in an array. |
| status_key = {'Repairing': 0, 'Verifying': 1, 'Repair_Failed': 2, |
| 'Running': 3, 'Cleaning': 4, 'Ready': 5, 'Pending': 6} |
| |
| # lab_status holds the lab data in the format rrd needs. The special |
| # netbook_ALL platform is the sum of all the platforms. |
| lab_status = {'netbook_ALL': [0] * len(status_key)} |
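    # After the loop below, lab_status['netbook_ALL'] might look like
    # [2, 0, 1, 5, 0, 12, 3] (counts illustrative): one count per status,
    # indexed by status_key above.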
| |
| # Loop through all the hosts recording their status in lab_status |
| for host in hosts: |
| status = hosts[host]['status'].replace(' ', '_') |
| platform = hosts[host]['platform'] |
| |
| if platform not in lab_status: |
| lab_status[platform] = [0] * len(status_key) |
| if status in status_key: |
| lab_status[platform][status_key[status]] += 1 |
| lab_status['netbook_ALL'][status_key[status]] += 1 |
| else: |
| logging.error('Status=%s not a known status of %s', status, status_key) |
| |
| Resource().ProcessAutotestRRD(lab_status, self.tb, logging.getLogger()) |
| |
| # Save data for later analysis in a pickled data file. |
| for platform in lab_status: |
| data_folder = os.path.join(self.tb.datadir, 'hosts', platform) |
| common_util.MakedirsExisting(data_folder) |
| |
| data_file = os.path.join(data_folder, 'utilization.pkl') |
| platform_data = {} |
| if os.path.isfile(data_file): |
| with open(data_file, 'rb') as in_file: |
| platform_data = cPickle.load(in_file) |
| |
| date_entry = datetime.datetime.strftime(datetime.datetime.now(), |
| '%Y_%m_%d_%H_%M_%S') |
| platform_data[date_entry] = lab_status[platform] |
| with open(data_file, 'wb') as out_file: |
| cPickle.dump(platform_data, out_file, cPickle.HIGHEST_PROTOCOL) |
| |
| @staticmethod |
| def ValidIP(address): |
| """Verify address is a valid IP address. |
| |
| Args: |
| address: string. |
| Returns: |
| boolean: True = valid IP address, False = not valid IP address. |
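
    Example (illustrative):
      ValidIP('172.16.0.5') -> True
      ValidIP('build1.lab.example.com') -> False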
| """ |
| octets = address.split('.') |
| if len(octets) != 4: |
| return False |
    for octet in octets:
      # Reject non-numeric components (e.g. pieces of a hostname).
      if not octet.isdigit():
        return False
      if not 0 <= int(octet) <= 255:
        return False
| return True |
| |
| def SortAFEHosts(self, afelist): |
| """Sort AFE host list by IP address. |
| |
| Args: |
| afelist: list of AFE host objects. |
| Returns: |
| newlist: list of sorted AFE host objects. |
| """ |
| iplist = [] |
| hostlist = [] |
| |
| for h in afelist: |
| if self.ValidIP(h): |
| iplist.append(h) |
| else: |
| hostlist.append(h) |
| |
| templist = [(IPy.IP(h).int(), h) for h in iplist] |
| templist.sort() |
| newlist = [h[1] for h in templist] |
| hostlist.sort() |
| newlist.extend(hostlist) |
| |
| return newlist |
| |
| def BuildLandingPage(self): |
| """Build the initial HTML landing page with links to all hosts.""" |
| logging.debug('Building Landing Page') |
| sorted_hosts = [] |
| downhosts = 0 |
| down_repair = 0 |
| down_running = 0 |
| down_ready = 0 |
| down_other = 0 |
| |
| readyhosts = 0 |
| ready_repair = 0 |
| ready_running = 0 |
| ready_ready = 0 |
| ready_other = 0 |
| |
| scripthosts = 0 |
| script_repair = 0 |
| script_running = 0 |
| script_ready = 0 |
| script_other = 0 |
| |
| hostlist = self.afe_hosts.keys() |
| sorted_ip = self.SortAFEHosts(hostlist) |
| |
| # Create a dictionary to easily map host name to host result. |
| host_results = {} |
| for host in self.host_status: |
| host_results[host.h] = host |
| |
    # Order hosts: down hosts first, then hosts with script errors, then
    # accessible hosts; within each group order by AFE status.
| for h in sorted_ip: |
| insert_offset = 0 |
| # Up hosts. |
| if host_results[h].host_data['status'] == 'True': |
| readyhosts += 1 |
| insert_offset += downhosts + scripthosts |
| if self.afe_hosts[h]['status'] == 'Repair': |
| insert_offset += ready_repair |
| ready_repair += 1 |
| self.afe_hosts[h]['color'] = '#96BAC6' |
| self.afe_hosts[h]['status_string'] = 'Repair' |
| elif self.afe_hosts[h]['status'] == 'Running': |
| insert_offset += ready_repair + ready_running |
| ready_running += 1 |
| self.afe_hosts[h]['color'] = '#BBD9EE' |
| self.afe_hosts[h]['status_string'] = 'Running' |
| elif self.afe_hosts[h]['status'] == 'Ready': |
| insert_offset += ready_repair + ready_running + ready_ready |
| ready_ready += 1 |
| self.afe_hosts[h]['color'] = '#FFFFFF' |
| self.afe_hosts[h]['status_string'] = 'Ready' |
| else: |
| insert_offset += (ready_repair + ready_running + ready_ready + |
| ready_other) |
| ready_other += 1 |
| self.afe_hosts[h]['color'] = '#788D9A' |
| status_str = self.afe_hosts[h]['status'] |
| self.afe_hosts[h]['status_string'] = status_str |
| # Up hosts with python problems. |
| elif host_results[h].host_data['status'] == 'CollectionError': |
| scripthosts += 1 |
| insert_offset += downhosts |
| if self.afe_hosts[h]['status'] == 'Repair': |
| insert_offset += script_repair |
| script_repair += 1 |
| self.afe_hosts[h]['color'] = '#245403' |
| self.afe_hosts[h]['status_string'] = 'ScriptError/Repair' |
| elif self.afe_hosts[h]['status'] == 'Running': |
| insert_offset += script_repair + script_running |
| script_running += 1 |
| self.afe_hosts[h]['color'] = '#406331' |
| self.afe_hosts[h]['status_string'] = 'ScriptError/Running' |
| elif self.afe_hosts[h]['status'] == 'Ready': |
| insert_offset += (script_repair + script_running + script_ready) |
| script_ready += 1 |
| self.afe_hosts[h]['color'] = '#5E924E' |
| self.afe_hosts[h]['status_string'] = 'ScriptError/Ready' |
| else: |
| insert_offset += (script_repair + script_running + script_ready + |
| script_other) |
| script_other += 1 |
| self.afe_hosts[h]['color'] = '#183503' |
| status_str = 'ScriptError/' + self.afe_hosts[h]['status'] |
| self.afe_hosts[h]['status_string'] = status_str |
| # Down hosts. |
| else: |
| downhosts += 1 |
| if self.afe_hosts[h]['status'] == 'Repair': |
| insert_offset += down_repair |
| down_repair += 1 |
| self.afe_hosts[h]['color'] = '#867146' |
| self.afe_hosts[h]['status_string'] = 'Down/Repair' |
| elif self.afe_hosts[h]['status'] == 'Running': |
| insert_offset += down_repair + down_running |
| down_running += 1 |
| self.afe_hosts[h]['color'] = '#E5DCBD' |
| self.afe_hosts[h]['status_string'] = 'Down/Running' |
| elif self.afe_hosts[h]['status'] == 'Ready': |
| insert_offset += down_repair + down_running + down_ready |
| down_ready += 1 |
| self.afe_hosts[h]['color'] = '#D6C085' |
| self.afe_hosts[h]['status_string'] = 'Down/Ready' |
| else: |
| insert_offset += (down_repair + down_running + down_ready + |
| down_other) |
| down_other += 1 |
| self.afe_hosts[h]['color'] = '#4F4126' |
| status_str = 'Down/' + self.afe_hosts[h]['status'] |
| self.afe_hosts[h]['status_string'] = status_str |
| sorted_hosts.insert(insert_offset, h) |
| |
| # If we didn't connect to the host this run, load data from |
| # the last successful run. |
| if host_results[h].host_data['status'] != 'True': |
| data_file = os.path.join(self.tb.datadir, 'hosts', h, 'data.pkl') |
| if os.path.isfile(data_file): |
| with open(data_file, 'rb') as in_file: |
| host_results[h].host_data = cPickle.load(in_file) |
| |
| # Create symlink to the log file if it does not exist. |
| log_filename = os.path.join(self.tb.webdir, 'monitor.log') |
| if not os.path.isfile(log_filename): |
| try: |
| os.symlink(self.tb.logfile, log_filename) |
| except OSError, e: |
| logging.error('Linking to logfile\n%s', e) |
| land_page_file = os.path.join(self.tb.webdir, 'index.html') |
    # A temp file is used so that a viewable html page always exists while
    # the new page is being built.
| land_page_temp = os.path.join(self.tb.webdir, 'temp.html') |
| f = open(land_page_temp, 'w') |
| f.write('<HTML><HEAD>') |
| f.write('<LINK REL="stylesheet" TYPE="text/css" HREF="table.css">') |
| f.write('<TITLE>AutoTest System Health Check</TITLE></HEAD>') |
| f.write('<BODY>') |
| f.write('<img src="chrome.png" style="float:left;"/>') |
| f.write('<table style="float: right">') |
| f.write(('<TR><TD><a href=%s>%s</a><TD>Hosts<TD>Ready<TD>Repair<TD>' |
| 'Running<TD>Other') % ('monitor.log', 'Log File')) |
| f.write('<TR><TD>Total') |
| f.write('<TD>%d<TD>%d<TD>%d<TD>%d<TD>%d' % ( |
| downhosts + readyhosts + scripthosts, |
| down_ready + ready_ready + script_ready, |
| down_repair + ready_repair + script_repair, |
| down_running + ready_running + script_running, |
| down_other + ready_other + script_other)) |
| f.write('<TR><TD>Inaccessible') |
| f.write('<TD>%d<TD>%d<TD>%d<TD>%d<TD>%d' % (downhosts, down_ready, |
| down_repair, down_running, |
| down_other)) |
| f.write('<TR><TD>Script Error') |
| f.write('<TD>%d<TD>%d<TD>%d<TD>%d<TD>%d' % (scripthosts, script_ready, |
| script_repair, script_running, |
| script_other)) |
| f.write('<TR><TD>Accessible') |
| f.write('<TD>%d<TD>%d<TD>%d<TD>%d<TD>%d' % (readyhosts, ready_ready, |
| ready_repair, ready_running, |
| ready_other)) |
| f.write('</table>') |
| f.write('<center><H1>CAUTOTEST Testbed</H1>') |
| f.write('<H2>System Health</H2>') |
| plat_graph = os.path.join(self.tb.url, 'hosts', 'netbook_ALL', |
| 'utilization-24hours.png') |
| f.write('<BR><BR><img src=%s ><BR><BR>' % plat_graph) |
| f.write('<table>') |
| f.write('<CAPTION>Hosts last updated: %s</CAPTION>' % time.strftime( |
| '%d %b %Y - %I:%M:%S %p %Z', time.localtime())) |
| f.write('<TR><TH>Hostname<TH>Status<TH>Labels<TH>Last Update') |
| f.write('<TH>Release<TH>Health</TR>') |
| for h in sorted_hosts: |
| link_dir = 'hosts/' + h |
| web_dir = os.path.join(self.tb.webdir, 'hosts', h) |
| common_util.MakedirsExisting(web_dir, 0755) |
| fqn = 'http://cautotest.corp.google.com/' |
| view_host = 'afe/#tab_id=view_host&object_id=%s' % h |
| hlink = fqn + view_host |
| f.write('<tr bgcolor=%s><th>' % self.afe_hosts[h]['color']) |
| f.write('<a href=%s>%s</a></th>' % (hlink, h)) |
| f.write('<td><em>%s</em>' % self.afe_hosts[h]['status_string']) |
| f.write('<td>') |
| f.write('<em><b>%s</b></em><br>' % self.afe_hosts[h]['platform']) |
| for label in self.afe_hosts[h]['labels']: |
| f.write('%s<br>' % label) |
| f.write('<td>%s' % host_results[h].host_data['time']) |
| if host_results[h].host_data['release']['PTR']: |
| f.write('<td>%s' % host_results[h].host_data['release']['PTR']) |
| else: |
| f.write('<td>Unknown') |
| index_file = os.path.join(web_dir, 'index.html') |
| if os.path.isfile(index_file): |
| f.write('<td><a href=%s' % self.tb.url) |
| f.write('%s/index.html target="_blank">' % link_dir) |
| f.write('health</a></td>') |
| else: |
| f.write('<td>None</td>') |
| f.write('</table><p>\n</center>\n</BODY></HTML>') |
| f.close() |
| shutil.copyfile(land_page_temp, land_page_file) |
| os.chmod(land_page_file, 0644) |
| |
| |
| class Resource(object): |
| """Contains structures and methods to collect health data on hosts. |
| |
| For each resource in self.resources, there must also be a corresponding |
| method to format the data into what RRDTool expects. |
| """ |
| |
| def __init__(self): |
| self.resources = [ |
| 'battery', |
| 'boot', |
| 'cpu', |
| 'load', |
| 'memory', |
| 'network', |
| 'power', |
| 'temp', |
| 'uptime' |
| ] |
| self.fs = [ |
| 'rootfsA_space', |
| 'rootfsA_inodes', |
| 'rootfsA_stats', |
| 'rootfsB_space', |
| 'rootfsB_inodes', |
| 'rootfsB_stats', |
| 'stateful_space', |
| 'stateful_inodes', |
| 'stateful_stats' |
| ] |
| self.resources += self.fs |
| |
| @staticmethod |
| def ProcessAutotestRRD(hosts, testbed, logger): |
| """Process formatted data into RRD files for each host in hosts. |
| |
| Args: |
| hosts: dictionary of platforms and their data for rrd. |
| testbed: configuration data for this run. |
| logger: logger for this process/thread. |
| """ |
| for platform in hosts: |
| rrd_dir = os.path.join(testbed.datadir, 'hosts', platform, 'rrd') |
| web_dir = os.path.join(testbed.webdir, 'hosts', platform) |
| |
| common_util.MakedirsExisting(rrd_dir) |
| common_util.MakedirsExisting(web_dir, 0755) |
| |
| rrd_list = [] |
| for v in hosts[platform]: |
| rrd_list += [str(v)] |
| |
| rrd_dict = {'rrddata': {'utilization': rrd_list}} |
| rrd = RRD('utilization', platform, rrd_dir, web_dir, testbed) |
| if not os.path.exists(rrd.rrdfile): |
| rrd.Create(logger, 600) |
| rrd.Update(rrd_dict, logger) |
| rrd.Graph(rrd_dict, logger, False) |
| |
| def ProcessHostRRD(self, hostname, hostdata, testbed, logger): |
| """Process formatted data into RRD files for host hostname. |
| |
| Args: |
| hostname: string, hostname of AutoTest host. |
| hostdata: raw data from the host. |
| testbed: configuration data for this run. |
| logger: logger for this process/thread. |
| """ |
| rrd_dir = os.path.join(testbed.datadir, 'hosts', hostname, 'rrd') |
| web_dir = os.path.join(testbed.webdir, 'hosts', hostname) |
| |
| common_util.MakedirsExisting(rrd_dir) |
| common_util.MakedirsExisting(web_dir, 0755) |
| |
| for r in self.resources: |
| dk = None # datakey only needs to be set if it's a file system. |
| if r in self.fs: |
| if '_space' in r: |
| dk = 'fs_space' |
| elif '_inode' in r: |
| dk = 'fs_inode' |
| elif '_stat' in r: |
| dk = 'fs_stat' |
| |
| rrd = RRD(r, hostname, rrd_dir, web_dir, testbed, dk) |
| if not os.path.exists(rrd.rrdfile): |
| rrd.Create(logger) |
      if testbed.update:
| logger.debug('Updating %s for host %s', r, hostname) |
| rrd.Update(hostdata, logger) |
| if testbed.graph: |
| logger.debug('Building %s graphs for %s', r, hostname) |
| rrd.Graph(hostdata, logger) |
| |
| def BuildHTML(self, hostname, platform, hostdata, testbed, |
| update_needed=False): |
| """Create HTML pages for to display the graphs. |
| |
| Args: |
| hostname: string, hostname of AutoTest host. |
| platform: string, platform of hostname. |
| hostdata: raw data from the host. |
| testbed: configuration data for this run. |
      update_needed: boolean, True if existing HTML is stale.
| """ |
| web_dir = os.path.join(testbed.webdir, 'hosts', hostname) |
| plat_dir = os.path.join(testbed.url, 'hosts', platform) |
| index_file = os.path.join(web_dir, 'index.html') |
| |
| # If the index file exists, and the release info hasn't changed, skip. |
| if os.path.isfile(index_file) and not update_needed: |
| return |
| |
| mainindex = testbed.url + 'index.html' |
| resource_list = [] |
| for r in self.resources: |
| resource_list.append(r) |
| resource_list.sort() |
| |
| html_file = {} |
| for t in testbed.rrdtimes: |
| html_file[t] = hostname + t + '.html' |
| pathname = {} |
| for name in html_file: |
| pathname[name] = os.path.join(web_dir, html_file[name]) |
| |
| # Create directory for html/graphs. |
| common_util.MakedirsExisting(web_dir, 0755) |
| |
| # Create HTML files for each time period we are graphing. |
| for path in pathname: |
| f = open(pathname[path], 'w') |
| f.write('<HTML><HEAD>') |
| f.write('<center><TITLE>%s System Health</TITLE></HEAD>' % hostname) |
| f.write('<BODY><H1>%s System Health</H1>' % hostname) |
| for v in testbed.version: |
| f.write('<H4>%s: %s</H4>' % (v, hostdata[v]['PTR'])) |
| for t in testbed.rrdtimes: |
| f.write('<a href="%s">%s</a> <b>|</b>' % (html_file[t], t)) |
| f.write('<a href="%s">SystemHealth Home</a>' % mainindex) |
| f.write('<p><HR>') |
| plat_graph = os.path.join(plat_dir, 'utilization' + path + '.png') |
| f.write('<img src=%s ><BR><BR>' % plat_graph) |
| f.write('<table border=1 bgcolor=#EEEEEE>') |
| newrow = True |
| for r in resource_list: |
| if newrow: |
| f.write('<tr>') |
| f.write('<td>%s<br><a href=%s.html>' % (r, r)) |
| f.write('<img src=%s%s.png width=475 height=250></a></td>' % (r, path)) |
| if newrow: |
| newrow = False |
| else: |
| f.write('</tr>\n') |
| newrow = True |
| f.write('</table><p>\n') |
| f.write('</center>\n') |
| f.write('<H5>Last Update: %s</H5>' % hostdata['time']) |
| f.write('</BODY></HTML>') |
| f.close() |
| os.chmod(pathname[path], 0644) |
| # Set default landing page to 24-hour graphs |
| if not os.path.isfile(index_file): |
| os.symlink(pathname[testbed.rrdtimes[2]], index_file) |
| |
| # Create HTML files for each resource for all time periods. |
| for r in resource_list: |
| rrdfile = os.path.join(web_dir, r + '.html') |
| f = open(rrdfile, 'w') |
| f.write('<HTML><HEAD>') |
| f.write('<center><TITLE>%s %s Resources</TITLE></HEAD>' % (hostname, r)) |
| f.write('<BODY><H1>%s %s Resources</H1>' % (hostname, r)) |
| for v in testbed.version: |
| f.write('<H4>%s: %s</H4>' % (v, hostdata[v]['PTR'])) |
| f.write('<table border=5 bgcolor=#B5B5B5>') |
| f.write('<tr>') |
| for t in testbed.rrdtimes: |
| f.write('<td><a href="#%s"><b>%s</b></a>' % (t, t)) |
| f.write('</table>') |
| f.write('<HR>') |
| f.write('<table border=1 bgcolor=#EEEEEE>') |
| for t in testbed.rrdtimes: |
| f.write('<tr><td><a name="%s"><img src=%s%s.png>' % (t, r, t)) |
| f.write('</a></td></tr>\n') |
| f.write('</table><p>\n') |
| f.write('</center>\n') |
| f.write('<H5>Last Update: %s</H5>' % hostdata['time']) |
| f.write('</BODY></HTML>') |
| f.close() |
| os.chmod(rrdfile, 0644) |
| |
| |
| class RRD(object): |
| """The class to create and update RRD data stores and graph them. |
| |
| This class should be used to access all of the functions of RRDTool. It will |
| create the data files, update them, and create graphs/charts based on that |
| data. Datakey is needed when we are using the same data definitions for many |
| items of the same type, like file systems. |
| """ |
| |
| def __init__(self, rrdname, hostname, rrd_dir, web_dir, tb, datakey=None): |
| """Inits RRD class. |
| |
| Args: |
      rrdname: string, item name (should match a key from Resource).
| hostname: string, hostname of the machine. |
| rrd_dir: string, directory for rrd files. |
| web_dir: string, directory for generated graphs. |
      tb: TestBed object for this run.
| datakey: string, overrides which data definition to use. |
| """ |
| self.tb = tb |
| self.rrdtool = '/usr/bin/rrdtool' |
| self.rrd_dir = rrd_dir |
| self.web_dir = web_dir |
| self.rrdname = rrdname |
| self.hostname = hostname |
| rrd_filename = rrdname + '.rrd' |
| self.rrdfile = os.path.join(self.rrd_dir, rrd_filename) |
| file_system = 'Unknown' |
| |
| if not datakey: |
| datakey = rrdname |
| else: |
| fields = rrdname.split('_') |
| if fields[0]: |
| file_system = fields[0] |
| |
    with open(os.path.join(sys.path[0], 'rrd.json')) as dd_file:
      self.dd = json.load(dd_file)[datakey]
    self.dd['title'] %= {'host': self.hostname, 'file_system': file_system}
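    # The loaded data definition supplies the fields used here and in
    # Create()/Graph(): 'title', 'items', 'type', 'heartbeat', 'min', 'max',
    # 'units' and 'graph'. A rough sketch of one rrd.json entry (field
    # values illustrative):
    #   "load": {"title": "Load on %(host)s", "units": "load",
    #            "type": "GAUGE", "heartbeat": "1800", "min": "0", "max": "U",
    #            "items": ["load_1m", "load_5m", "load_15m"],
    #            "graph": ["LINE1:load_1m#0000FF:load"]}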
| |
| def Create(self, logger, step=600): |
| """Create an empty RRD file. |
| |
| Args: |
| logger: Multiprocess logger. |
      step: rrdtool step interval in seconds.
| |
| Returns: |
| boolean: True = Success, False = failure. |
| """ |
| |
| stime = int(time.time()) - 5 * 86400 |
| rrd_suffix = ['RRA:AVERAGE:0.5:1:576', 'RRA:AVERAGE:0.5:6:672', |
| 'RRA:AVERAGE:0.5:24:732', 'RRA:AVERAGE:0.5:144:1460'] |
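    # The assembled command looks roughly like (values illustrative):
    #   rrdtool create cpu.rrd --start 1309000000 --step 600
    #     DS:user:GAUGE:1800:0:U ... RRA:AVERAGE:0.5:1:576 ...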
| |
| rrd_cmd = [self.rrdtool, 'create', self.rrdfile, '--start', str(stime), |
| '--step', str(step)] |
| for ds in self.dd['items']: |
| ds_str = 'DS:%s:%s:%s:%s:%s' % (ds, self.dd['type'], self.dd['heartbeat'], |
| self.dd['min'], self.dd['max']) |
| rrd_cmd.append(ds_str) |
| rrd_cmd += rrd_suffix |
| # Convert the rrd_cmd to a string with space separated commands. |
| exec_str = ' '.join(rrd_cmd) |
| try: |
| common_util.RunCommand(exec_str) |
| except common_util.ChromeOSTestError: |
| logger.error('Executing: "%s".', exec_str) |
| return False |
| return True |
| |
| def Update(self, hostdata, logger): |
| """Update an existing RRD file. |
| |
| Args: |
| hostdata: dictionary of raw data from this host. |
| logger: logger for this process/thread |
| |
| Returns: |
| boolean: True = Success, False = errors. |
| """ |
| if self.rrdname in hostdata['rrddata']: |
| data_count = len(hostdata['rrddata'][self.rrdname]) |
| if data_count == 0: |
| logger.debug('Key "%s" empty in hostdata for host %s.', self.rrdname, |
| self.hostname) |
| return False |
| |
| if data_count < 2: |
| data = 'N:' + hostdata['rrddata'][self.rrdname][0] |
| else: |
| data = 'N:' + ':'.join(hostdata['rrddata'][self.rrdname]) |
| rrd_cmd = [self.rrdtool, 'update', self.rrdfile, data] |
| exec_str = ' '.join(rrd_cmd) |
| try: |
| common_util.RunCommand(exec_str) |
| except common_util.ChromeOSTestError: |
| logger.error('Executing: "%s".', exec_str) |
| return False |
| |
| return True |
| else: |
| logger.debug('Key "%s" not found in hostdata for host %s.', self.rrdname, |
| self.hostname) |
| return False |
| |
| def Graph(self, hostdata, logger, include_updates=True, file_prefix=''): |
| """Create a graph of a tracked resource. |
| |
| Args: |
| hostdata: Dictionary of raw data from this host. |
| logger: Logger for this process/thread. |
| include_updates: Include firmware update history in graphs. |
| file_prefix: String to append to front of graph file names. |
| """ |
| width = '850' |
| height = '300' |
| end = 'now' |
| rcolor = {'release': '#9966FF', 'firmware': '#990033', |
| 'ec_firmware': '#009933'} |
| |
| if self.tb.all_graphs: |
| rrdtimes = self.tb.rrdtimes |
| else: |
| rrdtimes = self.tb.rrdtimes[:3] |
| |
| for rrdtime in rrdtimes: |
| png_filename = file_prefix + self.rrdname + rrdtime + '.png' |
| png_file = os.path.join(self.web_dir, png_filename) |
| |
| title = self.dd['title'] + ' ' + rrdtime + '"' |
| |
| rrd_cmd = [self.rrdtool, 'graph', png_file, '--imgformat PNG', '-s', |
| rrdtime, '--end', end, '--width', width, '--height', height, |
| '--vertical-label', self.dd['units'], '--title', title] |
| |
| for ds in self.dd['items']: |
| rrd_cmd.append('DEF:%s=%s:%s:AVERAGE' % (ds, self.rrdfile, ds)) |
| rrd_cmd += self.dd['graph'] |
| if include_updates: |
| rrd_cmd.append('COMMENT:"Release History \\s"') |
| rrd_cmd.append('COMMENT:"=============== \\n"') |
| for v in self.tb.version: |
| sorted_items = [] |
| for k in hostdata[v]: |
| if k != 'PTR': |
| sorted_items.append(k) |
| sorted_items.sort() |
| for i in sorted_items: |
| # Get a date/time string to display, localtime requires |
| # a float, so convert i to float. |
| fw_datetime = time.strftime('%D %H\\:%M', time.localtime(float(i))) |
| # Need to escape any ':' for RRDTool. |
| filter_val = (hostdata[v][i].replace(':', '\\:')) |
| if not self.tb.all_graphs: |
            # Insert vertical lines for release and firmware updates.
| vrule = 'VRULE:%s%s:"%s %s=%s \\n"' % (i, rcolor[v], fw_datetime, |
| v, filter_val) |
| else: |
            # On week-and-longer graphs, only insert a release comment. Too
            # many vertical lines on the longer graphs would obscure
            # everything else.
| vrule = 'COMMENT:"%s %s=%s \\n"' % (fw_datetime, v, filter_val) |
| rrd_cmd.append(vrule) |
| |
| exec_str = ' '.join(rrd_cmd) |
| try: |
| common_util.RunCommand(exec_str) |
| except common_util.ChromeOSTestError: |
| logger.error('Executing: "%s".', exec_str) |
| if os.path.isfile(png_file): |
| os.chmod(png_file, 0644) |
| |
| |
| def ParseArgs(): |
| """Parse all command line options.""" |
| homedir = os.environ['HOME'] |
| datadir = os.path.normpath('/usr/local/google/%s/systemhealth' % homedir) |
| systemhealth_webdir = os.path.join(homedir, 'www', 'systemhealth') |
| logfile = os.path.join(systemhealth_webdir, 'monitor.log') |
  default_url = 'http://www/~%s/systemhealth/' % os.environ['USER']
| |
| parser = optparse.OptionParser(version=__version__) |
| |
| # Args for describing the environment of the server machine. |
| group = optparse.OptionGroup( |
| parser, title='Server Configuration', |
| description=('Options specifying the layout of this machine.')) |
| group.add_option( |
| '-w', '--webdir', |
| help='Systemhealth web directory [default: %default]', |
| default=systemhealth_webdir, |
| dest='webdir') |
| group.add_option( |
| '-u', '--url', |
| help='URL for landing page [default: %default]', |
      default=default_url,
| dest='url') |
| group.add_option( |
| '-d', '--datadir', |
| help='Non-NFS directory for RRD. [default: %default]', |
| default=datadir, |
| dest='datadir') |
| parser.add_option_group(group) |
| |
| # Args for describing logging. |
| mp_log_util.AddOptions(parser) |
| |
| # Args for selecting hosts from Autotest. |
| autotest_util.AddOptions(parser) |
| |
| # Args for describing what work to perform. |
| group = optparse.OptionGroup( |
| parser, title='Run Configuration', |
| description=('Options specifying what actions the script will perform.')) |
| group.add_option( |
| '--graph', |
| help=('Create 1, 4, & 24 hour graphs for each host [default: %default]'), |
| default=False, action='store_true', dest='graph') |
| group.add_option( |
| '--all_graphs', |
| help='Create all graphs for each host [default: %default]', |
| default=False, action='store_true', dest='all_graphs') |
| group.add_option( |
| '--html', |
| help='Build HTML pages for hosts [default: %default]', |
| default=False, action='store_true', dest='html') |
| group.add_option( |
| '--update', |
| help='Collect data from hosts [default: %default]', |
| default=False, action='store_true', dest='update') |
| group.add_option( |
      '--timeout',
| help=('Timeout for remote commands to complete [default: %default]'), |
| default=30, dest='timeout') |
| group.add_option( |
| '--skip_at_status', |
      help=('Skip collecting host status from autotest [default: %default]'),
| default=False, action='store_true', dest='skip_at_status') |
| |
| parser.add_option_group(group) |
| |
| options = parser.parse_args()[0] |
| |
| if not options.log_file: |
| options.log_file = logfile |
| |
| if options.all_graphs: |
| options.graph = True |
| |
| if not (options.graph or options.html or options.update): |
| parser.error('Must specify at least one of the --graph, --html, or ' |
| '--update options.') |
| |
| # Create required directories if they don't exist. |
| common_util.MakedirsExisting(options.datadir) |
| common_util.MakedirsExisting(options.webdir, 0755) |
| common_util.MakedirsExisting(os.path.join(options.webdir, 'hosts'), 0755) |
| |
| return options |
| |
| |
| def CheckRun(action, tb): |
| """Check the run status of monitor.py, and add/remove run files. |
| |
  This function ensures that only one instance is running with either the
  graph or update option at a time.

| Args: |
| action: string, indicates if monitor.py is starting or stopping. |
    tb: TestBed object for this run.
| """ |
| if action == 'start': |
    if tb.update:
| if os.path.isfile(tb.update_runfile): |
| logging.info('Exiting, already running with update option') |
| sys.exit(1) |
| else: |
| try: |
| open(tb.update_runfile, 'w').close() |
| except IOError, e: |
| logging.error('Opening %s\n%s', tb.update_runfile, e) |
| if tb.graph: |
| if os.path.isfile(tb.graph_runfile): |
| logging.info('Exiting, already running with graph option') |
| sys.exit(1) |
| else: |
| try: |
| open(tb.graph_runfile, 'w').close() |
| except IOError, e: |
| logging.error('Opening %s\n%s', tb.graph_runfile, e) |
| elif action == 'stop': |
    if tb.update:
| if os.path.isfile(tb.update_runfile): |
| try: |
| os.remove(tb.update_runfile) |
        except OSError, e:
| logging.error('Removing %s\n%s', tb.update_runfile, e) |
| if tb.graph: |
| if os.path.isfile(tb.graph_runfile): |
| try: |
| os.remove(tb.graph_runfile) |
        except OSError, e:
| logging.error('Removing %s\n%s', tb.graph_runfile, e) |
| else: |
| logging.error('Unknown option passed to CheckRun(): %s', action) |
| sys.exit(1) |
| |
| |
| def main(): |
| start_time = time.time() |
| options = ParseArgs() |
| |
| test_bed = TestBed(options) |
| CheckRun('start', test_bed) |
| try: |
| sysmon = Monitor(test_bed, options) |
| if not sysmon.afe_hosts: |
| logging.error('No hosts found, nothing to do, exiting.') |
| sys.exit(1) |
| sysmon.UpdateStatus() |
| if test_bed.update: |
| sysmon.BuildLandingPage() |
| |
| runtime = time.time() - start_time |
| msg = 'End [ %s ] Runtime %d seconds' % (test_bed.run_description, runtime) |
| mp_log_util.LogWithHeader(msg, symbol='-') |
| |
| except (KeyboardInterrupt, SystemExit): |
| logging.error('Shutdown requested.') |
| sys.exit(1) |
| except Exception, e: |
| logging.error('Exception: %s\n%s', e, traceback.format_exc()) |
| raise |
| finally: |
| CheckRun('stop', test_bed) |
| os.chmod(options.log_file, 0755) |
| |
| if __name__ == '__main__': |
| main() |