# blob: ff8f1a5373ed4a2379709d819256bded9d81f96a [file] [log] [blame]
# Copyright 2016 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Copyright (c) 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""System metrics."""
from __future__ import print_function
import collections
import errno
import os
import platform
import sys
import time
import psutil
from chromite.lib import cros_logging as logging
from infra_libs import ts_mon
# Module-level ts_mon metric handles. They are created once at import time;
# the get_*() functions in this module call .set() on them.
# NOTE(review): several constructor calls in this section look truncated --
# most have no metric-name argument and some never close their parenthesis.
# Restore the missing lines from the upstream file before relying on this
# section; only comments were added here.

# CPU metrics.
_cpu_count_metric = ts_mon.GaugeMetric(
description='Number of CPU cores.')
_cpu_time_metric = ts_mon.FloatMetric(
description='percentage of time spent by the CPU '
'in different states.')
# Disk space metrics (set per mountpoint via the 'path' field).
_disk_free_metric = ts_mon.GaugeMetric(
description='Available bytes on disk partition.',
_disk_total_metric = ts_mon.GaugeMetric(
description='Total bytes on disk partition.',
# Inode metrics (set per mountpoint via the 'path' field).
_inodes_free_metric = ts_mon.GaugeMetric(
description='Number of available inodes on '
'disk partition (unix only).')
_inodes_total_metric = ts_mon.GaugeMetric(
description='Number of possible inodes on '
'disk partition (unix only)')
# Memory metrics.
_mem_free_metric = ts_mon.GaugeMetric(
description='Amount of memory available to a '
'process (in Bytes). Buffers are considered '
'free memory.',
_mem_total_metric = ts_mon.GaugeMetric(
description='Total physical memory in Bytes.',
# Boot time as reported by psutil, captured once at import. It is passed as
# start_time to the counter metrics below and is the baseline for uptime.
BOOT_TIME = psutil.boot_time()
# Network counters (set per NIC via the 'interface' field).
_net_up_metric = ts_mon.CounterMetric(
'dev/net/bytes/up', start_time=BOOT_TIME,
description='Number of bytes sent on interface.',
_net_down_metric = ts_mon.CounterMetric(
'dev/net/bytes/down', start_time=BOOT_TIME,
description='Number of Bytes received on '
_net_err_up_metric = ts_mon.CounterMetric(
'dev/net/err/up', start_time=BOOT_TIME,
description='Total number of errors when '
'sending (per interface).')
_net_err_down_metric = ts_mon.CounterMetric(
'dev/net/err/down', start_time=BOOT_TIME,
description='Total number of errors when '
'receiving (per interface).')
_net_drop_up_metric = ts_mon.CounterMetric(
'dev/net/drop/up', start_time=BOOT_TIME,
description='Total number of outgoing '
'packets that have been dropped.')
_net_drop_down_metric = ts_mon.CounterMetric(
'dev/net/drop/down', start_time=BOOT_TIME,
description='Total number of incoming '
'packets that have been dropped.')
# Disk I/O counters (set per disk via the 'disk' field).
_disk_read_metric = ts_mon.CounterMetric(
'dev/disk/read', start_time=BOOT_TIME,
description='Number of Bytes read on disk.',
_disk_write_metric = ts_mon.CounterMetric(
'dev/disk/write', start_time=BOOT_TIME,
description='Number of Bytes written on disk.',
# Uptime, process-count and load-average metrics.
_uptime_metric = ts_mon.GaugeMetric(
description='Machine uptime, in seconds.',
_proc_count_metric = ts_mon.GaugeMetric(
description='Number of processes currently running.')
_autoserv_proc_count_metric = ts_mon.GaugeMetric(
description='Number of autoserv processes currently running.')
_sysmon_proc_count_metric = ts_mon.GaugeMetric(
description='Number of sysmon processes currently running.')
_load_average_metric = ts_mon.FloatMetric(
description='Number of processes currently '
'in the system run queue.')
# ts_mon pipeline uses backend clocks when assigning timestamps to metric
# points. By comparing point timestamp to the point value (i.e. time by
# machine's local clock), we can potentially detect some anomalies (clock
# drift, unusually high metrics pipeline delay, completely wrong clocks, etc).
# It is important to gather this metric right before the flush.
_unix_time_metric = ts_mon.GaugeMetric(
description='Number of milliseconds since epoch'
' based on local machine clock.')
# OS / interpreter identification metrics (string-valued).
_os_name_metric = ts_mon.StringMetric(
description='OS name on the machine')
_os_version_metric = ts_mon.StringMetric(
description='OS version on the machine')
_os_arch_metric = ts_mon.StringMetric(
description='OS architecture on this machine')
_python_arch_metric = ts_mon.StringMetric(
description='python userland '
'architecture on this machine')
def get_uptime():
    """Report machine uptime, in whole seconds, to the uptime metric.

    Uptime is the current wall-clock time minus the module-level BOOT_TIME,
    truncated to an integer.
    """
    _uptime_metric.set(int(time.time() - BOOT_TIME))
def get_cpu_info():
    """Report the percentage of CPU time spent in each tracked mode.

    One point is set on the CPU time metric for each of 'user', 'system' and
    'idle', with the mode name attached as the 'mode' field.
    """
    times = psutil.cpu_times_percent()
    for mode in ('user', 'system', 'idle'):
        _cpu_time_metric.set(getattr(times, mode), {'mode': mode})
def get_disk_info(mountpoints=None):
    """Report disk usage for each mountpoint, then per-disk I/O counters.

    Args:
      mountpoints: Iterable of mountpoint paths to report on. If None, the
        mountpoint of every partition returned by psutil.disk_partitions()
        is used.
    """
    if mountpoints is None:
        mountpoints = [disk.mountpoint for disk in psutil.disk_partitions()]
    for mountpoint in mountpoints:
        _get_disk_info_single(mountpoint)
    # I/O counters are per physical disk, not per mountpoint, so they are
    # collected once after the loop.
    _get_disk_io_info()
def _get_disk_info_single(mountpoint):
    """Report free/total bytes for one mountpoint, plus inodes on POSIX.

    Args:
      mountpoint: Path of the mountpoint to query.

    Raises:
      OSError: If psutil.disk_usage() fails for any reason other than ENOENT.
    """
    fields = {'path': mountpoint}

    try:
        usage = psutil.disk_usage(mountpoint)
    except OSError as ex:
        if ex.errno == errno.ENOENT:
            # This happens on Windows when querying a removable drive that
            # doesn't have any media inserted right now.
            return
        raise

    _disk_free_metric.set(usage.free, fields=fields)
    _disk_total_metric.set(usage.total, fields=fields)

    # inode counts are only available on Unix.
    if os.name == 'posix':
        _get_fs_inode_info(mountpoint)
def _get_fs_inode_info(mountpoint):
    """Report free and total inode counts for one mountpoint.

    Relies on os.statvfs(), so callers must only invoke this on platforms
    that provide it.

    Args:
      mountpoint: Path of the mountpoint to query.
    """
    fields = {'path': mountpoint}
    stats = os.statvfs(mountpoint)
    _inodes_free_metric.set(stats.f_favail, fields=fields)
    _inodes_total_metric.set(stats.f_files, fields=fields)
def _get_disk_io_info():
    """Report cumulative bytes read and written for each disk.

    Raises:
      RuntimeError: If psutil fails for a reason other than there being no
        physical disk to query.
    """
    try:
        disk_counters = psutil.disk_io_counters(perdisk=True)
    except RuntimeError as ex:
        if "couldn't find any physical disk" in str(ex):
            # Disk performance counters aren't enabled on Windows.
            return
        raise

    # .items() rather than .iteritems() so this runs on Python 2 and 3.
    for disk, counters in disk_counters.items():
        fields = {'disk': disk}
        _disk_read_metric.set(counters.read_bytes, fields=fields)
        _disk_write_metric.set(counters.write_bytes, fields=fields)
def get_mem_info():
    """Report available and total physical memory, in bytes."""
    # We don't report mem.used because (due to virtual memory) it is not
    # useful.
    mem = psutil.virtual_memory()
    _mem_free_metric.set(mem.available)
    _mem_total_metric.set(mem.total)
def get_net_info():
    """Report per-interface network byte, error and drop counters.

    Interfaces whose name starts with 'veth' are treated as virtual and
    skipped. A counter that ts_mon rejects as decreasing is logged and
    skipped; it does not abort the collection.
    """
    metric_counter_names = [
        (_net_up_metric, 'bytes_sent'),
        (_net_down_metric, 'bytes_recv'),
        (_net_err_up_metric, 'errout'),
        (_net_err_down_metric, 'errin'),
        (_net_drop_up_metric, 'dropout'),
        (_net_drop_down_metric, 'dropin'),
    ]

    nics = psutil.net_io_counters(pernic=True)
    # .items() rather than .iteritems() so this runs on Python 2 and 3.
    for nic, counters in nics.items():
        # TODO(ayatane): Use a different way of identifying virtual interfaces
        if nic.startswith('veth'):
            # Skip virtual interfaces
            continue
        fields = {'interface': nic}
        for metric, counter_name in metric_counter_names:
            try:
                metric.set(getattr(counters, counter_name), fields=fields)
            except ts_mon.MonitoringDecreasingValueError as ex:
                # This normally shouldn't happen, but might if the network
                # driver module is reloaded, so log an error and continue
                # instead of raising an exception.
                logging.warning(str(ex))
def get_os_info():
    """Report OS name, OS version, machine architecture and Python bitness."""
    os_info = _get_os_info()
    _os_name_metric.set(os_info.name)
    _os_version_metric.set(os_info.version)
    _os_arch_metric.set(platform.machine())
    _python_arch_metric.set(_get_python_arch())
# Lightweight record holding the OS name and OS version strings.
OSInfo = collections.namedtuple(typename='OSInfo', field_names='name,version')
def _get_os_info():
    """Get OS name and version.

    Returns:
      OSInfo instance with lower-cased name and version; both fields are
      empty strings when the platform cannot be identified.
    """
    os_name = platform.system().lower()
    os_version = ''
    if 'windows' in os_name:
        os_name = 'windows'
        # release will be something like '7', 'vista', or 'xp'
        os_version = platform.release()
    elif 'linux' in os_name:
        # will return something like ('Ubuntu', '14.04', 'trusty')
        # NOTE(review): platform.dist() was removed in Python 3.8; this
        # branch needs a replacement before the module is ported.
        os_name, os_version, _ = platform.dist()
    else:
        # On mac platform.system() reports 'darwin'.
        os_version = _get_mac_version()
        if os_version:
            # We found a valid mac.
            os_name = 'mac'
        else:
            # not a mac, unable to find platform information, reset
            os_name = ''
            os_version = ''

    os_name = os_name.lower()
    os_version = os_version.lower()
    return OSInfo(name=os_name, version=os_version)
def _get_mac_version():
    """Get Mac system version.

    Returns:
      Version string, which is empty if not a valid Mac system.
    """
    # This tuple is only populated on mac systems.
    mac_ver = platform.mac_ver()
    # Will be '10.11.5' or similar on a valid mac or will be '' on a non-mac.
    return mac_ver[0]
def _get_python_arch():
    """Return the bitness of the running Python interpreter.

    Returns:
      '64' if sys.maxsize exceeds 2**32, otherwise '32'.
    """
    return '64' if sys.maxsize > 2**32 else '32'
def get_proc_info():
    """Count running processes and report the totals.

    Reports three gauges: the number of parent autoserv processes, the number
    of sysmon processes, and the total number of processes seen.
    """
    autoserv_count = 0
    sysmon_count = 0
    total = 0
    for proc in psutil.process_iter():
        if _is_parent_autoserv(proc):
            autoserv_count += 1
        elif _is_sysmon(proc):
            sysmon_count += 1
        total += 1
    logging.debug('autoserv_count: %s', autoserv_count)
    _autoserv_proc_count_metric.set(autoserv_count)
    _sysmon_proc_count_metric.set(sysmon_count)
    _proc_count_metric.set(total)
def _is_parent_autoserv(proc):
    """Return whether proc is a parent (not forked) autoserv process."""
    if not _is_autoserv(proc):
        return False
    parent = proc.parent()
    # psutil returns None when the parent cannot be determined; without a
    # known autoserv parent, treat proc as a top-level autoserv.
    return parent is None or not _is_autoserv(parent)
def _is_autoserv(proc):
    """Return whether proc is an autoserv process."""
    # This relies on the autoserv script being run directly.  The script should
    # be named autoserv exactly and start with a shebang that is /usr/bin/python,
    # NOT /bin/env
    return proc.name() == 'autoserv'
def _is_sysmon(proc):
    """Return whether proc is a sysmon process."""
    # This is fragile due to the virtualenv bootstrap of sysmon.
    # The process tree for an Upstart invocation of sysmon is:
    # init -> sudo -> python2 -> python
    # If sysmon is started without using Upstart:
    # init -> (shell) -> python2 -> python
    # The extra python2 is due to the virtualenv wrapper script, which should do
    # an exec to avoid wasting a process.  The fact that the first has a 2 and
    # the second doesn't is basically just luck.
    # TODO(ayatane): Once the chromite virtualenv wrapper uses exec, clean this
    # up.
    return proc.name() == 'python' and 'sysmon' in ' '.join(proc.cmdline())
def get_load_avg():
    """Report the 1-, 5- and 15-minute system load averages.

    Reports nothing when os.getloadavg() raises OSError, i.e. when the load
    average is unobtainable.
    """
    try:
        avg1, avg5, avg15 = os.getloadavg()
    except OSError:
        # Load average is unobtainable on this machine; nothing to report.
        return

    _load_average_metric.set(avg1, fields={'minutes': 1})
    _load_average_metric.set(avg5, fields={'minutes': 5})
    _load_average_metric.set(avg15, fields={'minutes': 15})
def get_unix_time():
    """Report the local clock as integer milliseconds since the epoch."""
    _unix_time_metric.set(int(time.time() * 1000))