| # Copyright 2015 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| import json |
| import logging |
| import os |
| import socket |
| import sys |
| import re |
| |
| import requests |
| |
| from infra_libs.ts_mon.common import interface |
| from infra_libs.ts_mon.common import monitors |
| from infra_libs.ts_mon.common import standard_metrics |
| from infra_libs.ts_mon.common import targets |
| |
| |
def load_machine_config(filename):
  """Read the machine-wide ts_mon configuration from a JSON file.

  Args:
    filename (str): path of the JSON configuration file.

  Returns:
    The decoded JSON document, or an empty dict when the file does not exist.

  Raises:
    Exception: any error raised while opening or decoding an existing file is
        logged and then re-raised unchanged.
  """
  if os.path.exists(filename):
    try:
      with open(filename) as config_file:
        return json.load(config_file)
    except Exception:
      # Record which file was at fault, then let the caller see the real error.
      logging.error('Configuration file couldn\'t be read: %s', filename)
      raise

  # A missing file is not an error: the caller falls back to its own defaults.
  logging.info('Configuration file does not exist, ignoring: %s', filename)
  return {}
| |
| |
def _default_region(fqdn):
  """Guess the region name to use as the default for this machine.

  The GCE metadata server is asked for the instance zone first. When that
  request fails or does not answer with an OK status, the second
  dot-separated label of the fully qualified domain name is used instead.

  Args:
    fqdn (str): fully qualified domain name of this machine.

  Returns:
    The zone name reported by the metadata server, else the second label of
    |fqdn|, else an empty string when |fqdn| has fewer than two labels.
  """
  # Check if we're running in a GCE instance.
  response = None
  try:
    response = requests.get(
        'http://metadata.google.internal/computeMetadata/v1/instance/zone',
        headers={'Metadata-Flavor': 'Google'},
        timeout=1.0)
  except requests.exceptions.RequestException:
    # The metadata server could not be queried; fall back to the FQDN below.
    pass

  if response is not None and response.status_code == requests.codes.ok:
    # The zone is the last slash-separated component.
    return response.text.rpartition('/')[2]

  labels = fqdn.split('.')
  if len(labels) > 1:
    return labels[1]  # [chrome|golo]
  return ''
| |
| |
| def _default_network(host): |
| try: |
| # Regular expression that matches the vast majority of our host names. |
| # Matches everything of the form 'masterN', 'masterNa', and 'foo-xN'. |
| return re.match(r'^([\w-]*?-[acm]|master)(\d+)a?$', host).group(2) # N |
| except AttributeError: |
| return '' |
| |
| |
def add_argparse_options(parser):
  """Register the timeseries monitoring flags on a process' argument parser.

  All flags are placed in a dedicated 'Timeseries Monitoring Options' argument
  group. Defaults for the hostname, region and network flags are computed
  from this machine's fully qualified domain name when this function is
  called; computing the region default goes through _default_region().

  Args:
    parser (argparse.ArgumentParser): the parser for the main process.
  """
  default_config_file = (
      'C:\\chrome-infra\\ts-mon.json' if sys.platform == 'win32'
      else '/etc/chrome-infra/ts-mon.json')

  group = parser.add_argument_group('Timeseries Monitoring Options')
  add = group.add_argument

  # Machine-level configuration and the flags that override it.
  add('--ts-mon-config-file',
      default=default_config_file,
      help='path to a JSON config file that contains suitable values for '
           '"endpoint" and "credentials" for this machine. This config file is '
           'intended to be shared by all processes on the machine, as the '
           'values depend on the machine\'s position in the network, IP '
           'whitelisting and deployment of credentials. (default: %(default)s)')
  add('--ts-mon-endpoint',
      help='url (file:// or https://) to post monitoring metrics to. If set, '
           'overrides the value in --ts-mon-config-file')
  add('--ts-mon-credentials',
      help='path to a pkcs8 json credential file. If set, overrides the value '
           'in --ts-mon-config-file')
  add('--ts-mon-ca-certs',
      help='path to file containing root CA certificates for SSL server '
           'certificate validation. If not set, a CA cert file bundled with '
           'httplib2 is used.')

  # Flush behaviour.
  add('--ts-mon-flush',
      choices=('manual', 'auto'),
      default='auto',
      help='metric push behavior: manual (only send when flush() is called), '
           'or auto (send automatically every --ts-mon-flush-interval-secs '
           'seconds). (default: %(default)s)')
  add('--ts-mon-flush-interval-secs',
      type=int,
      default=60,
      help='automatically push metrics on this interval if '
           '--ts-mon-flush=auto.')
  add('--ts-mon-autogen-hostname',
      action='store_true',
      help='Indicate that the hostname is autogenerated. '
           'This option must be set on autoscaled GCE VMs, Kubernetes pods, '
           'or any other hosts with dynamically generated names.')

  add('--ts-mon-target-type',
      choices=('device', 'task'),
      default='device',
      help='the type of target that is being monitored ("device" or "task").'
           ' (default: %(default)s)')

  # Host-derived defaults shared by the device and task flags below.
  fqdn = socket.getfqdn().lower()  # foo-[a|m]N.[chrome|golo].chromium.org
  host = fqdn.split('.')[0]  # foo-[a|m]N
  region = _default_region(fqdn)
  network = _default_network(host)

  # Flags describing a "device" target.
  add('--ts-mon-device-hostname',
      default=host,
      help='name of this device, (default: %(default)s)')
  add('--ts-mon-device-region',
      default=region,
      help='name of the region this devices lives in. (default: %(default)s)')
  add('--ts-mon-device-role',
      default='default',
      help='Role of the device. (default: %(default)s)')
  add('--ts-mon-device-network',
      default=network,
      help='name of the network this device is connected to. '
           '(default: %(default)s)')

  # Flags describing a "task" target.
  add('--ts-mon-task-service-name',
      help='name of the service being monitored')
  add('--ts-mon-task-job-name',
      help='name of this job instance of the task')
  add('--ts-mon-task-region',
      default=region,
      help='name of the region in which this task is running '
           '(default: %(default)s)')
  add('--ts-mon-task-hostname',
      default=host,
      help='name of the host on which this task is running '
           '(default: %(default)s)')
  add('--ts-mon-task-number',
      type=int,
      default=0,
      help='number (e.g. for replication) of this instance of this task '
           '(default: %(default)s)')

  add('--ts-mon-metric-name-prefix',
      default='/chrome/infra/',
      help='metric name prefix for all metrics (default: %(default)s)')

  # Still accepted so that existing command lines keep parsing.
  add('--ts-mon-use-new-proto',
      default=True,
      action='store_true',
      help='deprecated and ignored')
| |
| |
def process_argparse_options(args):
  """Process command line arguments to initialize the global monitor.

  Also initializes the default target.

  Starts a background thread to automatically flush monitoring metrics if not
  disabled by command line arguments.

  Values for the endpoint and credentials come from the machine config file
  and are overridden by the matching command line flags when those are given.

  Args:
    args (argparse.Namespace): the result of parsing the command line arguments

  Raises:
    SystemExit: with status 2 when the target type is "task" and either
        --ts-mon-task-service-name or --ts-mon-task-job-name is missing.
  """

  def _usage_error(message):
    # Reimplement ArgumentParser.error, since we don't have access to the
    # parser. sys.stderr.write is used rather than the print statement so the
    # message is emitted the same way on Python 2 and Python 3.
    sys.stderr.write(message + '\n')
    sys.exit(2)

  def _target_hostname(hostname):
    # Hosts flagged as autogenerated (by flag or config file) get a prefix.
    if args.ts_mon_autogen_hostname or autogen_hostname:
      return 'autogen:' + hostname
    return hostname

  # Parse the config file if it exists.
  config = load_machine_config(args.ts_mon_config_file)
  endpoint = config.get('endpoint', '')
  credentials = config.get('credentials', '')
  autogen_hostname = config.get('autogen_hostname', False)

  # Command-line args override the values in the config file.
  if args.ts_mon_endpoint is not None:
    endpoint = args.ts_mon_endpoint
  if args.ts_mon_credentials is not None:
    credentials = args.ts_mon_credentials

  if args.ts_mon_target_type == 'device':
    interface.state.target = targets.DeviceTarget(
        args.ts_mon_device_region,
        args.ts_mon_device_role,
        args.ts_mon_device_network,
        _target_hostname(args.ts_mon_device_hostname))
  elif args.ts_mon_target_type == 'task':
    if not args.ts_mon_task_service_name:
      _usage_error('Argument --ts-mon-task-service-name must be '
                   'provided when the target type is "task".')
    if not args.ts_mon_task_job_name:
      _usage_error('Argument --ts-mon-task-job-name must be provided '
                   'when the target type is "task".')
    interface.state.target = targets.TaskTarget(
        args.ts_mon_task_service_name,
        args.ts_mon_task_job_name,
        args.ts_mon_task_region,
        _target_hostname(args.ts_mon_task_hostname),
        args.ts_mon_task_number)

  interface.state.metric_name_prefix = args.ts_mon_metric_name_prefix
  # Start from a NullMonitor; it is replaced below only for a usable endpoint.
  interface.state.global_monitor = monitors.NullMonitor()

  if endpoint.startswith('file://'):
    # Everything after the scheme is handed to the DebugMonitor.
    interface.state.global_monitor = monitors.DebugMonitor(
        endpoint[len('file://'):])
  elif endpoint.startswith('https://'):
    interface.state.global_monitor = monitors.HttpsMonitor(
        endpoint, monitors.CredentialFactory.from_string(credentials),
        ca_certs=args.ts_mon_ca_certs)
  elif endpoint.lower() == 'none' or not endpoint:
    logging.info('ts_mon monitoring has been explicitly disabled')
  else:
    logging.error('ts_mon monitoring is disabled because the endpoint provided'
                  ' is invalid or not supported: %s', endpoint)

  interface.state.flush_mode = args.ts_mon_flush

  if args.ts_mon_flush == 'auto':
    interface.state.flush_thread = interface._FlushThread(
        args.ts_mon_flush_interval_secs)
    interface.state.flush_thread.start()

  standard_metrics.init()