| #!/usr/bin/env python2 |
| |
| # Copyright 2017 The Chromium OS Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Script to upload metrics from apache access logs to Monarch.""" |
| |
| from __future__ import print_function |
| |
| import argparse |
| import re |
| import sys |
| import urlparse |
| |
| import common |
| |
| from chromite.lib import ts_mon_config |
| from chromite.lib import metrics |
| |
| from autotest_lib.site_utils.stats import log_daemon_common |
| # Not used, but needed for importing rpc_interface. |
| from autotest_lib.frontend import setup_django_environment |
| from autotest_lib.frontend.afe import rpc_interface |
| from autotest_lib.frontend.afe import moblab_rpc_interface |
| |
| |
| """ |
| The log format is set to: |
| %v:%p %h %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\" %T |
| |
| These are documented as follows: |
| (from https://httpd.apache.org/docs/current/mod/mod_log_config.html) |
| |
| %h: Remote host |
| %l: Remote logname (from identd, if supplied) |
| %O: Bytes sent, including headers. May be zero in rare cases such as when a |
| request is aborted before a response is sent. You need to enable mod_logio |
| to use this. |
| %p: The canonical Port of the server serving the request |
| %r: First line of request |
| %s: Status. For requests that got internally redirected, this is |
| the status of the *original* request --- %...>s for the last. |
| %t: Time, in common log format time format (standard english format) |
| %T: The time taken to serve the request, in seconds. |
| %u: Remote user (from auth; may be bogus if return status (%s) is 401) |
| %v: The canonical ServerName of the server serving the request. |
| """ |
| |
| # Lemma: a regex to match sections delimited be double-quotes ("), which |
| # possible contained escaped quotes (\"). |
| # This works by matching non-quotes or the string r'\"' repeatedly; then it ends |
| # when finding a quote (") preceeded by a character which is not a backslash. |
| MATCH_UNTIL_QUOTE = r'([^"]|\\")*[^\\]' |
| |
| ACCESS_MATCHER = re.compile( |
| r'^' |
| r'\S+ \S+ \S+ \S+ ' # Ignore %v:%p %h %l %u |
| r'\[[^]]+\] ' # Ignore %t |
| r'"(' # Begin %r |
| r'(?P<request_method>\S+) ' # e.g. POST |
| r'(?P<endpoint>\S+)' # e.g. /afe/server/noauth/rpc/ |
| + MATCH_UNTIL_QUOTE + # Ignore protocol (e.g. HTTP/1.1) |
| r'|-' # The request data might just be "-" |
| r')" ' # End %r |
| r'(?P<response_code>\d+) ' # %>s (e.g. 200) |
| r'(?P<bytes_sent>\d+)' # %O |
| r' "' + MATCH_UNTIL_QUOTE + '"' # Ignore Referer |
| r' "' + MATCH_UNTIL_QUOTE + '"' # Ignore User-Agent |
| r' ?(?P<response_seconds>\d+?)' # The server time in seconds |
| r'.*' # Allow adding extra stuff afterward. |
| ) |
| |
| ACCESS_TIME_METRIC = '/chromeos/autotest/http/server/response_seconds' |
| ACCESS_BYTES_METRIC = '/chromeos/autotest/http/server/response_bytes' |
| |
| RPC_ACCESS_TIME_METRIC = '/chromeos/autotest/http/server/rpc/response_seconds' |
| RPC_ACCESS_BYTES_METRIC = '/chromeos/autotest/http/server/rpc/response_bytes' |
| RPC_METHOD_ARGUMENT = 'method' |
| |
| |
| # TODO(phobbs) use a memory-efficient structure to detect non-unique paths. |
| # We can't just include the endpoint because it will cause a cardinality |
| # explosion. |
| ALLOWLISTED_ENDPOINTS = frozenset(( |
| '/', |
| '/afe/', |
| '/new_tko/server/rpc/', |
| '/afe/server/rpc/', |
| '/___rPc_sWiTcH___', |
| '*', |
| '/afe/server/noauth/rpc/', |
| )) |
| |
| |
| # A bad actor could DOS Monarch by requesting millions of different RPC methods, |
| # each of which would create a different stream. Only allow an allowlist of |
| # methods to be recorded in Monarch. |
| ALLOWLISTED_METHODS = (frozenset(dir(rpc_interface)) |
| | frozenset(dir(moblab_rpc_interface))) |
| |
| |
| def EmitRequestMetrics(m): |
| """Emits metrics for each line in the access log. |
| |
| @param m: A regex match object |
| """ |
| fields = { |
| 'request_method': m.groupdict().get('request_method', ''), |
| 'endpoint': SanitizeEndpoint(m.group('endpoint')), |
| 'response_code': int(m.group('response_code')), |
| } |
| |
| send_rpc_metrics = ( |
| '?' in m.group('endpoint') and '/rpc' in m.group('endpoint')) |
| if send_rpc_metrics: |
| EmitRPCMetrics(m) |
| |
| # Request seconds and bytes sent are both extremely high cardinality, so |
| # they must be the VAL of a metric, not a metric field. |
| if m.group('response_seconds'): |
| response_seconds = int(m.group('response_seconds')) |
| metrics.SecondsDistribution(ACCESS_TIME_METRIC).add(response_seconds, |
| fields=fields) |
| |
| bytes_sent = int(m.group('bytes_sent')) |
| metrics.CumulativeDistribution(ACCESS_BYTES_METRIC).add( |
| bytes_sent, fields=fields) |
| |
| |
| def EmitRPCMetrics(m): |
| """Emit a special metric including the method when the request was an RPC.""" |
| fields = { |
| 'request_method': m.groupdict().get('request_method', ''), |
| 'rpc_method': ParseRPCMethod(m.group('endpoint')), |
| 'response_code': int(m.group('response_code')), |
| } |
| |
| if m.group('response_seconds'): |
| response_seconds = int(m.group('response_seconds')) |
| metrics.SecondsDistribution(RPC_ACCESS_TIME_METRIC).add( |
| response_seconds, fields=fields) |
| |
| bytes_sent = int(m.group('bytes_sent')) |
| metrics.CumulativeDistribution(RPC_ACCESS_BYTES_METRIC).add(bytes_sent, |
| fields=fields) |
| |
| |
| def ParseRPCMethod(url): |
| """Parses the RPC method from an RPC query string. |
| |
| Args: |
| url: The URL requested. |
| """ |
| query = urlparse.urlparse(url).query |
| return urlparse.parse_qs(query)[RPC_METHOD_ARGUMENT][-1] |
| |
| |
| def SanitizeEndpoint(endpoint): |
| """Returns empty string if endpoint is not allowlisted. |
| |
| @param endpoint: The endpoint to sanitize. |
| """ |
| if endpoint in ALLOWLISTED_ENDPOINTS: |
| return endpoint |
| else: |
| return '' |
| |
| |
| MATCHERS = [ |
| (ACCESS_MATCHER, EmitRequestMetrics), |
| ] |
| |
| |
| def ParseArgs(): |
| """Parses the command line arguments.""" |
| p = argparse.ArgumentParser( |
| description='Parses apache logs and emits metrics to Monarch') |
| p.add_argument('--output-logfile') |
| p.add_argument('--debug-metrics-file', |
| help='Output metrics to the given file instead of sending ' |
| 'them to production.') |
| return p.parse_args() |
| |
| |
| def Main(): |
| """Sets up logging and runs matchers against stdin.""" |
| args = ParseArgs() |
| log_daemon_common.SetupLogging(args) |
| |
| # Set up metrics sending and go. |
| ts_mon_args = {} |
| if args.debug_metrics_file: |
| ts_mon_args['debug_file'] = args.debug_metrics_file |
| |
| with ts_mon_config.SetupTsMonGlobalState('apache_access_log_metrics', |
| **ts_mon_args): |
| log_daemon_common.RunMatchers(sys.stdin, MATCHERS) |
| |
| |
| if __name__ == '__main__': |
| Main() |