# -*- coding: utf-8 -*-
# Copyright 2018 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""This script generate monarch metrics from Gs Cache server logs.
It parses the Nginx access log file of Gs Cache, e.g.
/var/log/nginx/gs-cache-server.access.log, and generate monarch metrics for Gs
Cache performance.
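
Example invocation (illustrative; the script file name is elided here, and the
log path is the Nginx default mentioned above):
  <script> -i /var/log/nginx/gs-cache-server.access.log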
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import re
import sys
from logging import handlers

from chromite.lib import cros_logging as logging
from chromite.lib import metrics
from chromite.lib import ts_mon_config

_LOG_ROTATION_TIME = 'H'
_LOG_ROTATION_INTERVAL = 24 # hours
_LOG_ROTATION_BACKUP = 14 # Keep for 14 days.
_METRIC_NAME = 'chromeos/gs_cache/nginx/response_metrics'
# An example Nginx access log line is:
# <ip> 2018-07-26T18:13:49-07:00 "GET URL HTTP/1.1" 200 <size> "<agent>" HIT
_SUCCESS_RESPONSE_MATCHER = re.compile(
r'^(?P<ip_addr>\d+\.\d+\.\d+\.\d+) '
r'(?P<timestamp>\d+\-\d+\-\d+T\d+:\d+:\d+[+\-]\d+:\d+) '
r'"(?P<http_method>\S+) (?P<url_path>\S*)[^"]*" '
r'(?P<status_code>\d+) (?P<size>\S+) "[^"]+" (?P<cache_status>\S+)')
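
# A minimal, illustrative match (all field values below are made up, not
# taken from a real log):
#   line = ('100.115.245.193 2018-07-26T18:13:49-07:00 '
#           '"GET /download/my-bucket/my-build/R65-10323.9.0/f.bin HTTP/1.1" '
#           '200 4096 "curl/7.58.0" HIT')
#   m = _SUCCESS_RESPONSE_MATCHER.match(line)
#   # m.group('status_code') == '200', m.group('size') == '4096',
#   # m.group('cache_status') == 'HIT'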
# Define common regexes.
_COMMON_PARTIAL_REGEX = (r'(?P<build>[^/]+)/(?P<milestone>\S\d+)'
r'\-(?P<version>[^/]+)')
_COMMON_FULL_REGEX = (r'^/(?P<action>[^/]+)/(?P<bucket>[^/]+)/%s' %
_COMMON_PARTIAL_REGEX)
# Regex for all accepted actions.
_URL_PATH_MATCHER = {
'decompress': re.compile(r'%s/(?P<filename>[^\?]+)' % _COMMON_FULL_REGEX),
'download': re.compile(r'%s/(?P<filename>[^\?]+)' % _COMMON_FULL_REGEX),
'extract': re.compile(r'%s/(?P<package>[^\?]+)\?file=(?P<filename>[^\?]+)'
% _COMMON_FULL_REGEX),
'list_dir': re.compile(r'%s' % _COMMON_FULL_REGEX),
# TODO(crbug.com/1122319): Remove all fake RPCs once all devserver clients
# have been migrated to TLW/TLS caching API.
'fake_check_health': re.compile(r'/(?P<action>[^/]+)'),
'fake_list_image_dir': re.compile(r'/(?P<action>[^/]+)'),
'fake_is_staged': re.compile(r'/(?P<action>[^/]+)'),
'fake_stage': re.compile(r'/(?P<action>[^/]+)'),
'setup_telemetry': re.compile(r'/(?P<action>[^/]+)\?archive_url=gs://'
r'(?P<bucket>[^/]+)/%s' %
_COMMON_PARTIAL_REGEX),
'static': re.compile(r'/(?P<action>[^/]+)/(?P<filename>[^\?]+)'),
'update': re.compile(r'^/(?P<action>[^/]+)/%s.*' % _COMMON_PARTIAL_REGEX),
}
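# Illustrative example (made-up names): the URL path
#   '/download/my-bucket/my-build/R65-10323.9.0/f.bin'
# matches _URL_PATH_MATCHER['download'] with groups action='download',
# bucket='my-bucket', build='my-build', milestone='R65', version='10323.9.0'
# and filename='f.bin'.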


def emit_successful_response_metric(m):
  """Emit a Counter metric for a successful response.

  We parse the response line and upload the size of the response to a
  cumulative metric. In particular, the metric has a designated field 'cache',
  which indicates the upstream cache status, e.g. HIT/MISS. For the full list
  of cache statuses, see
  http://nginx.org/en/docs/http/ngx_http_upstream_module.html#var_upstream_cache_status

  Args:
    m: A regex match object or None.
  """
if not m:
return
logging.debug('Emitting successful response metric.')
metric_fields = {
'cache': m.group('cache_status'),
'action': '',
'bucket': '',
'build': '',
'milestone': '',
'endpoint': '',
'status_code': m.group('status_code'),
}
metric_fields.update(match_url_path(m.group('url_path')))
metrics.Counter(_METRIC_NAME).increment_by(int(m.group('size')),
fields=metric_fields)


def match_url_path(url):
  """Extract information from the url by matching it with the appropriate regex.

  Args:
    url: The url to be matched.

  Returns:
    A dictionary containing all matched fields and their respective values.
  """
# The url is either in the format /<action_name>?<param1>=<value1>.. or
# /<action_name>/<arg1>/<arg2>/..?<param1>=<value1>..
# To single out the action name from these urls, first strip the leading '/'
# and then handle the cases where the action name can be followed by a '/'
# or a '?'.
action_name = url.strip('/').split('/')[0].split('?')[0]
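  # For example (values illustrative): '/static/some/file.bin' yields
  # 'static', and '/fake_stage?archive_url=...' yields 'fake_stage'.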
try:
info = _URL_PATH_MATCHER[action_name].match(url)
matched_group_dict = info.groupdict()
return {
'action': matched_group_dict.get('action', ''),
'bucket': matched_group_dict.get('bucket', ''),
'build': matched_group_dict.get('build', ''),
'milestone': matched_group_dict.get('milestone', ''),
'endpoint': matched_group_dict.get('filename', ''),
}
except KeyError:
    logging.warning('The URL: %s did not match any of the regexes. Maybe '
                    'it is new?', url)
except Exception as e:
logging.error('Could not extract fields names for %s due to exception: %s',
url, e)
return {}
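
# Illustrative usage of match_url_path (same made-up URL as above):
#   match_url_path('/download/my-bucket/my-build/R65-10323.9.0/f.bin')
#   # -> {'action': 'download', 'bucket': 'my-bucket', 'build': 'my-build',
#   #     'milestone': 'R65', 'endpoint': 'f.bin'}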


def input_log_file_type(filename):
  """An argparse type function converting input filename to a file object.

  It converts '-' to sys.stdin.
  """
if filename == '-':
return sys.stdin
return argparse.FileType('r')(filename)


def parse_args(argv):
  """Parses command line arguments."""
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'-i', '--input-log-file', metavar='NGINX_ACCESS_LOG_FILE',
dest='input_fd', required=True,
type=input_log_file_type,
help=('Nginx log file of Gs Cache '
'(use "-" to indicate reading from sys.stdin).')
)
parser.add_argument(
'-l', '--log-file', default=sys.stdout,
help='Log file of this script (default is sys.stdout).'
)
return parser.parse_args(argv)


def main(argv):
  """Main function."""
args = parse_args(argv)
logger = logging.getLogger()
if args.log_file is sys.stdout:
logger.addHandler(logging.StreamHandler(stream=sys.stdout))
else:
logger.addHandler(handlers.TimedRotatingFileHandler(
args.log_file, when=_LOG_ROTATION_TIME,
interval=_LOG_ROTATION_INTERVAL,
backupCount=_LOG_ROTATION_BACKUP))
logger.setLevel(logging.DEBUG)
with ts_mon_config.SetupTsMonGlobalState('gs_cache_nginx_log_metrics',
indirect=True):
    # args.input_fd is a text-mode file, whose readline() returns '' (not
    # b'') at EOF; use '' as the sentinel so the loop terminates.
    for line in iter(args.input_fd.readline, ''):
logging.debug('Parsing line: %s', line.strip())
emit_successful_response_metric(_SUCCESS_RESPONSE_MATCHER.match(line))


if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))