nginx_access_log_metrics: collect metrics for all requests

Remove the condition that filters which metrics are emitted based on
the originating IP address. This ensures that metrics are emitted for
requests originating from IP address 127.0.0.1.

Add regular expressions that match all RPCs exposed by the
gs_archive_server.

Also fix minor lint issues.

BUG=chromium:1061364
TEST=Manually tested on chromeos2-devservertest. Details:
http://gpaste/5139800782798848.

Change-Id: I23205caca70807c7ae1989b3c4dd51c25892c943
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/dev-util/+/2351759
Tested-by: Sanika Kulkarni <sanikak@chromium.org>
Commit-Queue: Sanika Kulkarni <sanikak@chromium.org>
Reviewed-by: Congbin Guo <guocb@chromium.org>
Auto-Submit: Sanika Kulkarni <sanikak@chromium.org>
diff --git a/gs_cache/nginx_access_log_metrics.py b/gs_cache/nginx_access_log_metrics.py
index 249d44a..769fa66 100644
--- a/gs_cache/nginx_access_log_metrics.py
+++ b/gs_cache/nginx_access_log_metrics.py
@@ -27,7 +27,7 @@
 _LOG_ROTATION_INTERVAL = 24  # hours
 _LOG_ROTATION_BACKUP = 14  # Keep for 14 days.
 
-_METRIC_NAME = 'chromeos/gs_cache/nginx/response'
+_METRIC_NAME = 'chromeos/gs_cache/nginx/response_metrics'
 
 # An example Nginx access log line is:
 # <ip> 2018-07-26T18:13:49-07:00 "GET URL HTTP/1.1" 200 <size> "<agent>" HIT
@@ -37,13 +37,33 @@
     r'"(?P<http_method>\S+) (?P<url_path>\S*)[^"]*" '
     r'(?P<status_code>\d+) (?P<size>\S+) "[^"]+" (?P<cache_status>\S+)')
 
-# The format of URL is like:
-#   /$ACTION/$BUCKET/$BUILD/$MILESTONE-$VERSION/$FILENAME?params
-# We want ACTION, BUCKET, BUILD, MILESTONE, and FILENAME.
-_URL_PATH_MATCHER = re.compile(
-    r'^/(?P<action>[^/]+)/(?P<bucket>[^/]+)/(?P<build>[^/]+)/'
-    r'(?P<milestone>\S\d+)\-(?P<version>[^/]+)/(?P<filename>[^\?]+)'
-)
+# Define common regexes.
+_COMMON_PARTIAL_REGEX = (r'(?P<build>[^/]+)/(?P<milestone>\S\d+)'
+                         r'\-(?P<version>[^/]+)')
+_COMMON_FULL_REGEX = (r'^/(?P<action>[^/]+)/(?P<bucket>[^/]+)/%s' %
+                      _COMMON_PARTIAL_REGEX)
+
+# Regex for all accepted actions.
+_URL_PATH_MATCHER = {
+    'decompress': re.compile(r'%s/(?P<filename>[^\?]+)' % _COMMON_FULL_REGEX),
+    'download': re.compile(r'%s/(?P<filename>[^\?]+)' % _COMMON_FULL_REGEX),
+    'extract': re.compile(r'%s/(?P<package>[^\?]+)\?file=(?P<filename>[^\?]+)'
+                          % _COMMON_FULL_REGEX),
+    'list_dir': re.compile(r'%s' % _COMMON_FULL_REGEX),
+
+    # TODO(crbug.com/1122319): Remove all fake RPCs once all devserver clients
+    # have been migrated to TLW/TLS caching API.
+
+    'fake_check_health': re.compile(r'/(?P<action>[^/]+)'),
+    'fake_list_image_dir': re.compile(r'/(?P<action>[^/]+)'),
+    'fake_is_staged': re.compile(r'/(?P<action>[^/]+)'),
+    'fake_stage': re.compile(r'/(?P<action>[^/]+)'),
+    'setup_telemetry': re.compile(r'/(?P<action>[^/]+)\?archive_url=gs://'
+                                  r'(?P<bucket>[^/]+)/%s' %
+                                  _COMMON_PARTIAL_REGEX),
+    'static': re.compile(r'/(?P<action>[^/]+)/(?P<filename>[^\?]+)'),
+    'update': re.compile(r'^/(?P<action>[^/]+)/%s.*' % _COMMON_PARTIAL_REGEX),
+}
 
 
 def emit_successful_response_metric(m):
@@ -61,10 +81,6 @@
   if not m:
     return
 
-  # Ignore all loopback calls between gs_archive_server and Nginx.
-  if m.group('ip_addr') == '127.0.0.1':
-    return
-
   logging.debug('Emitting successful response metric.')
   metric_fields = {
       'cache': m.group('cache_status'),
@@ -75,19 +91,44 @@
       'endpoint': '',
       'status_code': m.group('status_code'),
   }
-  requested_file_info = _URL_PATH_MATCHER.match(m.group('url_path'))
-  if requested_file_info:
-    metric_fields.update({
-        'action': requested_file_info.group('action'),
-        'bucket': requested_file_info.group('bucket'),
-        'build': requested_file_info.group('build'),
-        'milestone': requested_file_info.group('milestone'),
-        'endpoint': requested_file_info.group('filename'),
-    })
+  metric_fields.update(match_url_path(m.group('url_path')))
   metrics.Counter(_METRIC_NAME).increment_by(int(m.group('size')),
                                              fields=metric_fields)
 
 
+def match_url_path(url):
+  """Extract information from the url by matching it with the appropriate regex.
+
+  Args:
+    url: The url to be matched.
+
+  Returns:
+    A dictionary containing all matched fields and their respective values.
+  """
+  # The url is either in the format /<action_name>?<param1>=<value1>.. or
+  # /<action_name>/<arg1>/<arg2>/..?<param1>=<value1>..
+  # To single out the action name from these urls, first strip the leading '/'
+  # and then handle the cases where the action name can be followed by a '/'
+  # or a '?'.
+  action_name = url.strip('/').split('/')[0].split('?')[0]
+  try:
+    info = _URL_PATH_MATCHER[action_name].match(url)
+    matched_group_dict = info.groupdict()
+    return {
+        'action': matched_group_dict.get('action', ''),
+        'bucket': matched_group_dict.get('bucket', ''),
+        'build': matched_group_dict.get('build', ''),
+        'milestone': matched_group_dict.get('milestone', ''),
+        'endpoint': matched_group_dict.get('filename', ''),
+    }
+  except KeyError:
+    logging.warning('The URL: %s did not match with any of the regexes. May '
+                    'be it is new?', url)
+  except Exception as e:
+    logging.error('Could not extract fields names for %s due to exception: %s',
+                  url, e)
+  return {}
+
 def input_log_file_type(filename):
   """A argparse type function converting input filename to file object.
 
@@ -102,15 +143,15 @@
   """Parses command line arguments."""
   parser = argparse.ArgumentParser(description=__doc__)
   parser.add_argument(
-      "-i", "--input-log-file", metavar='NGINX_ACCESS_LOG_FILE',
+      '-i', '--input-log-file', metavar='NGINX_ACCESS_LOG_FILE',
       dest='input_fd', required=True,
       type=input_log_file_type,
-      help=("Nginx log file of Gs Cache "
-            "(use '-' to indicate reading from sys.stdin).")
+      help=('Nginx log file of Gs Cache '
+            '(use "-" to indicate reading from sys.stdin).')
   )
   parser.add_argument(
-      "-l", "--log-file", default=sys.stdout,
-      help="Log file of this script (default is sys.stdout)."
+      '-l', '--log-file', default=sys.stdout,
+      help='Log file of this script (default is sys.stdout).'
   )
   return parser.parse_args(argv)
 
@@ -136,5 +177,5 @@
       emit_successful_response_metric(_SUCCESS_RESPONSE_MATCHER.match(line))
 
 
-if __name__ == "__main__":
+if __name__ == "__main__":  # pylint: disable=invalid-string-quote
   sys.exit(main(sys.argv[1:]))