scripts/gather_builder_stats.py - mirrors/cros/chromiumos/chromite - Git at Google

 # Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """Script for gathering stats from builder runs."""

 from __future__ import division
 from __future__ import print_function

 import collections
 import datetime
 import logging
 import numpy
 import os
 import re
 import sys

 from chromite.cbuildbot import cbuildbot_config
 from chromite.cbuildbot import metadata_lib
 from chromite.cbuildbot import constants
 from chromite.lib import cidb
 from chromite.lib import clactions
 from chromite.lib import commandline
 from chromite.lib import cros_build_lib
 from chromite.lib import gdata_lib
 from chromite.lib import graphite
 from chromite.lib import gs
 from chromite.lib import table


 # Useful config targets.
 CQ_MASTER = constants.CQ_MASTER
 PFQ_MASTER = constants.PFQ_MASTER
 CANARY_MASTER = constants.CANARY_MASTER

 # Bot types
 CQ = constants.CQ
 PRE_CQ = constants.PRE_CQ
 PFQ = constants.PFQ_TYPE
 CANARY = constants.CANARY_TYPE

 # Number of parallel processes used when uploading/downloading GS files.
 MAX_PARALLEL = 40

 # The graphite graphs use seconds since epoch start as time value.
 EPOCH_START = metadata_lib.EPOCH_START

 # Formats we like for output.
 NICE_DATE_FORMAT = metadata_lib.NICE_DATE_FORMAT
 NICE_TIME_FORMAT = metadata_lib.NICE_TIME_FORMAT
 NICE_DATETIME_FORMAT = metadata_lib.NICE_DATETIME_FORMAT

 # Spreadsheet keys
 # CQ master and slaves both use the same spreadsheet
 CQ_SS_KEY = '0AsXDKtaHikmcdElQWVFuT21aMlFXVTN5bVhfQ2ptVFE'
 PFQ_SS_KEY = '0AhFPeDq6pmwxdDdrYXk3cnJJV05jN3Zja0s5VjFfNlE'
 CANARY_SS_KEY = '0AhFPeDq6pmwxdDBWM0t3YnYyeVFoM3VaRXNianc1VVE'

 # These are the preferred base URLs we use to canonicalize bugs/CLs.
 BUGANIZER_BASE_URL = 'b/'
 GUTS_BASE_URL = 't/'
 CROS_BUG_BASE_URL = 'crbug.com/'
 INTERNAL_CL_BASE_URL = 'crosreview.com/i/'
 EXTERNAL_CL_BASE_URL = 'crosreview.com/'


 class GatherStatsError(Exception):
   """Base exception class for exceptions in this module."""


 class DataError(GatherStatsError):
   """Any exception raised when an error occured while collectring data."""


 class SpreadsheetError(GatherStatsError):
   """Raised when there is a problem with the stats spreadsheet."""


 class BadData(DataError):
   """Raised when a json file is still running."""


 class NoDataError(DataError):
   """Returned if a manifest file does not exist."""


 def _SendToCarbon(builds, token_funcs):
   """Send data for |builds| to Carbon/Graphite according to |token_funcs|.

   Args:
     builds: List of BuildData objects.
     token_funcs: List of functors that each take a BuildData object as the only
       argument and return a string.  Each line of data to send to Carbon is
       constructed by taking the strings returned from these functors and
       concatenating them using single spaces.
   """
   lines = [' '.join([str(func(b)) for func in token_funcs]) for b in builds]
   cros_build_lib.Info('Sending %d lines to Graphite now.', len(lines))
   graphite.SendToCarbon(lines)


 # TODO(dgarrett): Discover list from Json. Will better track slave changes.
 def _GetSlavesOfMaster(master_target):
   """Returns list of slave config names for given master config name.

   Args:
     master_target: Name of master config target.

   Returns:
     List of names of slave config targets.
   """
   master_config = cbuildbot_config.config[master_target]
   slave_configs = cbuildbot_config.GetSlavesForMaster(master_config)
   return sorted(slave_config.name for slave_config in slave_configs)


 def _RemoveBuildsWithNoBuildId(builds, description):
   """Remove builds without a build_id.

   They happen during some failure modes, but we can't process them, and
   they don't affect the stats. We do log a warning message if any were
   filtered.

   Args:
     builds: List of metadata_lib.BuildData objects.
     description: A string describing the source of missing build_ids.
   """
   result = [b for b in builds if 'build_id' in b.metadata_dict]
   filtered = len(builds) - len(result)

   if filtered:
     logging.warn('Found %d %s builds without a build_id.',
                  filtered, description)

   return result


 class StatsTable(table.Table):
   """Stats table for any specific target on a waterfall."""

   LINK_ROOT = ('https://uberchromegw.corp.google.com/i/%(waterfall)s/builders/'
                '%(target)s/builds')

   @staticmethod
   def _SSHyperlink(link, text):
     return '=HYPERLINK("%s", "%s")' % (link, text)

   def __init__(self, target, waterfall, columns):
     super(StatsTable, self).__init__(columns, target)
     self.target = target
     self.waterfall = waterfall

   def GetBuildLink(self, build_number):
     target = self.target.replace(' ', '%20')
     link = self.LINK_ROOT % {'waterfall': self.waterfall, 'target': target}
     link += '/%s' % build_number
     return link

   def GetBuildSSLink(self, build_number):
     link = self.GetBuildLink(build_number)
     return self._SSHyperlink(link, 'build %s' % build_number)


 class SpreadsheetMasterTable(StatsTable):
   """Stats table for master builder that puts results in a spreadsheet."""
   # Bump this number whenever this class adds new data columns, or changes
   # the values of existing data columns.
   SHEETS_VERSION = 2

   # These must match up with the column names on the spreadsheet.
   COL_BUILD_NUMBER = 'build number'
   COL_BUILD_LINK = 'build link'
   COL_STATUS = 'status'
   COL_START_DATETIME = 'start datetime'
   COL_RUNTIME_MINUTES = 'runtime minutes'
   COL_WEEKDAY = 'weekday'
   COL_CHROMEOS_VERSION = 'chromeos version'
   COL_CHROME_VERSION = 'chrome version'
   COL_FAILED_STAGES = 'failed stages'
   COL_FAILURE_MESSAGE = 'failure message'

   # It is required that the ID_COL be an integer value.
   ID_COL = COL_BUILD_NUMBER

   COLUMNS = (
       COL_BUILD_NUMBER,
       COL_BUILD_LINK,
       COL_STATUS,
       COL_START_DATETIME,
       COL_RUNTIME_MINUTES,
       COL_WEEKDAY,
       COL_CHROMEOS_VERSION,
       COL_CHROME_VERSION,
       COL_FAILED_STAGES,
       COL_FAILURE_MESSAGE,
   )

   def __init__(self, target, waterfall, columns=None):
     columns = columns or []
     columns = list(SpreadsheetMasterTable.COLUMNS) + columns
     super(SpreadsheetMasterTable, self).__init__(target,
                                                  waterfall,
                                                  columns)

     self._slaves = []

   def _CreateAbortedRowDict(self, build_number):
     """Create a row dict to represent an aborted run of |build_number|."""
     return {
         self.COL_BUILD_NUMBER: str(build_number),
         self.COL_BUILD_LINK: self.GetBuildSSLink(build_number),
         self.COL_STATUS: 'aborted',
     }

   def AppendGapRow(self, build_number):
     """Append a row to represent a missing run of |build_number|."""
     self.AppendRow(self._CreateAbortedRowDict(build_number))

   def AppendBuildRow(self, build_data):
     """Append a row from the given |build_data|.

     Args:
       build_data: A BuildData object.
     """
     # First see if any build number gaps are in the table, and if so fill
     # them in.  This happens when a CQ run is aborted and never writes metadata.
     num_rows = self.GetNumRows()
     if num_rows:
       last_row = self.GetRowByIndex(num_rows - 1)
       last_build_number = int(last_row[self.COL_BUILD_NUMBER])

       for bn in range(build_data.build_number + 1, last_build_number):
         self.AppendGapRow(bn)

     row = self._GetBuildRow(build_data)

     #Use a separate column for each slave.
     slaves = build_data.slaves
     for slave_name in slaves:
       # This adds the slave to our local data, but doesn't add a missing
       # column to the spreadsheet itself.
       self._EnsureSlaveKnown(slave_name)

     # Now add the finished row to this table.
     self.AppendRow(row)

   def _GetBuildRow(self, build_data):
     """Fetch a row dictionary from |build_data|

     Returns:
       A dictionary of the form {column_name: value}
     """
     build_number = build_data.build_number
     build_link = self.GetBuildSSLink(build_number)

     # For datetime.weekday(), 0==Monday and 6==Sunday.
     is_weekday = build_data.start_datetime.weekday() in range(5)

     row = {
         self.COL_BUILD_NUMBER: str(build_number),
         self.COL_BUILD_LINK: build_link,
         self.COL_STATUS: build_data.status,
         self.COL_START_DATETIME: build_data.start_datetime_str,
         self.COL_RUNTIME_MINUTES: str(build_data.runtime_minutes),
         self.COL_WEEKDAY: str(is_weekday),
         self.COL_CHROMEOS_VERSION: build_data.chromeos_version,
         self.COL_CHROME_VERSION: build_data.chrome_version,
         self.COL_FAILED_STAGES: ' '.join(build_data.GetFailedStages()),
         self.COL_FAILURE_MESSAGE: build_data.failure_message,
     }

     slaves = build_data.slaves
     for slave_name in slaves:
       slave = slaves[slave_name]
       slave_url = slave.get('dashboard_url')

       # For some reason status in slaves uses pass/fail instead of
       # passed/failed used elsewhere in metadata.
       translate_dict = {'fail': 'failed', 'pass': 'passed'}
       slave_status = translate_dict.get(slave['status'], slave['status'])

       # Bizarrely, dashboard_url is not always set for slaves that pass.
       # Only sometimes.  crbug.com/350939.
       if slave_url:
         row[slave_name] = self._SSHyperlink(slave_url, slave_status)
       else:
         row[slave_name] = slave_status

     return row

   def _EnsureSlaveKnown(self, slave_name):
     """Ensure that a slave builder name is known.

     Args:
       slave_name: The name of the slave builder (aka spreadsheet column name).
     """
     if not self.HasColumn(slave_name):
       self._slaves.append(slave_name)
       self.AppendColumn(slave_name)

   def GetSlaves(self):
     """Get the list of slave builders which has been discovered so far.

     This list is only fully populated when all row data has been fully
     populated.

     Returns:
       List of column names for slave builders.
     """
     return self._slaves[:]


 class PFQMasterTable(SpreadsheetMasterTable):
   """Stats table for the PFQ Master."""
   SS_KEY = PFQ_SS_KEY

   WATERFALL = 'chromeos'
   # Must match up with name in waterfall.
   TARGET = 'x86-generic nightly chromium PFQ'

   WORKSHEET_NAME = 'PFQMasterData'

   # Bump this number whenever this class adds new data columns, or changes
   # the values of existing data columns.
   SHEETS_VERSION = SpreadsheetMasterTable.SHEETS_VERSION + 1

   # These columns are in addition to those inherited from
   # SpreadsheetMasterTable
   COLUMNS = ()

   def __init__(self):
     super(PFQMasterTable, self).__init__(PFQMasterTable.TARGET,
                                          PFQMasterTable.WATERFALL,
                                          list(PFQMasterTable.COLUMNS))


 class CanaryMasterTable(SpreadsheetMasterTable):
   """Stats table for the Canary Master."""
   SS_KEY = PFQ_SS_KEY

   WATERFALL = 'chromeos'
   # Must match up with name in waterfall.
   TARGET = 'Canary master'

   WORKSHEET_NAME = 'CanaryMasterData'

   # Bump this number whenever this class adds new data columns, or changes
   # the values of existing data columns.
   SHEETS_VERSION = SpreadsheetMasterTable.SHEETS_VERSION

   # These columns are in addition to those inherited from
   # SpreadsheetMasterTable
   COLUMNS = ()

   def __init__(self):
     super(CanaryMasterTable, self).__init__(CanaryMasterTable.TARGET,
                                             CanaryMasterTable.WATERFALL,
                                             list(CanaryMasterTable.COLUMNS))


 class CQMasterTable(SpreadsheetMasterTable):
   """Stats table for the CQ Master."""
   WATERFALL = 'chromeos'
   TARGET = 'CQ master' # Must match up with name in waterfall.

   WORKSHEET_NAME = 'CQMasterData'

   # Bump this number whenever this class adds new data columns, or changes
   # the values of existing data columns.
   SHEETS_VERSION = SpreadsheetMasterTable.SHEETS_VERSION + 2

   COL_CL_COUNT = 'cl count'
   COL_CL_SUBMITTED_COUNT = 'cls submitted'
   COL_CL_REJECTED_COUNT = 'cls rejected'

   # These columns are in addition to those inherited from
   # SpreadsheetMasterTable
   COLUMNS = (
       COL_CL_COUNT,
       COL_CL_SUBMITTED_COUNT,
       COL_CL_REJECTED_COUNT,
   )

   def __init__(self):
     super(CQMasterTable, self).__init__(CQMasterTable.TARGET,
                                         CQMasterTable.WATERFALL,
                                         list(CQMasterTable.COLUMNS))

   def _GetBuildRow(self, build_data):
     """Fetch a row dictionary from |build_data|

     Returns:
       A dictionary of the form {column_name: value}
     """
     row = super(CQMasterTable, self)._GetBuildRow(build_data)
     row[self.COL_CL_COUNT] = str(build_data.count_changes)

     cl_actions = [metadata_lib.CLActionTuple(*x)
                   for x in build_data['cl_actions']]
     submitted_cl_count = len([x for x in cl_actions if
                               x.action == constants.CL_ACTION_SUBMITTED])
     rejected_cl_count = len([x for x in cl_actions
                              if x.action == constants.CL_ACTION_KICKED_OUT])
     row[self.COL_CL_SUBMITTED_COUNT] = str(submitted_cl_count)
     row[self.COL_CL_REJECTED_COUNT] = str(rejected_cl_count)
     return row


 class SSUploader(object):
   """Uploads data from table object to Google spreadsheet."""

   __slots__ = (
       '_creds',          # gdata_lib.Creds object
       '_scomm',          # gdata_lib.SpreadsheetComm object
       'ss_key',          # Spreadsheet key string
   )

   SOURCE = 'Gathered from builder metadata'
   HYPERLINK_RE = re.compile(r'=HYPERLINK\("[^"]+", "([^"]+)"\)')
   DATETIME_FORMATS = ('%m/%d/%Y %H:%M:%S', NICE_DATETIME_FORMAT)

   def __init__(self, creds, ss_key):
     self._creds = creds
     self.ss_key = ss_key
     self._scomm = None

   @classmethod
   def _ValsEqual(cls, val1, val2):
     """Compare two spreadsheet values and return True if they are the same.

     This is non-trivial because of the automatic changes that Google Sheets
     does to values.

     Args:
       val1: New or old spreadsheet value to compare.
       val2: New or old spreadsheet value to compare.

     Returns:
       True if val1 and val2 are effectively the same, False otherwise.
     """
     # An empty string sent to spreadsheet comes back as None.  In any case,
     # treat two false equivalents as equal.
     if not (val1 or val2):
       return True

     # If only one of the values is set to anything then they are not the same.
     if bool(val1) != bool(val2):
       return False

     # If values are equal then we are done.
     if val1 == val2:
       return True

     # Ignore case differences.  This is because, for example, the
     # spreadsheet automatically changes "True" to "TRUE".
     if val1 and val2 and val1.lower() == val2.lower():
       return True

     # If either value is a HYPERLINK, then extract just the text for comparison
     # because that is all the spreadsheet says the value is.
     match = cls.HYPERLINK_RE.search(val1)
     if match:
       return match.group(1) == val2
     match = cls.HYPERLINK_RE.search(val2)
     if match:
       return match.group(2) == val1

     # See if the strings are two different representations of the same datetime.
     dt1, dt2 = None, None
     for dt_format in cls.DATETIME_FORMATS:
       try:
         dt1 = datetime.datetime.strptime(val1, dt_format)
       except ValueError:
         pass
       try:
         dt2 = datetime.datetime.strptime(val2, dt_format)
       except ValueError:
         pass
     if dt1 and dt2 and dt1 == dt2:
       return True

     # If we get this far then the values are just different.
     return False

   def _Connect(self, ws_name):
     """Establish connection to specific worksheet.

     Args:
       ws_name: Worksheet name.
     """
     if self._scomm:
       self._scomm.SetCurrentWorksheet(ws_name)
     else:
       self._scomm = gdata_lib.SpreadsheetComm()
       self._scomm.Connect(self._creds, self.ss_key, ws_name, source=self.SOURCE)

   def GetRowCacheByCol(self, ws_name, key):
     """Fetch the row cache with id=|key|."""
     self._Connect(ws_name)
     ss_key = gdata_lib.PrepColNameForSS(key)
     return self._scomm.GetRowCacheByCol(ss_key)

   def _EnsureColumnsExist(self, data_columns):
     """Ensures that |data_columns| exist in current spreadsheet worksheet.

     Assumes spreadsheet worksheet is already connected.

     Raises:
       SpreadsheetError if any column in |data_columns| is missing from
       the spreadsheet's current worksheet.
     """
     ss_cols = self._scomm.GetColumns()

     # Make sure all columns in data_table are supported in spreadsheet.
     missing_cols = [c for c in data_columns
                     if gdata_lib.PrepColNameForSS(c) not in ss_cols]
     if missing_cols:
       raise SpreadsheetError('Spreadsheet missing column(s): %s' %
                              ', '.join(missing_cols))

   def UploadColumnToWorksheet(self, ws_name, colIx, data):
     """Upload list |data| to column number |colIx| in worksheet |ws_name|.

     This will overwrite any existing data in that column.
     """
     self._Connect(ws_name)
     self._scomm.WriteColumnToWorksheet(colIx, data)

   def UploadSequentialRows(self, ws_name, data_table):
     """Upload |data_table| to the |ws_name| worksheet of sheet at self.ss_key.

     Data will be uploaded row-by-row in ascending ID_COL order. Missing
     values of ID_COL will be filled in by filler rows.

     Args:
       ws_name: Worksheet name for identifying worksheet within spreadsheet.
       data_table: table.Table object with rows to upload to worksheet.
     """
     self._Connect(ws_name)
     cros_build_lib.Info('Uploading stats rows to worksheet "%s" of spreadsheet'
                         ' "%s" now.', self._scomm.ws_name, self._scomm.ss_key)

     cros_build_lib.Debug('Getting cache of current spreadsheet contents.')
     id_col = data_table.ID_COL
     ss_id_col = gdata_lib.PrepColNameForSS(id_col)
     ss_row_cache = self._scomm.GetRowCacheByCol(ss_id_col)

     self._EnsureColumnsExist(data_table.GetColumns())

     # First see if a build_number is being skipped.  Allow the data_table to
     # add default (filler) rows if it wants to.  These rows may represent
     # aborted runs, for example.  ID_COL is assumed to hold integers.
     first_id_val = int(data_table[-1][id_col])
     prev_id_val = first_id_val - 1
     while str(prev_id_val) not in ss_row_cache:
       data_table.AppendGapRow(prev_id_val)
       prev_id_val -= 1
       # Sanity check that we have not created an infinite loop.
       assert prev_id_val >= 0

     # Spreadsheet is organized with oldest runs at the top and newest runs
     # at the bottom (because there is no interface for inserting new rows at
     # the top).  This is the reverse of data_table, so start at the end.
     for row in data_table[::-1]:
       row_dict = dict((gdata_lib.PrepColNameForSS(key), row[key])
                       for key in row)

       # See if row with the same id_col value already exists.
       id_val = row[id_col]
       ss_row = ss_row_cache.get(id_val)

       try:
         if ss_row:
           # Row already exists in spreadsheet.  See if contents any different.
           # Create dict representing values in row_dict different from ss_row.
           row_delta = dict((k, v) for k, v in row_dict.iteritems()
                            if not self._ValsEqual(v, ss_row[k]))
           if row_delta:
             cros_build_lib.Debug('Updating existing spreadsheet row for %s %s.',
                                  id_col, id_val)
             self._scomm.UpdateRowCellByCell(ss_row.ss_row_num, row_delta)
           else:
             cros_build_lib.Debug('Unchanged existing spreadsheet row for'
                                  ' %s %s.', id_col, id_val)
         else:
           cros_build_lib.Debug('Adding spreadsheet row for %s %s.',
                                id_col, id_val)
           self._scomm.InsertRow(row_dict)
       except gdata_lib.SpreadsheetError as e:
         cros_build_lib.Error('Failure while uploading spreadsheet row for'
                              ' %s %s with data %s. Error: %s.', id_col, id_val,
                              row_dict, e)


 class StatsManager(object):
   """Abstract class for managing stats for one config target.

   Subclasses should specify the config target by passing them in to __init__.

   This class handles the following duties:
     1) Read a bunch of metadata.json URLs for the config target that are
        are no older than the given start date.
     2) Upload data to a Google Sheet, if specified by the subclass.
     3) Upload data to Graphite, if specified by the subclass.
   """
   # Subclasses can overwrite any of these.
   TABLE_CLASS = None
   UPLOAD_ROW_PER_BUILD = False
   # To be overridden by subclass. A dictionary mapping a |key| from
   # self.summary to (ws_name, colIx) tuples from the spreadsheet which
   # should be overwritten with the data from the self.summary[key]
   SUMMARY_SPREADSHEET_COLUMNS = {}
   CARBON_FUNCS_BY_VERSION = None
   BOT_TYPE = None

   # Whether to grab a count of what data has been written to sheets before.
   # This is needed if you are writing data to the Google Sheets spreadsheet.
   GET_SHEETS_VERSION = True

   def __init__(self, config_target, db=None, ss_key=None,
                no_sheets_version_filter=False):
     self.builds = []
     self.gs_ctx = gs.GSContext()
     self.config_target = config_target
     self.db = db
     self.ss_key = ss_key
     self.no_sheets_version_filter = no_sheets_version_filter
     self.summary = {}
     self.actions = None


   # pylint: disable=W0613
   def Gather(self, start_date, end_date, sort_by_build_number=True,
              starting_build_number=0, creds=None):
     """Fetches build data into self.builds.

     Args:
       start_date: A datetime.date instance for the earliest build to
                   examine.
       end_date: A datetime.date instance for the latest build to
                 examine.
       sort_by_build_number: Optional boolean. If True, builds will be
                             sorted by build number.
       starting_build_number: The lowest build number to include in
                              self.builds.
       creds: Login credentials as returned by _PrepareCreds. (optional)
     """
     self.builds = self._FetchBuildData(start_date, end_date, self.config_target,
                                        self.gs_ctx)

     if sort_by_build_number:
       # Sort runs by build_number, from newest to oldest.
       cros_build_lib.Info('Sorting by build number now.')
       self.builds = sorted(self.builds, key=lambda b: b.build_number,
                            reverse=True)
     if starting_build_number:
       cros_build_lib.Info('Filtering to include builds after %s (inclusive).',
                           starting_build_number)
       self.builds = [b for b in self.builds
                      if b.build_number >= starting_build_number]

   def CollateActions(self, actions):
     """Collates a list of actions into per-patch and per-cl actions.

     Returns a tuple (per_patch_actions, per_cl_actions) where each are
     a dictionary mapping patches or cls to a list of CLActionWithBuildTuple
     sorted in order of ascending timestamp.
     """
     per_patch_actions = {}
     per_cl_actions = {}
     for a in actions:
       change_with_patch = a.patch
       change_no_patch = metadata_lib.GerritChangeTuple(
           a.change_number, change_with_patch.internal
       )

       per_patch_actions.setdefault(change_with_patch, []).append(a)
       per_cl_actions.setdefault(change_no_patch, []).append(a)

     for p in [per_cl_actions, per_patch_actions]:
       for k, v in p.iteritems():
         p[k] = sorted(v, key=lambda x: x.timestamp)

     return (per_patch_actions, per_cl_actions)

   @classmethod
   def _FetchBuildData(cls, start_date, end_date, config_target, gs_ctx):
     """Fetches BuildData for builds of |config_target| since |start_date|.

     Args:
       start_date: A datetime.date instance.
       end_date: A datetime.date instance for the latest build to
                 examine.
       config_target: String config name to fetch metadata for.
       gs_ctx: A gs.GSContext instance.

     Returns:
       A list of of metadata_lib.BuildData objects that were fetched.
     """
     cros_build_lib.Info('Gathering data for %s from %s until %s',
                         config_target, start_date, end_date)
     urls = metadata_lib.GetMetadataURLsSince(config_target,
                                              start_date,
                                              end_date)
     cros_build_lib.Info('Found %d metadata.json URLs to process.\n'
                         '  From: %s\n  To  : %s', len(urls), urls[0], urls[-1])

     builds = metadata_lib.BuildData.ReadMetadataURLs(
         urls, gs_ctx, get_sheets_version=cls.GET_SHEETS_VERSION)
     cros_build_lib.Info('Read %d total metadata files.', len(builds))
     return builds

   # TODO(akeshet): Return statistics in dictionary rather than just printing
   # them.
   def Summarize(self):
     """Process and print a summary of statistics.

     Returns:
       An empty dictionary. Note: subclasses can extend this method and return
       non-empty dictionaries, with summarized statistics.
     """
     if self.builds:
       cros_build_lib.Info('%d total runs included, from build %d to %d.',
                           len(self.builds), self.builds[-1].build_number,
                           self.builds[0].build_number)
       total_passed = len([b for b in self.builds if b.Passed()])
       cros_build_lib.Info('%d of %d runs passed.', total_passed,
                           len(self.builds))

       # TODO(yjhong): Remove this once there is sufficent data to
       # compare Board-Aware Submission with the baseline. This count
       # may not be entirely accurate because the to-be-submitted
       # changes may fail to submit due to other reasons (e.g., patch
       # modified during the CQ run).
       count = sum(x.get('bs_submission_diff_count', 0) for x in self.builds)
       cros_build_lib.Info(
           'BAS submitted %d more changes than the baseline in the %s failed '
           'builds.', count, len(self.builds) - total_passed)
     else:
       cros_build_lib.Info('No runs included.')
     return {}

   @property
   def sheets_version(self):
     if self.TABLE_CLASS:
       return self.TABLE_CLASS.SHEETS_VERSION

     return -1

   @property
   def carbon_version(self):
     if self.CARBON_FUNCS_BY_VERSION:
       return len(self.CARBON_FUNCS_BY_VERSION) - 1

     return -1

   def UploadToSheet(self, creds):
     assert creds

     if self.UPLOAD_ROW_PER_BUILD:
       self._UploadBuildsToSheet(creds)

     if self.SUMMARY_SPREADSHEET_COLUMNS:
       self._UploadSummaryColumns(creds)

   def _UploadBuildsToSheet(self, creds):
     """Upload row-per-build data to adsheet."""
     if not self.TABLE_CLASS:
       cros_build_lib.Debug('No Spreadsheet uploading configured for %s.',
                            self.config_target)
       return

     # Filter for builds that need to send data to Sheets (unless overridden
     # by command line flag.
     if self.no_sheets_version_filter:
       builds = self.builds
     else:
       version = self.sheets_version
       builds = [b for b in self.builds if b.sheets_version < version]
       cros_build_lib.Info('Found %d builds that need to send Sheets v%d data.',
                           len(builds), version)

     if builds:
       # Fill a data table of type table_class from self.builds.
       # pylint: disable=E1102
       data_table = self.TABLE_CLASS()
       for build in builds:
         try:
           data_table.AppendBuildRow(build)
         except Exception:
           cros_build_lib.Error('Failed to add row for builder_number %s to'
                                ' table.  It came from %s.', build.build_number,
                                build.metadata_url)
           raise

       # Upload data table to sheet.
       uploader = SSUploader(creds, self.ss_key)
       uploader.UploadSequentialRows(data_table.WORKSHEET_NAME, data_table)

   def _UploadSummaryColumns(self, creds):
     """Overwrite summary columns in spreadsheet with appropriate data."""
     # Upload data table to sheet.
     uploader = SSUploader(creds, self.ss_key)
     for key, (ws_name, colIx) in self.SUMMARY_SPREADSHEET_COLUMNS.iteritems():
       uploader.UploadColumnToWorksheet(ws_name, colIx, self.summary[key])


   def SendToCarbon(self):
     if self.CARBON_FUNCS_BY_VERSION:
       for version, func in enumerate(self.CARBON_FUNCS_BY_VERSION):
         # Filter for builds that need to send data to Graphite.
         builds = [b for b in self.builds if b.carbon_version < version]
         cros_build_lib.Info('Found %d builds that need to send Graphite v%d'
                             ' data.', len(builds), version)
         if builds:
           func(self, builds)

   def MarkGathered(self):
     """Mark each metadata.json in self.builds as processed.

     Applies only to StatsManager subclasses that have UPLOAD_ROW_PER_BUILD
     True, as these do not want data from a given build to be re-uploaded.
     """
     if self.UPLOAD_ROW_PER_BUILD:
       metadata_lib.BuildData.MarkBuildsGathered(self.builds,
                                                 self.sheets_version,
                                                 self.carbon_version,
                                                 gs_ctx=self.gs_ctx)


 # TODO(mtennant): This class is an untested placeholder.
 class CQSlaveStats(StatsManager):
   """Stats manager for all CQ slaves."""
   # TODO(mtennant): Add Sheets support for each CQ slave.
   TABLE_CLASS = None
   GET_SHEETS_VERSION = True

   def __init__(self, slave_target, **kwargs):
     super(CQSlaveStats, self).__init__(slave_target, **kwargs)

   # TODO(mtennant): This is totally untested, but is a refactoring of the
   # graphite code that was in place before for CQ slaves.
   def _SendToCarbonV0(self, builds):
     # Send runtime data.
     def _GetGraphName(build):
       bot_id = build['bot-config'].replace('-', '.')
       return 'buildbot.builders.%s.duration_seconds' % bot_id

     _SendToCarbon(builds, (
         _GetGraphName,
         lambda b: b.runtime_seconds,
         lambda b: b.epoch_time_seconds,
     ))


 class CQMasterStats(StatsManager):
   """Manager stats gathering for the Commit Queue Master."""
   TABLE_CLASS = CQMasterTable
   UPLOAD_ROW_PER_BUILD = True
   BOT_TYPE = CQ
   GET_SHEETS_VERSION = True

   def __init__(self, **kwargs):
     super(CQMasterStats, self).__init__(CQ_MASTER, **kwargs)

   def _SendToCarbonV0(self, builds):
     # Send runtime data.
     _SendToCarbon(builds, (
         lambda b: 'buildbot.cq.run_time_seconds',
         lambda b: b.runtime_seconds,
         lambda b: b.epoch_time_seconds,
     ))

     # Send CLs per run data.
     _SendToCarbon(builds, (
         lambda b: 'buildbot.cq.cls_per_run',
         lambda b: b.count_changes,
         lambda b: b.epoch_time_seconds,
     ))

   # Organized by by increasing graphite version numbers, starting at 0.
   CARBON_FUNCS_BY_VERSION = (
       _SendToCarbonV0,
   )


 class PFQMasterStats(StatsManager):
   """Manager stats gathering for the PFQ Master."""
   TABLE_CLASS = PFQMasterTable
   UPLOAD_ROW_PER_BUILD = True
   BOT_TYPE = PFQ
   GET_SHEETS_VERSION = True

   def __init__(self, **kwargs):
     super(PFQMasterStats, self).__init__(PFQ_MASTER, **kwargs)


 class CanaryMasterStats(StatsManager):
   """Manager stats gathering for the Canary Master."""
   TABLE_CLASS = CanaryMasterTable
   UPLOAD_ROW_PER_BUILD = True
   BOT_TYPE = CANARY
   GET_SHEETS_VERSION = True

   def __init__(self, **kwargs):
     super(CanaryMasterStats, self).__init__(CANARY_MASTER, **kwargs)


 class CLStats(StatsManager):
   """Manager for stats about CL actions taken by the Commit Queue."""
   PATCH_HANDLING_TIME_SUMMARY_KEY = 'patch_handling_time'
   SUMMARY_SPREADSHEET_COLUMNS = {
       PATCH_HANDLING_TIME_SUMMARY_KEY: ('PatchHistogram', 1)}
   COL_FAILURE_CATEGORY = 'failure category'
   COL_FAILURE_BLAME = 'bug or bad CL'
   REASON_BAD_CL = 'Bad CL'
   BOT_TYPE = CQ
   GET_SHEETS_VERSION = False

   def __init__(self, email, **kwargs):
     super(CLStats, self).__init__(CQ_MASTER, **kwargs)
     self.actions = []
     self.per_patch_actions = {}
     self.per_cl_actions = {}
     self.email = email
     self.reasons = {}
     self.blames = {}
     self.summary = {}
     self.build_numbers_by_build_id = {}

   def GatherFailureReasons(self, creds):
     """Gather the reasons why our builds failed and the blamed bugs or CLs.

     Args:
       creds: A gdata_lib.Creds object.
     """
     data_table = CQMasterStats.TABLE_CLASS()
     uploader = SSUploader(creds, self.ss_key)
     ss_failure_category = gdata_lib.PrepColNameForSS(self.COL_FAILURE_CATEGORY)
     ss_failure_blame = gdata_lib.PrepColNameForSS(self.COL_FAILURE_BLAME)
     rows = uploader.GetRowCacheByCol(data_table.WORKSHEET_NAME,
                                      data_table.COL_BUILD_NUMBER)
     for b in self.builds:
       try:
         row = rows[str(b.build_number)]
       except KeyError:
         self.reasons[b.build_number] = 'None'
         self.blames[b.build_number] = []
       else:
         self.reasons[b.build_number] = str(row[ss_failure_category])
         self.blames[b.build_number] = self.ProcessBlameString(
             str(row[ss_failure_blame]))

   @staticmethod
   def ProcessBlameString(blame_string):
     """Parse a human-created |blame_string| from the spreadsheet.

     Returns:
       A list of canonicalized URLs for bugs or CLs that appear in the blame
       string. Canonicalized form will be 'crbug.com/1234',
       'crosreview.com/1234', 'b/1234', 't/1234', or 'crosreview.com/i/1234' as
       applicable.
     """
     urls = []
     tokens = blame_string.split()

     # Format to generate the regex patterns. Matches one of provided domain
     # names, followed by lazy wildcard, followed by greedy digit wildcard,
     # followed by optional slash and optional comma.
     general_regex = r'^.*(%s).*?([0-9]+)/?,?$'

     crbug = general_regex % r'crbug.com|code.google.com'
     internal_review = general_regex % (
         r'chrome-internal-review.googlesource.com|crosreview.com/i')
     external_review = general_regex % (
         r'crosreview.com|chromium-review.googlesource.com')
     guts = general_regex % r't/|gutsv\d.corp.google.com/#ticket/'

     # Buganizer regex is different, as buganizer urls do not end with the bug
     # number.
     buganizer = r'^.*(b/|b.corp.google.com/issue\?id=)([0-9]+).*$'

     # Patterns need to be tried in a specific order -- internal review needs
     # to be tried before external review, otherwise urls like crosreview.com/i
     # will be incorrectly parsed as external.
     patterns = [crbug,
                 internal_review,
                 external_review,
                 buganizer,
                 guts]
     url_patterns = [CROS_BUG_BASE_URL,
                     INTERNAL_CL_BASE_URL,
                     EXTERNAL_CL_BASE_URL,
                     BUGANIZER_BASE_URL,
                     GUTS_BASE_URL]

     for t in tokens:
       for p, u in zip(patterns, url_patterns):
         m = re.match(p, t)
         if m:
           urls.append(u + m.group(2))
           break

     return urls

   def Gather(self, start_date, end_date, sort_by_build_number=True,
              starting_build_number=0, creds=None):
     """Fetches build data and failure reasons.

     Args:
       start_date: A datetime.date instance for the earliest build to
                   examine.
       end_date: A datetime.date instance for the latest build to
                 examine.
       sort_by_build_number: Optional boolean. If True, builds will be
                             sorted by build number.
       starting_build_number: The lowest build number from the CQ to include in
                              the results.
       creds: Login credentials as returned by _PrepareCreds. (optional)
     """
     if not creds:
       creds = _PrepareCreds(self.email)
     super(CLStats, self).Gather(start_date,
                                 end_date,
                                 sort_by_build_number=sort_by_build_number,
                                 starting_build_number=starting_build_number)
     self.actions = self.db.GetActionHistory(start_date, end_date)
     self.GatherFailureReasons(creds)

     # Remove builds without a build_id.
     self.builds = _RemoveBuildsWithNoBuildId(self.builds, 'CQ')

     self.build_numbers_by_build_id.update(
         {b['build_id'] : b.build_number for b in self.builds})

   def GetSubmittedPatchNumber(self, actions):
     """Get the patch number of the final patchset submitted.

     This function only makes sense for patches that were submitted.

     Args:
       actions: A list of actions for a single change.
     """
     submit = [a for a in actions if a.action == constants.CL_ACTION_SUBMITTED]
     assert len(submit) == 1, \
         'Expected change to be submitted exactly once, got %r' % submit
     return submit[-1].patch_number

   def ClassifyRejections(self, submitted_changes):
     """Categorize rejected CLs, deciding whether the rejection was incorrect.

     We figure out what patches were falsely rejected by looking for patches
     which were later submitted unmodified after being rejected. These patches
     are considered to be likely good CLs.

     Args:
       submitted_changes: A dict mapping submitted GerritChangeTuple objects to
         a list of associated actions.

     Yields:
       change: The GerritChangeTuple that was rejected.
       actions: A list of actions applicable to the CL.
       a: The reject action that kicked out the CL.
       falsely_rejected: Whether the CL was incorrectly rejected. A CL rejection
         is considered incorrect if the same patch is later submitted, with no
         changes. It's a heuristic.
     """
     for change, actions in submitted_changes.iteritems():
       submitted_patch_number = self.GetSubmittedPatchNumber(actions)
       for a in actions:
         if a.action == constants.CL_ACTION_KICKED_OUT:
           # If the patch wasn't picked up in the run, this means that it "failed
           # to apply" rather than "failed to validate". Ignore it.
           picked_up = [x for x in actions if x.build_id == a.build_id and
                        x.patch == a.patch and
                        x.action == constants.CL_ACTION_PICKED_UP]
           falsely_rejected = a.patch_number == submitted_patch_number
           if picked_up:
             # Check whether the patch was updated after submission.
             yield change, actions, a, falsely_rejected

   def _PrintCounts(self, reasons, fmt):
     """Print a sorted list of reasons in descending order of frequency.

     Args:
       reasons: A key/value mapping mapping the reason to the count.
       fmt: A format string for our log message, containing %(cnt)d
         and %(reason)s.
     """
     d = reasons
     for cnt, reason in sorted(((v, k) for (k, v) in d.items()), reverse=True):
       logging.info(fmt, dict(cnt=cnt, reason=reason))
     if not d:
       logging.info('  None')

   def BotType(self, action):
     """Return whether |action| applies to the CQ or PRE_CQ."""
     build_config = action.build_config
     if build_config.endswith('-%s' % cbuildbot_config.CONFIG_TYPE_PALADIN):
       return CQ
     else:
       return PRE_CQ

   def GoodPatchRejections(self, submitted_changes):
     """Find good patches that were incorrectly rejected.

     Args:
       submitted_changes: A dict mapping submitted GerritChangeTuple objects to
         a list of associated actions.

     Returns:
       A dict, where d[patch] = reject_actions for each good patch that was
       incorrectly rejected.
     """
     # falsely_rejected_changes maps GerritChangeTuple objects to their actions.
     # bad_cl_builds is a set of builds that contain a bad patch.
     falsely_rejected_changes = {}
     bad_cl_builds = set()
     for x in self.ClassifyRejections(submitted_changes):
       _, actions, a, falsely_rejected = x
       if falsely_rejected:
         falsely_rejected_changes[a.patch] = actions
       elif self.BotType(a) == PRE_CQ:
         # If a developer writes a bad patch and it fails the Pre-CQ, it
         # may cause many other patches from the same developer to be
         # rejected. This is expected and correct behavior. Treat all of
         # the patches in the Pre-CQ run as bad so that they don't skew our
         # our statistics.
         #
         # Since we don't have a spreadsheet for the Pre-CQ, we guess what
         # CLs were bad by looking at what patches needed to be changed
         # before submission.
         #
         # NOTE: We intentionally only apply this logic to the Pre-CQ here.
         # The CQ is different because it may have many innocent patches in
         # a single run which should not be treated as bad.
         bad_cl_builds.add(a.build_id)

     # Make a list of candidate patches that got incorrectly rejected. We track
     # them in a dict, setting good_patch_rejections[patch] = rejections for
     # each patch.
     good_patch_rejections = collections.defaultdict(list)
     for v in falsely_rejected_changes.itervalues():
       for a in v:
         if (a.action == constants.CL_ACTION_KICKED_OUT and
             a.build_id not in bad_cl_builds):
           good_patch_rejections[a.patch].append(a)

     return good_patch_rejections

   def FalseRejectionRate(self, good_patch_count, good_patch_rejection_count):
     """Calculate the false rejection ratio.

     This is the chance that a good patch will be rejected by the Pre-CQ or CQ
     in a given run.

     Args:
       good_patch_count: The number of good patches in the run.
       good_patch_rejection_count: A dict containing the number of false
         rejections for the CQ and PRE_CQ.

     Returns:
       A dict containing the false rejection ratios for CQ, PRE_CQ, and combined.
     """
     false_rejection_rate = dict()
     for bot, rejection_count in good_patch_rejection_count.iteritems():
       false_rejection_rate[bot] = (
           rejection_count * 100 / (rejection_count + good_patch_count)
       )
     false_rejection_rate['combined'] = 0
     if good_patch_count:
       rejection_count = sum(good_patch_rejection_count.values())
       false_rejection_rate['combined'] = (
           rejection_count * 100 / (good_patch_count + rejection_count)
       )
     return false_rejection_rate

   def Summarize(self):
     """Process, print, and return a summary of cl action statistics.

     As a side effect, save summary to self.summary.

     Returns:
       A dictionary summarizing the statistics.
     """
     super_summary = super(CLStats, self).Summarize()

     (self.per_patch_actions,
      self.per_cl_actions) = self.CollateActions(self.actions)

     submit_actions = [a for a in self.actions
                       if a.action == constants.CL_ACTION_SUBMITTED]
     reject_actions = [a for a in self.actions
                       if a.action == constants.CL_ACTION_KICKED_OUT]
     sbfail_actions = [a for a in self.actions
                       if a.action == constants.CL_ACTION_SUBMIT_FAILED]

     build_reason_counts = {}
     for reason in self.reasons.values():
       if reason != 'None':
         build_reason_counts[reason] = build_reason_counts.get(reason, 0) + 1

     unique_blames = set()
     for blames in self.blames.itervalues():
       unique_blames.update(blames)

     unique_cl_blames = {blame for blame in unique_blames if
                         EXTERNAL_CL_BASE_URL in blame}

     submitted_changes = {k: v for k, v, in self.per_cl_actions.iteritems()
                          if any(a.action == constants.CL_ACTION_SUBMITTED
                                 for a in v)}
     submitted_patches = {
         k: v for k, v, in self.per_patch_actions.iteritems()
         if any(a.action == constants.CL_ACTION_SUBMITTED and
                a.build_config == constants.CQ_MASTER for a in v)}

     patch_handle_times = [
         clactions.GetCLHandlingTime(patch, actions) for
         (patch, actions) in submitted_patches.iteritems()]

     pre_cq_handle_times = [
         clactions.GetPreCQTime(patch, actions) for
         (patch, actions) in submitted_patches.iteritems()]

     cq_wait_times = [
         clactions.GetCQWaitTime(patch, actions) for
         (patch, actions) in submitted_patches.iteritems()]

     cq_handle_times = [
         clactions.GetCQRunTime(patch, actions) for
         (patch, actions) in submitted_patches.iteritems()]

     # Count CLs that were rejected, then a subsequent patch was submitted.
     # These are good candidates for bad CLs. We track them in a dict, setting
     # submitted_after_new_patch[bot_type][patch] = actions for each bad patch.
     submitted_after_new_patch = {}
     for x in self.ClassifyRejections(submitted_changes):
       change, actions, a, falsely_rejected = x
       if not falsely_rejected:
         d = submitted_after_new_patch.setdefault(self.BotType(a), {})
         d[change] = actions

     # Sort the candidate bad CLs in order of submit time.
     bad_cl_candidates = {}
     for bot_type, patch_actions in submitted_after_new_patch.items():
       bad_cl_candidates[bot_type] = [
           k for k, _ in sorted(patch_actions.items(),
                                key=lambda x: x[1][-1].timestamp)]

     # Calculate how many good patches were falsely rejected and why.
     # good_patch_rejections maps patches to the rejection actions.
     # patch_reason_counts maps failure reasons to counts.
     # patch_blame_counts maps blame targets to counts.
     good_patch_rejections = self.GoodPatchRejections(submitted_changes)
     patch_reason_counts = {}
     patch_blame_counts = {}
     for k, v in good_patch_rejections.iteritems():
       for a in v:
         if a.action == constants.CL_ACTION_KICKED_OUT:
           build_number = self.build_numbers_by_build_id.get(a.build_id)
           if self.BotType(a) == CQ and build_number:
             reason = self.reasons.get(build_number, 'None')
             blames = self.blames.get(build_number, ['None'])
             patch_reason_counts[reason] = patch_reason_counts.get(reason, 0) + 1
             for blame in blames:
               patch_blame_counts[blame] = patch_blame_counts.get(blame, 0) + 1

     # good_patch_count: The number of good patches.
     # good_patch_rejection_count maps the bot type (CQ or PRE_CQ) to the number
     #   of times that bot has falsely rejected good patches.
     good_patch_count = len(submit_actions)
     good_patch_rejection_count = collections.defaultdict(int)
     for k, v in good_patch_rejections.iteritems():
       for a in v:
         good_patch_rejection_count[self.BotType(a)] += 1
     false_rejection_rate = self.FalseRejectionRate(good_patch_count,
                                                    good_patch_rejection_count)

     # This list counts how many times each good patch was rejected.
     rejection_counts = [0] * (good_patch_count - len(good_patch_rejections))
     rejection_counts += [len(x) for x in good_patch_rejections.values()]

     # Break down the frequency of how many times each patch is rejected.
     good_patch_rejection_breakdown = []
     if rejection_counts:
       for x in range(max(rejection_counts) + 1):
         good_patch_rejection_breakdown.append((x, rejection_counts.count(x)))

     summary = {
         'total_cl_actions': len(self.actions),
         'unique_cls': len(self.per_cl_actions),
         'unique_patches': len(self.per_patch_actions),
         'submitted_patches': len(submit_actions),
         'rejections': len(reject_actions),
         'submit_fails': len(sbfail_actions),
         'good_patch_rejections': sum(rejection_counts),
         'mean_good_patch_rejections': numpy.mean(rejection_counts),
         'good_patch_rejection_breakdown': good_patch_rejection_breakdown,
         'good_patch_rejection_count': dict(good_patch_rejection_count),
         'false_rejection_rate': false_rejection_rate,
         'median_handling_time': numpy.median(patch_handle_times),
         self.PATCH_HANDLING_TIME_SUMMARY_KEY: patch_handle_times,
         'bad_cl_candidates': bad_cl_candidates,
         'unique_blames_change_count': len(unique_cl_blames),
     }

     logging.info('CQ committed %s changes', summary['submitted_patches'])
     logging.info('CQ correctly rejected %s unique changes',
                  summary['unique_blames_change_count'])
     logging.info('pre-CQ and CQ incorrectly rejected %s changes a total of '
                  '%s times (pre-CQ: %s; CQ: %s)',
                  len(good_patch_rejections),
                  sum(good_patch_rejection_count.values()),
                  good_patch_rejection_count[PRE_CQ],
                  good_patch_rejection_count[CQ])

     logging.info('      Total CL actions: %d.', summary['total_cl_actions'])
     logging.info('    Unique CLs touched: %d.', summary['unique_cls'])
     logging.info('Unique patches touched: %d.', summary['unique_patches'])
     logging.info('   Total CLs submitted: %d.', summary['submitted_patches'])
     logging.info('      Total rejections: %d.', summary['rejections'])
     logging.info(' Total submit failures: %d.', summary['submit_fails'])
     logging.info(' Good patches rejected: %d.',
                  len(good_patch_rejections))
     logging.info('   Mean rejections per')
     logging.info('            good patch: %.2f',
                  summary['mean_good_patch_rejections'])
     logging.info(' False rejection rate for CQ: %.1f%%',
                  summary['false_rejection_rate'].get(CQ, 0))
     logging.info(' False rejection rate for Pre-CQ: %.1f%%',
                  summary['false_rejection_rate'].get(PRE_CQ, 0))
     logging.info(' Combined false rejection rate: %.1f%%',
                  summary['false_rejection_rate']['combined'])

     for x, p in summary['good_patch_rejection_breakdown']:
       logging.info('%d good patches were rejected %d times.', p, x)
     logging.info('')
     logging.info('Good patch handling time:')
     logging.info('  10th percentile: %.2f hours',
                  numpy.percentile(patch_handle_times, 10) / 3600.0)
     logging.info('  25th percentile: %.2f hours',
                  numpy.percentile(patch_handle_times, 25) / 3600.0)
     logging.info('  50th percentile: %.2f hours',
                  summary['median_handling_time'] / 3600.0)
     logging.info('  75th percentile: %.2f hours',
                  numpy.percentile(patch_handle_times, 75) / 3600.0)
     logging.info('  90th percentile: %.2f hours',
                  numpy.percentile(patch_handle_times, 90) / 3600.0)
     logging.info('')
     logging.info('Time spent in Pre-CQ:')
     logging.info('  10th percentile: %.2f hours',
                  numpy.percentile(pre_cq_handle_times, 10) / 3600.0)
     logging.info('  25th percentile: %.2f hours',
                  numpy.percentile(pre_cq_handle_times, 25) / 3600.0)
     logging.info('  50th percentile: %.2f hours',
                  numpy.percentile(pre_cq_handle_times, 50) / 3600.0)
     logging.info('  75th percentile: %.2f hours',
                  numpy.percentile(pre_cq_handle_times, 75) / 3600.0)
     logging.info('  90th percentile: %.2f hours',
                  numpy.percentile(pre_cq_handle_times, 90) / 3600.0)
     logging.info('')
     logging.info('Time spent waiting for CQ:')
     logging.info('  10th percentile: %.2f hours',
                  numpy.percentile(cq_wait_times, 10) / 3600.0)
     logging.info('  25th percentile: %.2f hours',
                  numpy.percentile(cq_wait_times, 25) / 3600.0)
     logging.info('  50th percentile: %.2f hours',
                  numpy.percentile(cq_wait_times, 50) / 3600.0)
     logging.info('  75th percentile: %.2f hours',
                  numpy.percentile(cq_wait_times, 75) / 3600.0)
     logging.info('  90th percentile: %.2f hours',
                  numpy.percentile(cq_wait_times, 90) / 3600.0)
     logging.info('')
     logging.info('Time spent in CQ:')
     logging.info('  10th percentile: %.2f hours',
                  numpy.percentile(cq_handle_times, 10) / 3600.0)
     logging.info('  25th percentile: %.2f hours',
                  numpy.percentile(cq_handle_times, 25) / 3600.0)
     logging.info('  50th percentile: %.2f hours',
                  numpy.percentile(cq_handle_times, 50) / 3600.0)
     logging.info('  75th percentile: %.2f hours',
                  numpy.percentile(cq_handle_times, 75) / 3600.0)
     logging.info('  90th percentile: %.2f hours',
                  numpy.percentile(cq_handle_times, 90) / 3600.0)
     logging.info('')

     for bot_type, patches in summary['bad_cl_candidates'].items():
       logging.info('%d bad patch candidates were rejected by the %s',
                    len(patches), bot_type)
       for k in patches:
         logging.info('Bad patch candidate in: CL:%s%s',
                      constants.INTERNAL_CHANGE_PREFIX
                      if k.internal else constants.EXTERNAL_CHANGE_PREFIX,
                      k.gerrit_number)

     fmt_fai = '  %(cnt)d failures in %(reason)s'
     fmt_rej = '  %(cnt)d rejections due to %(reason)s'

     logging.info('Reasons why good patches were rejected:')
     self._PrintCounts(patch_reason_counts, fmt_rej)

     logging.info('Bugs or CLs responsible for good patches rejections:')
     self._PrintCounts(patch_blame_counts, fmt_rej)

     logging.info('Reasons why builds failed:')
     self._PrintCounts(build_reason_counts, fmt_fai)

     super_summary.update(summary)
     self.summary = super_summary
     return super_summary

 # TODO(mtennant): Add token file support.  See upload_package_status.py.
 def _PrepareCreds(email, password=None):
   """Return a gdata_lib.Creds object from given credentials.

   Args:
     email: Email address.
     password: Password string.  If not specified then a password
       prompt will be used.

   Returns:
     A gdata_lib.Creds object.
   """
   creds = gdata_lib.Creds()
   creds.SetCreds(email, password)
   return creds


 def _CheckOptions(options):
   # Ensure that specified start date is in the past.
   now = datetime.datetime.now()
   if options.start_date and now.date() < options.start_date:
     cros_build_lib.Error('Specified start date is in the future: %s',
                          options.start_date)
     return False

   # The --save option requires --email.
   if options.save and not options.email:
     cros_build_lib.Error('You must specify --email with --save.')
     return False

   # The --cl-actions option requires --email.
   if options.cl_actions and not options.email:
     cros_build_lib.Error('You must specify --email with --cl-actions.')
     return False

   return True


 def GetParser():
   """Creates the argparse parser."""
   parser = commandline.ArgumentParser(description=__doc__)

   # Put options that control the mode of script into mutually exclusive group.
   mode = parser.add_mutually_exclusive_group(required=True)
   mode.add_argument('--cq-master', action='store_true', default=False,
                     help='Gather stats for the CQ master.')
   mode.add_argument('--pfq-master', action='store_true', default=False,
                     help='Gather stats for the PFQ master.')
   mode.add_argument('--canary-master', action='store_true', default=False,
                     help='Gather stats for the Canary master.')
   mode.add_argument('--cq-slaves', action='store_true', default=False,
                     help='Gather stats for all CQ slaves.')
   mode.add_argument('--cl-actions', action='store_true', default=False,
                     help='Gather stats about CL actions taken by the CQ '
                          'master')
   # TODO(mtennant): Other modes as they make sense, like maybe --release.

   mode = parser.add_mutually_exclusive_group(required=True)
   mode.add_argument('--start-date', action='store', type='date', default=None,
                     help='Limit scope to a start date in the past.')
   mode.add_argument('--past-month', action='store_true', default=False,
                     help='Limit scope to the past 30 days up to now.')
   mode.add_argument('--past-week', action='store_true', default=False,
                     help='Limit scope to the past week up to now.')
   mode.add_argument('--past-day', action='store_true', default=False,
                     help='Limit scope to the past day up to now.')

   parser.add_argument('--starting-build', action='store', type=int, default=0,
                       help='Filter to builds after given number (inclusive).')
   parser.add_argument('--end-date', action='store', type='date', default=None,
                       help='Limit scope to an end date in the past.')
   parser.add_argument('--save', action='store_true', default=False,
                       help='Save results to DB, if applicable.')
   parser.add_argument('--email', action='store', type=str, default=None,
                       help='Specify email for Google Sheets account to use.')

   mode = parser.add_argument_group('Advanced (use at own risk)')
   mode.add_argument('--no-upload', action='store_false', default=True,
                     dest='upload',
                     help='Skip uploading results to spreadsheet')
   mode.add_argument('--no-carbon', action='store_false', default=True,
                     dest='carbon',
                     help='Skip sending results to carbon/graphite')
   mode.add_argument('--no-mark-gathered', action='store_false', default=True,
                     dest='mark_gathered',
                     help='Skip marking results as gathered.')
   mode.add_argument('--no-sheets-version-filter', action='store_true',
                     default=False,
                     help='Upload all parsed metadata to spreasheet regardless '
                          'of sheets version.')
   mode.add_argument('--override-ss-key', action='store', default=None,
                     dest='ss_key',
                     help='Override spreadsheet key.')
   parser.add_argument('--cred-dir', action='store', required=True,
                       metavar='CIDB_CREDENTIALS_DIR',
                       help='Database credentials directory with certificates '
                            'and other connection information. Obtain your '
                            'credentials at go/cros-cidb-admin .')

   return parser


 def main(argv):
   parser = GetParser()
   options = parser.parse_args(argv)

   if not _CheckOptions(options):
     sys.exit(1)

   db = cidb.CIDBConnection(options.cred_dir)

   if options.end_date:
     end_date = options.end_date
   else:
     end_date = datetime.datetime.now().date()

   # Determine the start date to use, which is required.
   if options.start_date:
     start_date = options.start_date
   else:
     assert options.past_month or options.past_week or options.past_day
     if options.past_month:
       start_date = end_date - datetime.timedelta(days=30)
     elif options.past_week:
       start_date = end_date - datetime.timedelta(days=7)
     else:
       start_date = end_date - datetime.timedelta(days=1)

   # Prepare the rounds of stats gathering to do.
   stats_managers = []

   if options.cq_master:
     stats_managers.append(
         CQMasterStats(
             db=db,
             ss_key=options.ss_key or CQ_SS_KEY,
             no_sheets_version_filter=options.no_sheets_version_filter))

   if options.cl_actions:
     # CL stats manager uses the CQ spreadsheet to fetch failure reasons
     stats_managers.append(
         CLStats(
             options.email,
             db=db,
             ss_key=options.ss_key or CQ_SS_KEY,
             no_sheets_version_filter=options.no_sheets_version_filter))

   if options.pfq_master:
     stats_managers.append(
         PFQMasterStats(
             db=db,
             ss_key=options.ss_key or PFQ_SS_KEY,
             no_sheets_version_filter=options.no_sheets_version_filter))

   if options.canary_master:
     stats_managers.append(
         CanaryMasterStats(
             db=db,
             ss_key=options.ss_key or CANARY_SS_KEY,
             no_sheets_version_filter=options.no_sheets_version_filter))

   if options.cq_slaves:
     targets = _GetSlavesOfMaster(CQ_MASTER)
     for target in targets:
       # TODO(mtennant): Add spreadsheet and/or graphite support for cq-slaves.
       stats_managers.append(CQSlaveStats(target, db=db))

   # If options.save is set and any of the instructions include a table class,
   # or specify summary columns for upload, prepare spreadsheet creds object
   # early.
   creds = None
   if options.save and any((stats.UPLOAD_ROW_PER_BUILD or
                            stats.SUMMARY_SPREADSHEET_COLUMNS)
                           for stats in stats_managers):
     # TODO(mtennant): See if this can work with two-factor authentication.
     # TODO(mtennant): Eventually, we probably want to use 90-day certs to
     # run this as a cronjob on a ganeti instance.
     creds = _PrepareCreds(options.email)

   # Now run through all the stats gathering that is requested.
   for stats_mgr in stats_managers:
     stats_mgr.Gather(start_date, end_date,
                      starting_build_number=options.starting_build,
                      creds=creds)
     stats_mgr.Summarize()

     if options.save:
       # Send data to spreadsheet, if applicable.
       if options.upload:
         stats_mgr.UploadToSheet(creds)

       # Send data to Carbon/Graphite, if applicable.
       if options.carbon:
         stats_mgr.SendToCarbon()

       # Mark these metadata.json files as processed.
       if options.mark_gathered:
         stats_mgr.MarkGathered()

     cros_build_lib.Info('Finished with %s.\n\n', stats_mgr.config_target)


 # Background: This function logs the number of tryjob runs, both internal
 # and external, to Graphite.  It gets the data from git logs.  It was in
 # place, in a very different form, before the migration to
 # gather_builder_stats.  It is simplified here, but entirely untested and
 # not plumbed into gather_builder_stats anywhere.
 def GraphiteTryJobInfoUpToNow(internal, start_date, end_date):
   """Find the amount of tryjobs that finished on a particular day.

   Args:
     internal: If true report for internal, if false report external.
     start_date: datetime.date object for date to start on.
     end_date: datetime.date object for date to end on.
   """
   carbon_lines = []

   # Apparently scottz had 'trybot' and 'trybot-internal' checkouts in
   # his home directory which this code relied on.  Any new solution that
   # also relies on git logs will need a place to look for them.
   if internal:
     repo_path = '/some/path/to/trybot-internal'
     marker = 'internal'
   else:
     repo_path = '/some/path/to/trybot'
     marker = 'external'

   # Make sure the trybot checkout is up to date.
   os.chdir(repo_path)
   cros_build_lib.RunCommand(['git', 'pull'], cwd=repo_path)

   # Now get a list of datetime objects, in hourly deltas.
   start = datetime.datetime(start_date.year, start_date.month, start_date.day)
   hour_delta = datetime.timedelta(hours=1)
   end = start + hour_delta
   while end < end_date:
     git_cmd = ['git', 'log', '--since="%s"' % start,
                '--until="%s"' % end, '--name-only', '--pretty=format:']
     result = cros_build_lib.RunCommand(git_cmd, cwd=repo_path)

     # Expect one line per tryjob run in the specified hour.
     count = len(l for l in result.output.splitlines() if l.strip())

     carbon_lines.append('buildbot.tryjobs.%s.hourly %s %s' %
                         (marker, count, (end - EPOCH_START).total_seconds()))

   graphite.SendToCarbon(carbon_lines)