crosperf: Add verification of per-process profiles

Verification is very helpful when we record per-process perf profile
and collect top statistics.
Using verification we can make sure that perf profiles are collected
from the hottest functions in top statistics.

BUG=chromium:1098098
TEST=unittests and crosperf with per-process profiling are passing.

Change-Id: Ib79b070babcdea38932ba709ac3e14fa3311f842
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/toolchain-utils/+/2407223
Reviewed-by: Tiancong Wang <tcwang@google.com>
Reviewed-by: Caroline Tice <cmtice@chromium.org>
Commit-Queue: Denis Nikitin <denik@chromium.org>
Tested-by: Denis Nikitin <denik@chromium.org>
diff --git a/crosperf/results_cache.py b/crosperf/results_cache.py
index b435eee..c5c8594 100644
--- a/crosperf/results_cache.py
+++ b/crosperf/results_cache.py
@@ -35,6 +35,14 @@
 CACHE_KEYS_FILE = 'cache_keys.txt'
 
 
+class PidVerificationError(Exception):
+  """Error of perf PID verification in per-process mode."""
+
+
+class PerfDataReadError(Exception):
+  """Error of reading a perf.data header."""
+
+
 class Result(object):
   """Class for holding the results of a single test run.
 
@@ -142,12 +150,12 @@
     # while tast runs hold output under TEST_NAME/.
     # Both ensure to be unique.
     result_dir_name = self.test_name if self.suite == 'tast' else 'results'
-    results_dir = self.FindFilesInResultsDir(
-        '-name %s' % result_dir_name).split('\n')[0]
+    results_dir = self.FindFilesInResultsDir('-name %s' %
+                                             result_dir_name).split('\n')[0]
 
     if not results_dir:
-      self._logger.LogOutput(
-          'WARNING: No results dir matching %r found' % result_dir_name)
+      self._logger.LogOutput('WARNING: No results dir matching %r found' %
+                             result_dir_name)
       return
 
     self.CreateTarball(results_dir, tarball)
@@ -239,8 +247,8 @@
       command = 'cp -r {0}/* {1}'.format(self.results_dir, self.temp_dir)
       self.ce.RunCommand(command, print_to_console=False)
 
-    command = ('./generate_test_report --no-color --csv %s' % (os.path.join(
-        '/tmp', os.path.basename(self.temp_dir))))
+    command = ('./generate_test_report --no-color --csv %s' %
+               (os.path.join('/tmp', os.path.basename(self.temp_dir))))
     _, out, _ = self.ce.ChrootRunCommandWOutput(
         self.chromeos_root, command, print_to_console=False)
     keyvals_dict = {}
@@ -390,8 +398,8 @@
                                                        perf_data_file)
       perf_report_file = '%s.report' % perf_data_file
       if os.path.exists(perf_report_file):
-        raise RuntimeError(
-            'Perf report file already exists: %s' % perf_report_file)
+        raise RuntimeError('Perf report file already exists: %s' %
+                           perf_report_file)
       chroot_perf_report_file = misc.GetInsideChrootPath(
           self.chromeos_root, perf_report_file)
       perf_path = os.path.join(self.chromeos_root, 'chroot', 'usr/bin/perf')
@@ -429,8 +437,8 @@
         if self.log_level != 'verbose':
           self._logger.LogOutput('Perf report generated successfully.')
       else:
-        raise RuntimeError(
-            'Perf report not generated correctly. CMD: %s' % command)
+        raise RuntimeError('Perf report not generated correctly. CMD: %s' %
+                           command)
 
       # Add a keyval to the dictionary for the events captured.
       perf_report_files.append(
@@ -844,6 +852,88 @@
       keyvals[key] = [result, unit]
     return keyvals
 
+  def ReadPidFromPerfData(self):
+    """Read PIDs from perf.data files.
+
+    Extract PID from perf.data if "perf record" was running per process,
+    i.e. with "-p <PID>" and no "-a".
+
+    Returns:
+      pids: list of PIDs.
+
+    Raises:
+      PerfDataReadError when perf.data header reading fails.
+    """
+    cmd = ['/usr/bin/perf', 'report', '--header-only', '-i']
+    pids = []
+
+    for perf_data_path in self.perf_data_files:
+      perf_data_path_in_chroot = misc.GetInsideChrootPath(
+          self.chromeos_root, perf_data_path)
+      path_str = ' '.join(cmd + [perf_data_path_in_chroot])
+      status, output, _ = self.ce.ChrootRunCommandWOutput(
+          self.chromeos_root, path_str)
+      if status:
+        # Error of reading a perf.data profile is fatal.
+        raise PerfDataReadError(f'Failed to read perf.data profile: {path_str}')
+
+      # Pattern to search a line with "perf record" command line:
+      # # cmdline : /usr/bin/perf record -e instructions -p 123
+      cmdline_regex = re.compile(
+          r'^\#\scmdline\s:\s+(?P<cmd>.*perf\s+record\s+.*)$')
+      # Pattern to search PID in a command line.
+      pid_regex = re.compile(r'^.*\s-p\s(?P<pid>\d+)\s*.*$')
+      for line in output.splitlines():
+        cmd_match = cmdline_regex.match(line)
+        if cmd_match:
+          # Found a perf command line.
+          cmdline = cmd_match.group('cmd')
+          # '-a' is a system-wide mode argument.
+          if '-a' not in cmdline.split():
+            # It can be that perf was attached to PID and was still running in
+            # system-wide mode.
+            # We filter out this case here since it's not per-process.
+            pid_match = pid_regex.match(cmdline)
+            if pid_match:
+              pids.append(pid_match.group('pid'))
+          # Stop the search and move to the next perf.data file.
+          break
+      else:
+        # cmdline wasn't found in the header. It's a fatal error.
+        raise PerfDataReadError(f'Perf command line is not found in {path_str}')
+    return pids
+
+  def VerifyPerfDataPID(self):
+    """Verify PIDs in per-process perf.data profiles.
+
+    Check that at least one top process is profiled if perf was running in
+    per-process mode.
+
+    Raises:
+      PidVerificationError if PID verification of per-process perf.data
+      profiles fails.
+    """
+    perf_data_pids = self.ReadPidFromPerfData()
+    if not perf_data_pids:
+      # In system-wide mode there are no PIDs.
+      self._logger.LogOutput('System-wide perf mode. Skip verification.')
+      return
+
+    # PIDs will be present only in per-process profiles.
+    # In this case we need to verify that profiles are collected on the
+    # hottest processes.
+    top_processes = [top_cmd['cmd'] for top_cmd in self.top_cmds]
+    # top_process structure: <cmd>-<pid>
+    top_pids = [top_process.split('-')[-1] for top_process in top_processes]
+    for top_pid in top_pids:
+      if top_pid in perf_data_pids:
+        self._logger.LogOutput('PID verification passed! '
+                               f'Top process {top_pid} is profiled.')
+        return
+    raise PidVerificationError(
+        f'top processes {top_processes} are missing in perf.data traces with'
+        f' PID: {perf_data_pids}.')
+
   def ProcessResults(self, use_cache=False):
     # Note that this function doesn't know anything about whether there is a
     # cache hit or miss. It should process results agnostic of the cache hit
@@ -882,6 +972,9 @@
       cpustats = self.ProcessCpustatsResults()
     if self.top_log_file:
       self.top_cmds = self.ProcessTopResults()
+    # Verify that PID in non system-wide perf.data and top_cmds are matching.
+    if self.perf_data_files and self.top_cmds:
+      self.VerifyPerfDataPID()
     if self.wait_time_log_file:
       with open(self.wait_time_log_file) as f:
         wait_time = f.readline().strip()
@@ -962,8 +1055,8 @@
 
   def CreateTarball(self, results_dir, tarball):
     if not results_dir.strip():
-      raise ValueError(
-          'Refusing to `tar` an empty results_dir: %r' % results_dir)
+      raise ValueError('Refusing to `tar` an empty results_dir: %r' %
+                       results_dir)
 
     ret = self.ce.RunCommand('cd %s && '
                              'tar '
@@ -1013,8 +1106,8 @@
     if ret:
       command = 'rm -rf {0}'.format(temp_dir)
       self.ce.RunCommand(command)
-      raise RuntimeError(
-          'Could not move dir %s to dir %s' % (temp_dir, cache_dir))
+      raise RuntimeError('Could not move dir %s to dir %s' %
+                         (temp_dir, cache_dir))
 
   @classmethod
   def CreateFromRun(cls,
diff --git a/crosperf/results_cache_unittest.py b/crosperf/results_cache_unittest.py
index 812d5ab..91ceed2 100755
--- a/crosperf/results_cache_unittest.py
+++ b/crosperf/results_cache_unittest.py
@@ -21,6 +21,8 @@
 
 from label import MockLabel
 from results_cache import CacheConditions
+from results_cache import PerfDataReadError
+from results_cache import PidVerificationError
 from results_cache import Result
 from results_cache import ResultsCache
 from results_cache import TelemetryResult
@@ -158,6 +160,34 @@
     'b_string_strstr___abcdefghijklmnopqrstuvwxyz__': '0.0134553343333'
 }
 
+PERF_DATA_HEADER = """
+# ========
+# captured on    : Thu Jan 01 00:00:00 1980
+# header version : 1
+# data offset    : 536
+# data size      : 737678672
+# feat offset    : 737679208
+# hostname : localhost
+# os release : 5.4.61
+# perf version :
+# arch : aarch64
+# nrcpus online : 8
+# nrcpus avail : 8
+# total memory : 5911496 kB
+# cmdline : /usr/bin/perf record -e instructions -p {pid}
+# event : name = instructions, , id = ( 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193 ), type = 8, size = 112
+# event : name = dummy:u, , id = ( 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204 ), type = 1, size = 112, config = 0x9
+# CPU_TOPOLOGY info available, use -I to display
+# pmu mappings: software = 1, uprobe = 6, cs_etm = 8, breakpoint = 5, tracepoint = 2, armv8_pmuv3 = 7
+# contains AUX area data (e.g. instruction trace)
+# time of first sample : 0.000000
+# time of last sample : 0.000000
+# sample duration :      0.000 ms
+# missing features: TRACING_DATA CPUDESC CPUID NUMA_TOPOLOGY BRANCH_STACK GROUP_DESC STAT CACHE MEM_TOPOLOGY CLOCKID DIR_FORMAT
+# ========
+#
+"""
+
 TURBOSTAT_LOG_OUTPUT = \
 """CPU     Avg_MHz Busy%   Bzy_MHz TSC_MHz IRQ     CoreTmp
 -       329     12.13   2723    2393    10975   77
@@ -470,6 +500,7 @@
   def setUp(self):
     self.result = Result(self.mock_logger, self.mock_label, 'average',
                          self.mock_cmd_exec)
+    self.result.chromeos_root = '/tmp/chromeos'
 
   @mock.patch.object(os.path, 'isdir')
   @mock.patch.object(command_executer.CommandExecuter, 'RunCommand')
@@ -707,7 +738,8 @@
     self.assertEqual(mock_chrootruncmd.call_count, 1)
     self.assertEqual(
         mock_chrootruncmd.call_args_list[0][0],
-        ('/tmp', ('./generate_test_report --no-color --csv %s') % TMP_DIR1))
+        (self.result.chromeos_root,
+         ('./generate_test_report --no-color --csv %s') % TMP_DIR1))
     self.assertEqual(mock_getpath.call_count, 1)
     self.assertEqual(mock_mkdtemp.call_count, 1)
     self.assertEqual(res, {'Total': [10, 'score'], 'first_time': [680, 'ms']})
@@ -916,6 +948,101 @@
     found_no_logs = self.result.GetCpustatsFile()
     self.assertEqual(found_no_logs, '')
 
+  def test_verify_perf_data_pid_ok(self):
+    """Verify perf PID which is present in TOP_DATA."""
+    self.result.top_cmds = TOP_DATA
+    # pid is present in TOP_DATA.
+    with mock.patch.object(
+        Result, 'ReadPidFromPerfData', return_value=['5713']):
+      self.result.VerifyPerfDataPID()
+
+  def test_verify_perf_data_pid_fail(self):
+    """Test perf PID missing in top raises the error."""
+    self.result.top_cmds = TOP_DATA
+    # pid is not in the list of top processes.
+    with mock.patch.object(
+        Result, 'ReadPidFromPerfData', return_value=['9999']):
+      with self.assertRaises(PidVerificationError):
+        self.result.VerifyPerfDataPID()
+
+  @mock.patch.object(command_executer.CommandExecuter,
+                     'ChrootRunCommandWOutput')
+  def test_read_pid_from_perf_data_ok(self, mock_runcmd):
+    """Test perf header parser, normal flow."""
+    self.result.ce.ChrootRunCommandWOutput = mock_runcmd
+    self.result.perf_data_files = ['/tmp/chromeos/chroot/tmp/results/perf.data']
+    exp_pid = '12345'
+    mock_runcmd.return_value = (0, PERF_DATA_HEADER.format(pid=exp_pid), '')
+    pids = self.result.ReadPidFromPerfData()
+    self.assertEqual(pids, [exp_pid])
+
+  @mock.patch.object(command_executer.CommandExecuter,
+                     'ChrootRunCommandWOutput')
+  def test_read_pid_from_perf_data_mult_profiles(self, mock_runcmd):
+    """Test multiple perf.data files with PID."""
+    self.result.ce.ChrootRunCommandWOutput = mock_runcmd
+    # self.result.chromeos_root = '/tmp/chromeos'
+    self.result.perf_data_files = [
+        '/tmp/chromeos/chroot/tmp/results/perf.data.0',
+        '/tmp/chromeos/chroot/tmp/results/perf.data.1',
+    ]
+    # Each perf.data was recorded per-process ('-p <pid>', no '-a').
+    cmd_line = '# cmdline : /usr/bin/perf record -e instructions -p {pid}'
+    exp_perf_pids = ['1111', '2222']
+    mock_runcmd.side_effect = [
+        (0, cmd_line.format(pid=exp_perf_pids[0]), ''),
+        (0, cmd_line.format(pid=exp_perf_pids[1]), ''),
+    ]
+    pids = self.result.ReadPidFromPerfData()
+    self.assertEqual(pids, exp_perf_pids)
+
+  @mock.patch.object(command_executer.CommandExecuter,
+                     'ChrootRunCommandWOutput')
+  def test_read_pid_from_perf_data_no_pid(self, mock_runcmd):
+    """Test perf.data without PID."""
+    self.result.ce.ChrootRunCommandWOutput = mock_runcmd
+    self.result.perf_data_files = ['/tmp/chromeos/chroot/tmp/results/perf.data']
+    cmd_line = '# cmdline : /usr/bin/perf record -e instructions'
+    mock_runcmd.return_value = (0, cmd_line, '')
+    pids = self.result.ReadPidFromPerfData()
+    # pids is empty.
+    self.assertEqual(pids, [])
+
+  @mock.patch.object(command_executer.CommandExecuter,
+                     'ChrootRunCommandWOutput')
+  def test_read_pid_from_perf_data_system_wide(self, mock_runcmd):
+    """Test reading from system-wide profile with PID."""
+    self.result.ce.ChrootRunCommandWOutput = mock_runcmd
+    self.result.perf_data_files = ['/tmp/chromeos/chroot/tmp/results/perf.data']
+    # There is '-p <pid>' in command line but it's still system-wide: '-a'.
+    cmd_line = '# cmdline : /usr/bin/perf record -e instructions -a -p 1234'
+    mock_runcmd.return_value = (0, cmd_line, '')
+    pids = self.result.ReadPidFromPerfData()
+    # pids should be empty since it's not a per-process profiling.
+    self.assertEqual(pids, [])
+
+  @mock.patch.object(command_executer.CommandExecuter,
+                     'ChrootRunCommandWOutput')
+  def test_read_pid_from_perf_data_read_fail(self, mock_runcmd):
+    """Failure to read perf.data raises the error."""
+    self.result.ce.ChrootRunCommandWOutput = mock_runcmd
+    self.result.perf_data_files = ['/tmp/chromeos/chroot/tmp/results/perf.data']
+    # Error status of the profile read.
+    mock_runcmd.return_value = (1, '', '')
+    with self.assertRaises(PerfDataReadError):
+      self.result.ReadPidFromPerfData()
+
+  @mock.patch.object(command_executer.CommandExecuter,
+                     'ChrootRunCommandWOutput')
+  def test_read_pid_from_perf_data_fail(self, mock_runcmd):
+    """Failure to find cmdline in perf.data header raises the error."""
+    self.result.ce.ChrootRunCommandWOutput = mock_runcmd
+    self.result.perf_data_files = ['/tmp/chromeos/chroot/tmp/results/perf.data']
+    # Empty output.
+    mock_runcmd.return_value = (0, '', '')
+    with self.assertRaises(PerfDataReadError):
+      self.result.ReadPidFromPerfData()
+
   def test_process_turbostat_results_with_valid_data(self):
     """Normal case when log exists and contains valid data."""
     self.result.turbostat_log_file = '/tmp/somelogfile.log'
@@ -1112,10 +1239,11 @@
     # Debug path not found
     self.result.label.debug_path = ''
     tmp = self.result.GeneratePerfReportFiles()
-    self.assertEqual(tmp, ['/tmp/chroot%s' % fake_file])
+    self.assertEqual(tmp, ['/tmp/chromeos/chroot%s' % fake_file])
     self.assertEqual(mock_chrootruncmd.call_args_list[0][0],
-                     ('/tmp', ('/usr/sbin/perf report -n    '
-                               '-i %s --stdio > %s') % (fake_file, fake_file)))
+                     (self.result.chromeos_root,
+                      ('/usr/sbin/perf report -n    '
+                       '-i %s --stdio > %s') % (fake_file, fake_file)))
 
   @mock.patch.object(misc, 'GetInsideChrootPath')
   @mock.patch.object(command_executer.CommandExecuter, 'ChrootRunCommand')
@@ -1130,11 +1258,12 @@
     # Debug path found
     self.result.label.debug_path = '/tmp/debug'
     tmp = self.result.GeneratePerfReportFiles()
-    self.assertEqual(tmp, ['/tmp/chroot%s' % fake_file])
+    self.assertEqual(tmp, ['/tmp/chromeos/chroot%s' % fake_file])
     self.assertEqual(mock_chrootruncmd.call_args_list[0][0],
-                     ('/tmp', ('/usr/sbin/perf report -n --symfs /tmp/debug '
-                               '--vmlinux /tmp/debug/boot/vmlinux  '
-                               '-i %s --stdio > %s') % (fake_file, fake_file)))
+                     (self.result.chromeos_root,
+                      ('/usr/sbin/perf report -n --symfs /tmp/debug '
+                       '--vmlinux /tmp/debug/boot/vmlinux  '
+                       '-i %s --stdio > %s') % (fake_file, fake_file)))
 
   @mock.patch.object(misc, 'GetOutsideChrootPath')
   def test_populate_from_run(self, mock_getpath):
@@ -1183,7 +1312,6 @@
     if mock_getpath:
       pass
     mock.get_path = '/tmp/chromeos/tmp/results_dir'
-    self.result.chromeos_root = '/tmp/chromeos'
 
     self.callGetResultsDir = False
     self.callGetResultsFile = False
@@ -1453,8 +1581,7 @@
             u'crypto-md5__crypto-md5': [10.5, u'ms'],
             u'string-tagcloud__string-tagcloud': [52.8, u'ms'],
             u'access-nbody__access-nbody': [8.5, u'ms'],
-            'retval':
-                0,
+            'retval': 0,
             u'math-spectral-norm__math-spectral-norm': [6.6, u'ms'],
             u'math-cordic__math-cordic': [8.7, u'ms'],
             u'access-binary-trees__access-binary-trees': [4.5, u'ms'],
@@ -1492,8 +1619,7 @@
             u'crypto-md5__crypto-md5': [10.5, u'ms'],
             u'string-tagcloud__string-tagcloud': [52.8, u'ms'],
             u'access-nbody__access-nbody': [8.5, u'ms'],
-            'retval':
-                0,
+            'retval': 0,
             u'math-spectral-norm__math-spectral-norm': [6.6, u'ms'],
             u'math-cordic__math-cordic': [8.7, u'ms'],
             u'access-binary-trees__access-binary-trees': [4.5, u'ms'],
@@ -1722,8 +1848,9 @@
                                 'autotest_dir', 'debug_dir', '/tmp', 'lumpy',
                                 'remote', 'image_args', 'cache_dir', 'average',
                                 'gcc', False, None)
-    self.mock_machine = machine_manager.MockCrosMachine(
-        'falco.cros', '/tmp/chromeos', 'average')
+    self.mock_machine = machine_manager.MockCrosMachine('falco.cros',
+                                                        '/tmp/chromeos',
+                                                        'average')
 
   def test_populate_from_run(self):
 
@@ -1803,10 +1930,12 @@
     def FakeGetMachines(label):
       if label:
         pass
-      m1 = machine_manager.MockCrosMachine(
-          'lumpy1.cros', self.results_cache.chromeos_root, 'average')
-      m2 = machine_manager.MockCrosMachine(
-          'lumpy2.cros', self.results_cache.chromeos_root, 'average')
+      m1 = machine_manager.MockCrosMachine('lumpy1.cros',
+                                           self.results_cache.chromeos_root,
+                                           'average')
+      m2 = machine_manager.MockCrosMachine('lumpy2.cros',
+                                           self.results_cache.chromeos_root,
+                                           'average')
       return [m1, m2]
 
     mock_checksum.return_value = 'FakeImageChecksumabc123'
@@ -1848,10 +1977,12 @@
     def FakeGetMachines(label):
       if label:
         pass
-      m1 = machine_manager.MockCrosMachine(
-          'lumpy1.cros', self.results_cache.chromeos_root, 'average')
-      m2 = machine_manager.MockCrosMachine(
-          'lumpy2.cros', self.results_cache.chromeos_root, 'average')
+      m1 = machine_manager.MockCrosMachine('lumpy1.cros',
+                                           self.results_cache.chromeos_root,
+                                           'average')
+      m2 = machine_manager.MockCrosMachine('lumpy2.cros',
+                                           self.results_cache.chromeos_root,
+                                           'average')
       return [m1, m2]
 
     mock_checksum.return_value = 'FakeImageChecksumabc123'