TLB counter verification tests - corrections.

BUG=None

TEST=Run on wolf board

Change-Id: I932b3dc8c72c84a95e90c7d018bc388d6a4560f3
Reviewed-on: https://chromium-review.googlesource.com/293003
Commit-Ready: Nemanja Vasić <nvasic@google.com>
Tested-by: Nemanja Vasić <nvasic@google.com>
Reviewed-by: David Sharp <dhsharp@google.com>
diff --git a/client/site_tests/hardware_PerfCounterVerification/control b/client/site_tests/hardware_PerfCounterVerification/control
index 0576ffd..8c79ac4 100644
--- a/client/site_tests/hardware_PerfCounterVerification/control
+++ b/client/site_tests/hardware_PerfCounterVerification/control
@@ -17,10 +17,22 @@
 Arguments:
   events: Events to pass to perf stat -e. Events are passed in together, so
     multiplexing may occur if more than one is specified.
+  program: Benchmark binary name, resolved relative to the test's src dir.
+  multiplier: Loop-count multiplier applied at each benchmark step.
 """
 
 job.run_test('hardware_PerfCounterVerification', tag='cycles_instructions',
-             perf_cmd='stat', events=('cycles', 'instructions'))
+             perf_cmd='stat', events=('cycles', 'instructions'),
+             program='noploop', multiplier=10000000)
 
 job.run_test('hardware_PerfCounterVerification', tag='LBR',
-             perf_cmd='record -b', events=('br_inst_retired.all_branches',))
+             perf_cmd='record -b', events=('br_inst_retired.all_branches',),
+             program='noploop', multiplier=10000000)
+
+job.run_test('hardware_PerfCounterVerification', tag='iTLB_misses',
+             perf_cmd='stat', events=('iTLB-misses','cycles'),
+             program='iTLB_benchmark', multiplier=100)
+
+job.run_test('hardware_PerfCounterVerification', tag='dTLB_misses',
+             perf_cmd='stat', events=('dTLB-misses','cycles'),
+             program='dTLB_benchmark', multiplier=10000)
diff --git a/client/site_tests/hardware_PerfCounterVerification/hardware_PerfCounterVerification.py b/client/site_tests/hardware_PerfCounterVerification/hardware_PerfCounterVerification.py
index 4043dea..a98cbee 100644
--- a/client/site_tests/hardware_PerfCounterVerification/hardware_PerfCounterVerification.py
+++ b/client/site_tests/hardware_PerfCounterVerification/hardware_PerfCounterVerification.py
@@ -27,9 +27,11 @@
 
     For cycles and instructions, we expect a strong correlation between
     the number of iterations of a "noploop" program and the number of
-    cycles and instructions. That is, each loop iteration should retire
-    a constant number of additional instructions, and should take a
-    nearly constant number of additional cycles.
+    cycles and instructions. For TLB misses, we expect a strong correlation
+    between the number of misses and the number of iterations of a matching benchmark.
+    Each loop iteration should retire a constant number of additional
+    instructions, and should take a nearly constant number of additional
+    cycles or misses.
     """
 
     version = 1
@@ -55,16 +57,16 @@
         if board in unsupported_boards:
             raise error.TestNAError('Unsupported board')
 
-    def run_once(self, **kwargs):
-        noploop = os.path.join(self.srcdir, 'noploop')
+    def run_once(self, program, multiplier, **kwargs):
+        program = os.path.join(self.srcdir, program)
         if self.perf_cmd == 'stat':
             self.facts = perf_verification.GatherPerfStats(
-                    noploop, ','.join(self.events))
+                    program, ','.join(self.events), multiplier)
         elif self.perf_cmd == 'record -b':
             branch = perf_lbr_verification.ReadBranchAddressesFile(
                     os.path.join(self.srcdir, 'noploop_branch.txt'))
             self.facts = perf_lbr_verification.GatherPerfBranchSamples(
-                    noploop, branch, ','.join(self.events),
+                    program, branch, ','.join(self.events),
                     10000)
         else:
             raise error.TestError('Unrecognized perf_cmd')
@@ -79,14 +81,22 @@
                               ('branch_count', numpy.int)])
         arr = stats_utils.FactsToNumpyArray(self.facts, dt)
         results = {}
+        is_tlb_benchmark = ('iTLB-misses' in dt.names or
+                            'dTLB-misses' in dt.names)
         for y_var in dt.names:
             if y_var == 'loops': continue
+            if y_var == 'cycles' and is_tlb_benchmark: continue
             (slope, intercept), r2 = stats_utils.LinearRegression(
                     arr['loops'], arr[y_var])
             prefix = y_var + '_'
             results[prefix+'slope'] = slope
             results[prefix+'intercept'] = intercept
             results[prefix+'r_squared'] = r2
+            if y_var in ('dTLB-misses', 'iTLB-misses'):
+                misses_per_million_cycles = [x[y_var] * 1.0e6 / x['cycles']
+                                            for x in self.facts]
+                rvar = prefix+'misses_per_million_cycles'
+                results[rvar] = numpy.max(misses_per_million_cycles)
 
         self.write_perf_keyval(results)
 
@@ -97,13 +107,18 @@
         else:
             cycles_r_squared_expectation = 0.999
 
-        if ('cycles' in self.events and
+        if ('cycles' in self.events and not is_tlb_benchmark and
             results['cycles_r_squared'] < cycles_r_squared_expectation):
             raise error.TestFail('Poor correlation for cycles ~ loops')
         if ('instructions' in self.events and
             results['instructions_r_squared'] < 0.999999):
             raise error.TestFail('Poor correlation for instructions ~ loops')
-
+        if ('iTLB-misses' in self.events and
+            results['iTLB-misses_r_squared'] < 0.999):
+            raise error.TestFail('Poor correlation for iTLB-misses ~ loops')
+        if ('dTLB-misses' in self.events and
+            results['dTLB-misses_r_squared'] < 0.999):
+            raise error.TestFail('Poor correlation for dTLB-misses ~ loops')
         if (self.perf_cmd == 'record -b' and
             results['branch_count_r_squared'] < 0.9999999):
             raise error.TestFail('Poor correlation for branch_count ~ loops')
diff --git a/client/site_tests/hardware_PerfCounterVerification/perf_verification.py b/client/site_tests/hardware_PerfCounterVerification/perf_verification.py
index 05ebbc4..dbc033c 100755
--- a/client/site_tests/hardware_PerfCounterVerification/perf_verification.py
+++ b/client/site_tests/hardware_PerfCounterVerification/perf_verification.py
@@ -16,12 +16,14 @@
     """Module error class."""
 
 
-def GatherPerfStats(noploop, events, progress_func=lambda i, j: None):
-    """Run perf stat with the given events and noploop program.
+def GatherPerfStats(program, events, multiplier=1000,
+                    progress_func=lambda i, j: None):
+    """Run perf stat with the given events and given program.
 
-    @param noploop: path to noploop binary. It should take one argument (number
-        of loop iterations) and produce no output.
+    @param program: path to benchmark binary. It should take one argument
+        (number of loop iterations) and produce no output.
     @param events: value to pass to '-e' arg of perf stat.
+    @param multiplier: loop multiplier
     @param progress_func: function that tracks progress of running the
         benchmark. takes two arguments for the outer and inner iteration
         numbers.
@@ -30,11 +32,11 @@
     facts = []
     for i, j in itertools.product(xrange(10), xrange(5)):
         progress_func(i, j)
-        loops = (i+1) * 10000000  # (i+1) * 10 million
+        loops = (i+1) * multiplier
         out = subprocess.check_output(
                 ('perf', 'stat', '-x', ',',
                  '-e', events,
-                 noploop, '%d' % loops),
+                 program, '%d' % loops),
                 stderr=subprocess.STDOUT)
         unsupported_events = []
         f = {'loops': loops}
@@ -74,7 +76,7 @@
 
     events = ('cycles', 'instructions')
     facts = GatherPerfStats('src/noploop', ','.join(events),
-                            progress_func=_Progress)
+                            multiplier=10*1000*1000, progress_func=_Progress)
 
     dt = numpy.dtype([('loops', numpy.int)] +
                      [(e, numpy.int) for e in events])
diff --git a/client/site_tests/hardware_PerfCounterVerification/src/Makefile b/client/site_tests/hardware_PerfCounterVerification/src/Makefile
index 820badb..b96642f 100644
--- a/client/site_tests/hardware_PerfCounterVerification/src/Makefile
+++ b/client/site_tests/hardware_PerfCounterVerification/src/Makefile
@@ -1,6 +1,8 @@
 CFLAGS=-O0 -g
 
-OUTPUTS=noploop noploop_branch.txt
+BINS=iTLB_benchmark dTLB_benchmark noploop
+OBJS=iTLB_benchmark.o dTLB_benchmark.o iTLB_benchmark_function.o
+OUTPUTS=$(BINS) $(OBJS) iTLB_benchmark_function.c noploop_branch.txt
 
 all: $(OUTPUTS)
 
@@ -9,5 +11,12 @@
 noploop_branch.txt: noploop
 	./find_loop_instructions.py $< > $@
 
+iTLB_benchmark: iTLB_benchmark.o iTLB_benchmark_function.o
+
+dTLB_benchmark: dTLB_benchmark.o
+
+iTLB_benchmark_function.c: generateBenchmarkFunction.sh
+	./generateBenchmarkFunction.sh > iTLB_benchmark_function.c
+
 clean:
 	rm -rf $(OUTPUTS)
diff --git a/client/site_tests/hardware_PerfCounterVerification/src/dTLB_benchmark.c b/client/site_tests/hardware_PerfCounterVerification/src/dTLB_benchmark.c
new file mode 100644
index 0000000..fc55836
--- /dev/null
+++ b/client/site_tests/hardware_PerfCounterVerification/src/dTLB_benchmark.c
@@ -0,0 +1,39 @@
+#include <stdlib.h>
+#include <unistd.h>
+
+int main(int argc, char *argv[]) {
+  unsigned long i, block_cnt = 100;
+  char** blocks;
+  long page_size;
+
+  page_size = sysconf(_SC_PAGESIZE);
+  if (page_size == -1) {
+    page_size = (1 << 12); // fall back to 4 KiB
+  }
+
+  if (argc > 1) {
+    block_cnt = strtoul(argv[1], NULL, 10);
+    if (block_cnt < 1) {
+      block_cnt = 1;
+    }
+  }
+
+  blocks = (char**) malloc(block_cnt * sizeof(char*));
+  for (i = 0; i < block_cnt; i++) {
+    char* dummy_ptr = (char*) malloc(page_size * sizeof(char)); // forcing fragmentation
+    blocks[i] = (char*) malloc(page_size * sizeof(char));
+    free(dummy_ptr);
+  }
+
+  for (i = 0; i < block_cnt; i++) {
+    char dummy_char = blocks[i][0];
+  }
+
+  for(i = 0; i < block_cnt; i++) {
+    free(blocks[i]);
+  }
+
+  free(blocks);
+
+  return 0;
+}
diff --git a/client/site_tests/hardware_PerfCounterVerification/src/generateBenchmarkFunction.sh b/client/site_tests/hardware_PerfCounterVerification/src/generateBenchmarkFunction.sh
new file mode 100755
index 0000000..0b21f6f
--- /dev/null
+++ b/client/site_tests/hardware_PerfCounterVerification/src/generateBenchmarkFunction.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# This script generates a large function intended to
+# cause as many iTLB misses as possible.
+
+# Number of instructions:
+# 4k - page size
+# x 64 - assumed number of TLB entries
+# x 2 - executing a function sized page_size * tlb_entry_count multiple
+# times would cause TLB misses only on the first call; the TLB entries
+# would stay valid for each subsequent call. Doubling the size of the
+# function guarantees invalidating TLB entries and thus causing misses.
+
+echo "void iTLB_bechmark_function() {"
+echo "  int a = 0, b = 0;"
+
+for (( c=0; c < (1 << 18) ; c++ )) ; do
+  echo "  a = b + 1;"
+  echo "  b = a + 1;"
+done
+
+echo "}"
diff --git a/client/site_tests/hardware_PerfCounterVerification/src/iTLB_benchmark.c b/client/site_tests/hardware_PerfCounterVerification/src/iTLB_benchmark.c
new file mode 100644
index 0000000..7809f44
--- /dev/null
+++ b/client/site_tests/hardware_PerfCounterVerification/src/iTLB_benchmark.c
@@ -0,0 +1,18 @@
+#include <stdlib.h>
+#include "iTLB_benchmark_function.h"
+
+int main(int argc, char *argv[]) {
+  unsigned long loops = 1000;
+  if (argc > 1) {
+    loops = strtoul(argv[1], NULL, 10);
+    if (loops < 1) {
+      loops = 1;
+    }
+  }
+
+  /* Post-decrement: run exactly |loops| iterations. Pre-decrement
+   * ran loops-1 times and ZERO times for the clamped minimum of 1. */
+  while (loops--) {
+    iTLB_bechmark_function();
+  }
+
+  return 0;
+}
diff --git a/client/site_tests/hardware_PerfCounterVerification/src/iTLB_benchmark_function.h b/client/site_tests/hardware_PerfCounterVerification/src/iTLB_benchmark_function.h
new file mode 100644
index 0000000..dd31701
--- /dev/null
+++ b/client/site_tests/hardware_PerfCounterVerification/src/iTLB_benchmark_function.h
@@ -0,0 +1 @@
+void iTLB_bechmark_function();