ml_benchmark: add memory sampler to estimate peak rss+swap usage

BUG=b:174827149
TEST=FEATURES=test emerge-betty ml-benchmark
     cros deploy <dut> ml-benchmark
     (DUT) $ ml_benchmark --config_file_name=soda-scenario-1.config

Change-Id: I80e854e8217fed4ad9bb610d50621db57a04edb3
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform2/+/2582021
Tested-by: Jim Pollock <jmpollock@chromium.org>
Reviewed-by: Michael Pishchagin <mblsha@google.com>
Commit-Queue: Jim Pollock <jmpollock@chromium.org>
Auto-Submit: Jim Pollock <jmpollock@chromium.org>
diff --git a/ml_benchmark/BUILD.gn b/ml_benchmark/BUILD.gn
index cb4367f..10a0a9e 100644
--- a/ml_benchmark/BUILD.gn
+++ b/ml_benchmark/BUILD.gn
@@ -38,6 +38,8 @@
     "benchmark_functions.h",
     "json_serializer.cc",
     "json_serializer.h",
+    "memory_sampler.cc",
+    "memory_sampler.h",
     "shared_library_benchmark.cc",
     "shared_library_benchmark.h",
     "shared_library_benchmark_functions.cc",
@@ -64,6 +66,7 @@
   executable("ml_benchmark_test") {
     sources = [
       "json_serializer_test.cc",
+      "memory_sampler_test.cc",
       "shared_library_benchmark_test.cc",
       "sysmetrics_test.cc",
     ]
diff --git a/ml_benchmark/main.cc b/ml_benchmark/main.cc
index a743e84..c1d64a5 100644
--- a/ml_benchmark/main.cc
+++ b/ml_benchmark/main.cc
@@ -6,12 +6,14 @@
 #include <base/files/file_util.h>
 #include <base/json/json_writer.h>
 #include <base/logging.h>
+#include <base/task/thread_pool/thread_pool_instance.h>
 #include <base/values.h>
 #include <brillo/flag_helper.h>
 
 #include <string>
 
 #include "ml_benchmark/json_serializer.h"
+#include "ml_benchmark/memory_sampler.h"
 #include "ml_benchmark/shared_library_benchmark.h"
 #include "ml_benchmark/shared_library_benchmark_functions.h"
 #include "ml_benchmark/sysmetrics.h"
@@ -20,27 +22,21 @@
 using chrome::ml_benchmark::BenchmarkResults;
 using chrome::ml_benchmark::CrOSBenchmarkConfig;
 using chrome::ml_benchmark::Metric;
+using ml_benchmark::PeakMemorySampler;
 using ml_benchmark::SharedLibraryBenchmark;
 using ml_benchmark::SharedLibraryBenchmarkFunctions;
 
 namespace {
 
-void AddMemoryMetrics(const int64_t initial_memsize,
-                      const int64_t final_peaksize,
-                      BenchmarkResults* results) {
-  auto& initial_mem = *results->add_metrics();
-  initial_mem.set_name("initial_vmsize");
-  initial_mem.set_units(Metric::BYTES);
-  initial_mem.set_direction(Metric::SMALLER_IS_BETTER);
-  initial_mem.set_cardinality(Metric::SINGLE);
-  initial_mem.add_values(initial_memsize);
-
-  auto& final_mem = *results->add_metrics();
-  final_mem.set_name("final_vmpeak");
-  final_mem.set_units(Metric::BYTES);
-  final_mem.set_direction(Metric::SMALLER_IS_BETTER);
-  final_mem.set_cardinality(Metric::SINGLE);
-  final_mem.add_values(final_peaksize);
+void AddMemoryMetric(const std::string& metric_name,
+                     const int64_t value,
+                     BenchmarkResults* results) {
+  auto* const entry = results->add_metrics();
+  entry->set_cardinality(Metric::SINGLE);  // exactly one value, added below
+  entry->set_direction(Metric::SMALLER_IS_BETTER);
+  entry->set_units(Metric::BYTES);
+  entry->set_name(metric_name);
+  entry->add_values(value);
 }
 
 void PrintMetrics(const BenchmarkResults& results) {
@@ -73,6 +69,10 @@
   }
 
   const int64_t initial_memsize = ml_benchmark::GetVMSizeBytes();
+  const int64_t initial_rss_swap = ml_benchmark::GetSwapAndRSSBytes();
+
+  scoped_refptr<PeakMemorySampler> mem_sampler = new PeakMemorySampler();
+  PeakMemorySampler::StartSampling(mem_sampler);
 
   LOG(INFO) << "Starting the " << driver_name << " benchmark";
   SharedLibraryBenchmark benchmark(std::move(functions));
@@ -83,11 +83,18 @@
     return;
   }
 
+  PeakMemorySampler::StopSampling(mem_sampler);
+
   if (results.status() == chrome::ml_benchmark::OK) {
     LOG(INFO) << driver_name << " finished";
 
-    const int64_t final_peaksize = ml_benchmark::GetVMPeakBytes();
-    AddMemoryMetrics(initial_memsize, final_peaksize, &results);
+    const int64_t final_vmpeaksize = ml_benchmark::GetVMPeakBytes();
+    const int64_t peak_rss_swap = mem_sampler->GetMaxSample();
+
+    AddMemoryMetric("initial_vmsize", initial_memsize, &results);
+    AddMemoryMetric("final_vmpeak", final_vmpeaksize, &results);
+    AddMemoryMetric("initial_rss_swap", initial_rss_swap, &results);
+    AddMemoryMetric("peak_rss_swap", peak_rss_swap, &results);
 
     PrintMetrics(results);
 
@@ -135,8 +142,9 @@
     }
   }
 
-  base::FilePath driver_library(FLAGS_driver_library_path);
+  base::ThreadPoolInstance::CreateAndStartWithDefaultParams("ml_benchmark");
 
+  base::FilePath driver_library(FLAGS_driver_library_path);
   BenchmarkAndReportResults(FLAGS_driver_library_path, driver_library,
                             benchmark_config, output_file_path);
 
diff --git a/ml_benchmark/memory_sampler.cc b/ml_benchmark/memory_sampler.cc
new file mode 100644
index 0000000..71c4936
--- /dev/null
+++ b/ml_benchmark/memory_sampler.cc
@@ -0,0 +1,83 @@
+// Copyright 2020 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "ml_benchmark/memory_sampler.h"
+
+#include <base/bind.h>
+#include <base/location.h>
+#include <base/task/task_traits.h>
+#include <base/task/thread_pool.h>
+
+#include <algorithm>
+
+#include "ml_benchmark/sysmetrics.h"
+
+namespace ml_benchmark {
+
+// Creates the sequenced thread-pool task runner that delayed samples are
+// posted to. No sample is taken until StartSampling() is called.
+PeakMemorySampler::PeakMemorySampler() : from_here_(FROM_HERE) {
+  task_runner_ = base::ThreadPool::CreateSequencedTaskRunner(
+      {base::MayBlock(), base::TaskPriority::BEST_EFFORT,
+       base::TaskShutdownBehavior::SKIP_ON_SHUTDOWN});
+}
+
+PeakMemorySampler::~PeakMemorySampler() {
+  SetRunning(false);
+}
+
+// Updates |running_| while holding |lock_|.
+void PeakMemorySampler::SetRunning(bool is_running) {
+  base::AutoLock auto_lock(lock_);
+  running_ = is_running;
+}
+
+// Takes one sample on the calling thread and schedules the periodic ones.
+// Does nothing if |sampler| is already running.
+//
+// NOTE: restarting before a task posted by an earlier run has executed lets
+// that older task resume re-posting alongside the new chain.
+void PeakMemorySampler::StartSampling(
+    scoped_refptr<PeakMemorySampler> sampler) {
+  {
+    base::AutoLock auto_lock(sampler->lock_);
+    // SampleMemory() re-posts itself for as long as |running_| is set, so a
+    // second start would leave two task chains sampling in parallel.
+    if (sampler->running_)
+      return;
+    sampler->running_ = true;
+  }
+  SampleMemory(sampler);
+}
+
+// Clears |running_|. A task that is already posted returns without sampling
+// or re-posting when it eventually runs.
+void PeakMemorySampler::StopSampling(scoped_refptr<PeakMemorySampler> sampler) {
+  sampler->SetRunning(false);
+}
+
+// Returns the largest value sampled so far.
+int64_t PeakMemorySampler::GetMaxSample() {
+  // Writing to max_sample_ is protected by this lock as well.
+  base::AutoLock auto_lock(lock_);
+  return max_sample_;
+}
+
+// Records one sample and, while sampling is enabled, posts itself again
+// after |sampling_interval_|. The scoped_refptr bound into the task keeps
+// |sampler| referenced while that task is pending.
+void PeakMemorySampler::SampleMemory(scoped_refptr<PeakMemorySampler> sampler) {
+  base::AutoLock auto_lock(sampler->lock_);
+  if (!sampler->running_)
+    return;
+
+  sampler->sample_counter_++;
+  sampler->max_sample_ = std::max(sampler->max_sample_, GetSwapAndRSSBytes());
+  sampler->task_runner_->PostDelayedTask(
+      sampler->from_here_,
+      base::BindOnce(&PeakMemorySampler::SampleMemory, sampler),
+      sampler->sampling_interval_);
+}
+
+}  // namespace ml_benchmark
diff --git a/ml_benchmark/memory_sampler.h b/ml_benchmark/memory_sampler.h
new file mode 100644
index 0000000..2ad296b
--- /dev/null
+++ b/ml_benchmark/memory_sampler.h
@@ -0,0 +1,69 @@
+// Copyright 2020 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef ML_BENCHMARK_MEMORY_SAMPLER_H_
+#define ML_BENCHMARK_MEMORY_SAMPLER_H_
+
+#include <stdint.h>
+
+#include <base/location.h>
+#include <base/memory/ref_counted.h>
+#include <base/synchronization/lock.h>
+#include <base/task/thread_pool.h>
+#include <base/time/time.h>
+#include <gtest/gtest_prod.h>  // for FRIEND_TEST
+
+namespace ml_benchmark {
+
+// Estimates peak memory use by sampling GetSwapAndRSSBytes() at a fixed
+// interval and remembering the largest value seen.
+//
+// The constructor creates a thread-pool task runner, so it is meant to be
+// constructed once a thread pool exists (main.cc starts a
+// base::ThreadPoolInstance; the unit tests use base::test::TaskEnvironment).
+//
+// Instances are ref-counted and every pending sampling task holds a
+// reference, so dropping the caller's reference while a task is pending is
+// safe.
+class PeakMemorySampler : public base::RefCountedThreadSafe<PeakMemorySampler> {
+ public:
+  PeakMemorySampler();
+
+  // Takes a sample immediately and then keeps sampling once per interval on
+  // the thread pool until StopSampling() is called.
+  static void StartSampling(scoped_refptr<PeakMemorySampler> sampler);
+
+  // Stops the periodic sampling. The maximum recorded so far is kept.
+  static void StopSampling(scoped_refptr<PeakMemorySampler> sampler);
+
+  // Returns the largest sample recorded so far, or 0 if none was taken.
+  int64_t GetMaxSample();
+
+ protected:
+  ~PeakMemorySampler();
+  friend class base::RefCountedThreadSafe<PeakMemorySampler>;
+
+ private:
+  // Records one sample and re-posts itself while |running_| is true.
+  static void SampleMemory(scoped_refptr<PeakMemorySampler> sampler);
+  void SetRunning(bool is_running);
+
+  // Location reported for the posted sampling tasks.
+  base::Location from_here_;
+  // Sampling tasks only sample and re-post while this is true.
+  bool running_ = false;
+  // Held whenever |running_|, |max_sample_| or |sample_counter_| is written.
+  base::Lock lock_;
+  base::TimeDelta sampling_interval_ = base::TimeDelta::FromSeconds(1);
+  int64_t max_sample_ = 0;
+  scoped_refptr<base::SequencedTaskRunner> task_runner_;
+
+  // For testing purposes
+  int sample_counter_ = 0;
+  FRIEND_TEST(PeakMemorySamplerTest, BasicFunctions);
+};
+
+}  // namespace ml_benchmark
+
+#endif  // ML_BENCHMARK_MEMORY_SAMPLER_H_
diff --git a/ml_benchmark/memory_sampler_test.cc b/ml_benchmark/memory_sampler_test.cc
new file mode 100644
index 0000000..2a0a83b
--- /dev/null
+++ b/ml_benchmark/memory_sampler_test.cc
@@ -0,0 +1,101 @@
+// Copyright 2020 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "ml_benchmark/memory_sampler.h"
+
+#include <string.h>
+
+#include <base/test/task_environment.h>
+#include <gtest/gtest.h>
+
+#include <memory>
+
+namespace ml_benchmark {
+
+class PeakMemorySamplerTest : public ::testing::Test {
+ public:
+  PeakMemorySamplerTest() = default;
+
+ protected:
+  // Mock time lets the tests step through sampling intervals without waiting.
+  base::test::TaskEnvironment task_environment_{
+      base::test::TaskEnvironment::TimeSource::MOCK_TIME};
+  // Same value as PeakMemorySampler's sampling interval.
+  base::TimeDelta sampling_interval_ = base::TimeDelta::FromSeconds(1);
+};
+
+TEST_F(PeakMemorySamplerTest, BasicFunctions) {
+  scoped_refptr<PeakMemorySampler> sampler = new PeakMemorySampler();
+
+  // No samples should mean zero
+  EXPECT_EQ(sampler->GetMaxSample(), 0);
+  EXPECT_EQ(sampler->sample_counter_, 0);
+
+  PeakMemorySampler::StartSampling(sampler);
+  task_environment_.FastForwardBy(sampling_interval_);
+  const int64_t initial_peak = sampler->GetMaxSample();
+  // StartSampling causes a sample, plus the interval is two samples.
+  EXPECT_EQ(sampler->sample_counter_, 2);
+
+  // Sample a few more times, make sure it hasn't changed
+  task_environment_.FastForwardBy(sampling_interval_ * 2);
+  EXPECT_EQ(initial_peak, sampler->GetMaxSample());
+
+  // Allocate 10MB
+  constexpr size_t kTenMbBytes = 1024 * 1024 * 10;
+  std::unique_ptr<char[]> allocation(new char[kTenMbBytes]);
+  // Zero it out and read so the compiler doesn't optimize the variable away.
+  memset(allocation.get(), 0, kTenMbBytes);
+  EXPECT_EQ(allocation[kTenMbBytes - 1], 0);
+
+  task_environment_.FastForwardBy(sampling_interval_);
+  const int64_t higher_peak = sampler->GetMaxSample();
+  EXPECT_GT(higher_peak, initial_peak);
+
+  // Free the memory and make sure the peak doesn't drop
+  allocation.reset();
+  task_environment_.FastForwardBy(sampling_interval_);
+  EXPECT_EQ(higher_peak, sampler->GetMaxSample());
+
+  // Stop sampling, allocate a bunch more memory
+  PeakMemorySampler::StopSampling(sampler);
+  EXPECT_EQ(sampler->sample_counter_, 6);
+
+  // Allocate 20MB
+  constexpr size_t kTwentyMbBytes = 1024 * 1024 * 20;
+  allocation.reset(new char[kTwentyMbBytes]);
+  // Zero it out and read so the compiler doesn't optimize the variable away.
+  memset(allocation.get(), 0, kTwentyMbBytes);
+  EXPECT_EQ(allocation[kTwentyMbBytes - 1], 0);
+
+  // We're not sampling so the peak should stay the same.
+  task_environment_.FastForwardBy(sampling_interval_ * 2);
+  EXPECT_EQ(higher_peak, sampler->GetMaxSample());
+  EXPECT_EQ(sampler->sample_counter_, 6);
+
+  // Start sampling again and check it grows
+  PeakMemorySampler::StartSampling(sampler);
+  task_environment_.FastForwardBy(sampling_interval_);
+  EXPECT_GT(sampler->GetMaxSample(), higher_peak);
+
+  // StartSampling causes a sample, plus the interval is two samples.
+  EXPECT_EQ(sampler->sample_counter_, 8);
+}
+
+TEST_F(PeakMemorySamplerTest, LifeCycle) {
+  scoped_refptr<PeakMemorySampler> sampler = new PeakMemorySampler();
+
+  PeakMemorySampler::StartSampling(sampler);
+  task_environment_.FastForwardBy(sampling_interval_);
+  EXPECT_GT(sampler->GetMaxSample(), 0);
+
+  // At this point another task has been scheduled in t+1, so drop the
+  // test's reference and move forward in time. The pending task holds its
+  // own reference; we expect this to 'just work' and not crash due to some
+  // dangling pointer.
+  sampler.reset();
+  task_environment_.FastForwardBy(sampling_interval_ * 2);
+}
+
+}  // namespace ml_benchmark