blob: 2d258e49d4b849668a82725cd11df436047f84b4 [file] [log] [blame]
// Copyright 2020 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CRASH_REPORTER_CRASH_REPORTER_PARSER_H_
#define CRASH_REPORTER_CRASH_REPORTER_PARSER_H_
#include <memory>
#include <string>
#include <vector>

#include <base/files/file_path.h>
#include <base/time/clock.h>
#include <base/time/time.h>
#include <metrics/metrics_library.h>

#include "crash-reporter/anomaly_detector.h"
namespace anomaly {
// Anomaly_detector's collector for syslog entries from our own crash_reporter.
// Unlike other anomaly_detector collectors, this doesn't usually create
// crash reports -- ParseLogEntry always returns nullopt. Instead, it primarily
// produces UMA metrics that track how well Chrome's crash handlers (breakpad or
// crashpad) are working. If Chrome gets a segfault or such, its internal crash
// handler should invoke crash_reporter directly. Once the internal crash
// handler is done, the kernel should also invoke crash_reporter via the normal
// core pattern file. Both of these produce distinct log entries. By matching
// these up, we can detect how often the internal crash handler is failing to
// invoke crash_reporter. In particular, if we see an invoked-by-kernel message
// without a corresponding invoking-directly message, Chrome's crash handler
// failed. We record the number of unmatched invoked-by-kernel messages, and,
// for a denominator, we record the total number of invoked-by-kernel messages.
//
// (There are some cases -- "dump without crashing" -- in which Chrome will
// invoke crash_reporter but will not actually crash, and so will not produce
// an invoked-by-kernel message. This is why we go to the trouble of actually
// matching up messages from the log, instead of just counting the number of
// invoked-directly and invoked-by-kernel events. The "dump without crashing"
// events will overcount the number of successes and hide the true number of
// failures. Therefore, we ignore "dump without crashing" crashes by not
// counting the number of invoked-by-Chrome messages we see, and not reporting
// the number of unmatched invoked-by-Chrome messages.)
class CrashReporterParser : public Parser {
 public:
  // We hold on to unmatched messages for at least this long before reporting
  // them as unmatched.
  static constexpr base::TimeDelta kTimeout = base::TimeDelta::FromSeconds(30);

  // We hold on to records of the # of matched and unmatched messages for this
  // long before discarding them. This is longer than kTimeout because we want
  // to know the number of crashes the crash system was trying to handle all
  // around the missed crash, not just after it.
  static constexpr base::TimeDelta kTimeoutForRecentUsage =
      base::TimeDelta::FromSeconds(60);

  // Constants around log capture. Exposed here just for unit testing.

  // Number of lines of the various logs captured.
  static constexpr int kNumLogLinesCaptured = 50;

  // We only capture this much from the end of the file. This is usually
  // enough to get 50 lines of text. It's possible that if some lines are
  // humongous, we'll get less than 50 lines, but that's very rare and if it
  // happens, we don't lose much -- 50 lines is a bit arbitrary anyway.
  static constexpr int kMaxLogBytesRead = kNumLogLinesCaptured * 400;

  // Takes ownership of |clock| and |metrics_lib|.
  // NOTE(review): |testonly_send_all| presumably initializes
  // |always_capture_logs_for_test_| (the definition is not in this header);
  // tests would pass true to bypass the 1-in-1000 log-capture sampling --
  // confirm against the .cc file.
  explicit CrashReporterParser(
      std::unique_ptr<base::Clock> clock,
      std::unique_ptr<MetricsLibraryInterface> metrics_lib,
      bool testonly_send_all);

  // Parser overrides. Per the class comment, ParseLogEntry itself never
  // produces a report; per the UnmatchedCrash comment below, PeriodicUpdate is
  // where a crash gets marked as a missed collection.
  MaybeCrashReport ParseLogEntry(const std::string& line) override;
  MaybeCrashReport PeriodicUpdate() override;

 private:
  enum class Collector {
    // Log entry was from ChromeCollector.
    CHROME,
    // Log entry was from UserCollector.
    USER
  };

  struct UnmatchedCrash {
    // |pid| and |collector| carry default member initializers so that a
    // default-constructed UnmatchedCrash never holds indeterminate values.
    // CHROME is the inert default: the log-capture fields below only apply
    // when collector == USER.
    int pid = 0;
    base::Time timestamp;
    Collector collector = Collector::CHROME;

    // Log captures. We are seeing some boards with a high crash miss rate
    // (that is, crash_reporter isn't getting called for many Chrome crashes.)
    // To investigate further, we want to grab some logs when we get a Chrome
    // crash miss. We can't do this in the normal way (in
    // CrashCollector::GetLogContents) because we don't know this is a miss for
    // 30 seconds, and we want to grab the logs at the time of the miss. So we
    // grab the logs when we first see the UserCollector log entry, and then
    // only use them if PeriodicUpdate marks this a missed collection. To avoid
    // generating too many crash uploads, we also only do this one time in
    // 1000. So these fields are only filled in 1-in-1000 times and only if
    // collector == USER.

    // If false, the entries below this were not filled in.
    bool logs_captured = false;
    // Contents of /proc/sys/fs/file-nr, which lists the # of allocated file
    // handles, the number of allocated-but-unused handles, and the maximum
    // number of file handles.
    std::string file_nr;
    // Contents of /proc/meminfo.
    std::string meminfo;
    // Last 50 lines of /var/log/messages.
    std::string last_50_messages;
    // Last 50 lines of most recent /var/log/chrome_* log.
    std::string last_50_chrome_current;
    // Last 50 lines of second most recent /var/log/chrome_* log.
    std::string last_50_chrome_previous;
  };

  // Returns the last 50 lines of the file. (Or the entire file, if less than
  // 50 lines.) Not in util.cc because this function is a bit opinionated on
  // handling error messages and how important it is to get 50 lines in all
  // possible scenarios. (Specifically -- it's willing to get less than 50 lines
  // in some cases to avoid complexity, and it returns a string indicating the
  // error in place of the file contents if there is an error reading the file.)
  static std::string GetLast50Lines(const base::FilePath& file_path);

  // Take an UnmatchedCrash that has |logs_captured| of true, and turn it into
  // a CrashReport that anomaly_detector_main.cc can send to crash_reporter.
  CrashReport MakeCrashReport(const UnmatchedCrash& crash);

  // Capture the Chrome logs (|last_50_chrome_current| and
  // |last_50_chrome_previous|) in |crash|.
  static void GetChromeLogs(UnmatchedCrash* crash);

  // Capture logs (such as |last_50_messages|) in |crash|.
  static void CaptureLogs(UnmatchedCrash* crash);

  // Returns true if we should capture logs for this crash report. Outside of
  // tests, we only capture logs for .1% of Collector::USER unmatched crashes.
  bool ShouldCaptureLogs(const UnmatchedCrash& crash);

  // Time source, injected through the constructor.
  std::unique_ptr<base::Clock> clock_;
  // Sink for the UMA metrics described in the class comment.
  std::unique_ptr<MetricsLibraryInterface> metrics_lib_;
  // Log entries seen so far that have not (yet) been paired with their
  // counterpart; see kTimeout.
  std::vector<UnmatchedCrash> unmatched_crashes_;
  // Timestamps of recently seen unmatched / matched crashes.
  // NOTE(review): presumably pruned after kTimeoutForRecentUsage -- confirm
  // against the .cc file.
  std::vector<base::Time> recent_unmatched_crash_times_;
  std::vector<base::Time> recent_matched_crash_times_;
  // Test-only override of the log-capture sampling; see ShouldCaptureLogs.
  const bool always_capture_logs_for_test_;
};
} // namespace anomaly
#endif // CRASH_REPORTER_CRASH_REPORTER_PARSER_H_