blob: 066b6bd560bd4026e24dbe6ead10dd77bed507a4 [file] [log] [blame]
// Copyright 2020 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "crash-reporter/crash_reporter_parser.h"
#include <utility>
#include <base/files/file_enumerator.h>
#include <base/files/file_util.h>
#include <base/rand_util.h>
#include <base/strings/strcat.h>
#include <base/strings/stringprintf.h>
#include <re2/re2.h>
#include "crash-reporter/paths.h"
#include "crash-reporter/util.h"
namespace {
// Erase all entries in times that are older than too_old.
void EraseTimesBefore(base::Time too_old, std::vector<base::Time>* times) {
auto it = times->begin();
// Assume the times are in order. This is not *strictly* true, because
// clock->Now() will in some cases return values out of order (e.g. if the
// system clock is adjusted), but we won't break anything (no crash, no
// undefined behavior) if that happens, we'll just get a slightly-incorrect
// count. So take the optimization savings of assuming the times are in order.
while (it != times->end() && *it < too_old) {
++it;
}
times->erase(times->begin(), it);
}
} // namespace
namespace anomaly {
constexpr LazyRE2 chrome_crash_called_directly = {
"Received crash notification for chrome\\[(\\d+)\\][[:alnum:] ]+"
"\\(called directly\\)"};
constexpr LazyRE2 chrome_crash_called_by_kernel = {
"Received crash notification for chrome\\[(\\d+)\\][[:alnum:], ]+"
"\\(ignoring call by kernel - chrome crash"};
constexpr char kUMACrashesFromKernel[] = "Crash.Chrome.CrashesFromKernel";
constexpr char kUMAMissedCrashes[] = "Crash.Chrome.MissedCrashes";
constexpr base::TimeDelta CrashReporterParser::kTimeout;
constexpr base::TimeDelta CrashReporterParser::kTimeoutForRecentUsage;
constexpr int CrashReporterParser::kNumLogLinesCaptured;
constexpr int CrashReporterParser::kMaxLogBytesRead;
CrashReporterParser::CrashReporterParser(
std::unique_ptr<base::Clock> clock,
std::unique_ptr<MetricsLibraryInterface> metrics_lib,
bool testonly_send_all)
: clock_(std::move(clock)),
metrics_lib_(std::move(metrics_lib)),
always_capture_logs_for_test_(testonly_send_all) {
metrics_lib_->Init();
}
CrashReport CrashReporterParser::MakeCrashReport(const UnmatchedCrash& crash) {
// Note that we don't have a good signature -- we don't have any way to
// really distinguish one missed crash from another -- so the text is just the
// log we want to send.
int recent_miss_count =
static_cast<int>(recent_unmatched_crash_times_.size());
int recent_match_count = static_cast<int>(recent_matched_crash_times_.size());
// Count pending missing. collector=CHROME will never turn into a miss, so
// don't count those.
int pending_miss_count = 0;
for (const UnmatchedCrash& unmatched : unmatched_crashes_) {
if (unmatched.collector == Collector::USER) {
++pending_miss_count;
}
}
// -1 so we don't count the current missed crash as both pending and recent.
--pending_miss_count;
return CrashReport(
base::StrCat({"===/proc/sys/fs/file-nr===\n", crash.file_nr,
"\n===/proc/meminfo===\n", crash.meminfo,
"\n===tail /var/log/messages===\n", crash.last_50_messages,
"\n===tail most recent /var/log/chrome===\n",
crash.last_50_chrome_current,
"\n===tail previous /var/log/chrome===\n",
crash.last_50_chrome_previous}),
{"--missed_chrome_crash", base::StringPrintf("--pid=%d", crash.pid),
base::StringPrintf("--recent_miss_count=%d", recent_miss_count),
base::StringPrintf("--recent_match_count=%d", recent_match_count),
base::StringPrintf("--pending_miss_count=%d", pending_miss_count)});
}
// static
std::string CrashReporterParser::GetLast50Lines(
const base::FilePath& file_path) {
base::File file(file_path, base::File::FLAG_OPEN | base::File::FLAG_READ);
if (!file.IsValid()) {
LOG(WARNING) << "Could not open " << file_path.value();
return "Could not open";
}
char buffer[kMaxLogBytesRead];
int64_t length = file.GetLength();
if (length < 0) {
LOG(WARNING) << "Error getting length of " << file_path.value();
return "Error getting length\n";
}
int read;
if (length > kMaxLogBytesRead) {
read = file.Read(length - kMaxLogBytesRead, buffer, kMaxLogBytesRead);
} else {
read = file.Read(0, buffer, kMaxLogBytesRead);
}
if (read < 0) {
LOG(WARNING) << "Error reading " << file_path.value();
return "Error during read\n";
}
if (read == 0) {
return "<empty>\n";
}
int newlines = 0;
int pos = read - 1;
for (; pos >= 0; --pos) {
if (buffer[pos] == '\n') {
++newlines;
// Find the \n that is just before the 50th line.
if (newlines > kNumLogLinesCaptured) {
break;
}
}
}
// pos is either -1 or on the \n before the 50th line. Either way, we're
// 1 before where we want to be.
++pos;
return std::string(buffer + pos, read - pos);
}
// static
void CrashReporterParser::GetChromeLogs(UnmatchedCrash* crash) {
const base::FilePath system_log_directory =
paths::Get(paths::kSystemChromeLogDirectory);
// Find the two most recent Chrome logs rather than relying on the chrome &
// chrome.PREVIOUS symlinks. Since Chrome may be crashing and restarting, we
// risk races around the symlinks being created (for instance, if chrome gets
// moved to chrome.PREVIOUS while are reading it, we might up reading the
// same file twice).
base::FileEnumerator file_enumerator(system_log_directory,
false /*recursive */,
base::FileEnumerator::FILES, "chrome_*");
base::FileEnumerator::FileInfo most_recent;
base::FileEnumerator::FileInfo next_most_recent;
while (!file_enumerator.Next().empty()) {
auto info = file_enumerator.GetInfo();
if (info.GetLastModifiedTime() > most_recent.GetLastModifiedTime()) {
next_most_recent = std::move(most_recent);
most_recent = std::move(info);
} else if (info.GetLastModifiedTime() >
next_most_recent.GetLastModifiedTime()) {
next_most_recent = std::move(info);
}
}
if (!most_recent.GetName().empty()) {
crash->last_50_chrome_current =
GetLast50Lines(system_log_directory.Append(most_recent.GetName()));
} else {
crash->last_50_chrome_current = "<no chrome log found>";
}
if (!next_most_recent.GetName().empty()) {
crash->last_50_chrome_previous =
GetLast50Lines(system_log_directory.Append(next_most_recent.GetName()));
} else {
crash->last_50_chrome_previous = "<no chrome-previous log found>";
}
// We'd like to capture the most recent user Chrome logs here as well. (The
// /home/chronos/u-*/log/chrome_* logs). However, this runs into a lot of
// privacy issues. There can be 30-40 second delay between the time we gather
// the logs and the time we write the crash report. During this time, the
// user may have logged out, which means we're writing the crash report
// outside of the user's cryptohome. We can't write user logs to locations
// outside of their cryptohome. Solving this involves recording the location
// we got the user logs from, passing that to crash_reporter, and then doing
// various text manipulations to strip the user logs out if we're writing to
// a location not inside that cryptohome. It's possible but it's complex and
// messy and embeds some assumptions about the exact directory format being
// used by the cryptohome system. For now we're skipping the complexity and
// hoping the /var/log/chrome logs will have enough info to diagnose the
// problem. (Crashpad always writes to /var/log/chrome even when someone is
// logged in, so there's a good chance the info we want will be there.)
//
// Note -- if you want to change this, you also need to update the call to
// GetCreatedCrashDirectoryByEuid in MissedCrashCollector::Collect to use a
// different UID.
}
// static
void CrashReporterParser::CaptureLogs(UnmatchedCrash* crash) {
crash->logs_captured = true;
if (!base::ReadFileToString(paths::Get(paths::kProcFileNr),
&crash->file_nr)) {
crash->file_nr = "<read failed>";
// Keep going in the face of errors. One theory as to why we see missed
// crashes is that we are resource exhausted (in particular, file-descriptor
// exhausted). If we give up because of errors, and the file-descriptor
// exhaustion makes it hard to read the proc files, then we won't see
// information telling us about the file-descriptor exhaustion.
}
if (!base::ReadFileToString(paths::Get(paths::kProcMeminfo),
&crash->meminfo)) {
crash->meminfo = "<read failed>";
}
crash->last_50_messages = GetLast50Lines(paths::Get(paths::kMessageLogPath));
GetChromeLogs(crash);
// TODO(b/160903152): We should also run /bin/ps and attach its output to
// the logs. This would let us check that crashpad_handler is still running.
// Similarly, we should capture the last 50 lines from dmesg, since failed fd
// allocations are logged there.
}
bool CrashReporterParser::ShouldCaptureLogs(const UnmatchedCrash& crash) {
if (crash.collector != Collector::USER) {
return false;
}
if (always_capture_logs_for_test_) {
return true;
}
// Capture logs 1-in-1000. 413 is an arbitrary number chosen to celebrate an
// important date.
return base::RandGenerator(1000) == 413;
}
MaybeCrashReport CrashReporterParser::ParseLogEntry(const std::string& line) {
int pid = 0;
UnmatchedCrash crash;
if (RE2::PartialMatch(line, *chrome_crash_called_directly, &pid)) {
crash.pid = pid;
crash.collector = Collector::CHROME;
crash.timestamp = clock_->Now();
} else if (RE2::PartialMatch(line, *chrome_crash_called_by_kernel, &pid)) {
crash.pid = pid;
crash.collector = Collector::USER;
crash.timestamp = clock_->Now();
} else {
return base::nullopt;
}
// Find the matching entry in our unmatched_crashes_ vector. We expect each
// real chrome crash to reported twice, with the same PID -- once with "called
// directly" and once with "ignoring call by kernel".
for (auto it = unmatched_crashes_.begin(); it != unmatched_crashes_.end();
++it) {
if (it->pid == crash.pid && it->collector != crash.collector) {
// Found the corresponding message from the other collector. Throw away
// both.
unmatched_crashes_.erase(it);
// One of the two was a crash from kernel, so record that we got a crash
// from kernel. (We only send the events when we match or don't match;
// this avoids having our data polluted by events just before a shutdown.)
if (!metrics_lib_->SendCrosEventToUMA(kUMACrashesFromKernel)) {
LOG(WARNING) << "Could not mark Chrome crash as correctly processed";
}
recent_matched_crash_times_.push_back(clock_->Now());
return base::nullopt;
}
}
if (ShouldCaptureLogs(crash)) {
CaptureLogs(&crash);
}
unmatched_crashes_.push_back(crash);
return base::nullopt;
}
MaybeCrashReport CrashReporterParser::PeriodicUpdate() {
base::Time now = clock_->Now();
base::Time too_old_for_recent_usage = now - kTimeoutForRecentUsage;
// Remove the record of recent reports first, so that any missed crash reports
// have the proper nearby misses & hits count.
EraseTimesBefore(too_old_for_recent_usage, &recent_unmatched_crash_times_);
EraseTimesBefore(too_old_for_recent_usage, &recent_matched_crash_times_);
base::Time too_old = now - kTimeout;
auto it = unmatched_crashes_.begin();
MaybeCrashReport return_value;
while (it != unmatched_crashes_.end()) {
if (it->timestamp < too_old) {
if (it->collector == Collector::USER) {
if (!metrics_lib_->SendCrosEventToUMA(kUMACrashesFromKernel) ||
!metrics_lib_->SendCrosEventToUMA(kUMAMissedCrashes)) {
LOG(WARNING) << "Could not mark Chrome crash as missed";
}
recent_unmatched_crash_times_.push_back(it->timestamp);
}
if (it->logs_captured) {
// In principle, we could have two log captures at about the same
// moment and this will lose one of these. (If we have two missed
// crashes within 10 seconds of each other.) In practice, this should
// happen rarely enough that it won't distort our overall numbers and
// the two log captures will have pretty much the same info anyways.
return_value = MakeCrashReport(*it);
}
it = unmatched_crashes_.erase(it);
} else {
++it;
}
}
return return_value;
}
} // namespace anomaly