compiler_wrapper: add autocrash logic

As outlined in the attached bug, we want to be able to crash the
compiler based on a handful of heuristics. Crashing Clang helps get us
self-contained reproducers fairly easily.

This CL provides (off-by-default) functionality to do the above. The
expectation is that a SWE will hack at it to make it work as they need
to.

BUG=b:236736327
TEST=Installed the new wrapper; observed autocrashes.

Change-Id: I76ec753ec37baa5e9b6dab92668081fa7c605725
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/toolchain-utils/+/3714885
Reviewed-by: Manoj Gupta <manojgupta@chromium.org>
Reviewed-by: Jordan Abrahams-Whitehead <ajordanr@google.com>
Commit-Queue: George Burgess <gbiv@chromium.org>
Tested-by: George Burgess <gbiv@chromium.org>
diff --git a/compiler_wrapper/README.md b/compiler_wrapper/README.md
index 0228e27..bb63798 100644
--- a/compiler_wrapper/README.md
+++ b/compiler_wrapper/README.md
@@ -73,3 +73,31 @@
   `/usr/bin/clang_host_wrapper`
 - Gcc host wrapper:
   `/usr/x86_64-pc-linux-gnu/gcc-bin/10.2.0/host_wrapper`
+
+## Using the compiler wrapper to crash arbitrary compilations
+
+When Clang crashes, its output can be extremely useful. Often, it will provide
+the user with a stack trace, and messages like:
+
+```
+clang-15: unable to execute command: Illegal instruction
+clang-15: note: diagnostic msg: /tmp/clang_crash_diagnostics/foo-5420d2.c
+clang-15: note: diagnostic msg: /tmp/clang_crash_diagnostics/foo-5420d2.sh
+```
+
+Where the artifacts at `/tmp/clang_crash_diagnostics/foo-*` are a full,
+self-contained reproducer of the inputs that caused the crash in question.
+Often, such a reproducer is very valuable to have even for cases where a crash
+_doesn't_ happen (e.g., maybe Clang is now emitting an error where it used to
+not do so, and we want to bisect upstream LLVM with that info). Normally,
+collecting and crafting such a reproducer is a multi-step process, and can be
+error-prone; compile commands may rely on env vars, they may be done within
+`chroot`s, they may rely on being executed in a particular directory, they may
+rely on intermediate state, etc.
+
+Because these crash reports are so useful, our wrapper supports crashing Clang
+even on files that ordinarily don't cause it to crash. For various reasons
+(b/236736327), this support is off by default, and enabling it requires editing
+`crash_builds.go` (see the checklist in that file), rebuilding the wrapper, and
+redeploying it. That said, this can be a valuable tool for devs who want a
+self-contained reproducer without manually recreating a build's environment.
diff --git a/compiler_wrapper/ccache_flag.go b/compiler_wrapper/ccache_flag.go
index 7d19da8..2c966fd 100644
--- a/compiler_wrapper/ccache_flag.go
+++ b/compiler_wrapper/ccache_flag.go
@@ -4,6 +4,11 @@
 
 package main
 
+func isInConfigureStage(env env) bool {
+	phase, ok := env.getenv("EBUILD_PHASE")
+	return ok && phase == "configure"
+}
+
 func processCCacheFlag(builder *commandBuilder) {
 	// We should be able to share the objects across compilers as
 	// the pre-processed output will differ.  This allows boards
@@ -22,7 +27,7 @@
 	// Disable ccache during portage's src_configure phase. Using ccache here is generally a
 	// waste of time, since these files are very small. Experimentally, this speeds up
 	// configuring by ~13%.
-	if val, present := builder.env.getenv("EBUILD_PHASE"); present && val == "configure" {
+	if isInConfigureStage(builder.env) {
 		useCCache = false
 	}
 
diff --git a/compiler_wrapper/compiler_wrapper.go b/compiler_wrapper/compiler_wrapper.go
index 1fe3eb7..28d2247 100644
--- a/compiler_wrapper/compiler_wrapper.go
+++ b/compiler_wrapper/compiler_wrapper.go
@@ -201,6 +201,12 @@
 		}
 	}
 
+	// If builds matching some heuristic should crash, crash them. Since this is purely a
+	// debugging tool, don't offer any nice features with it (e.g., rusage, ...).
+	if shouldUseCrashBuildsHeuristic && mainBuilder.target.compilerType == clangType {
+		return buildWithAutocrash(env, cfg, compilerCmd)
+	}
+
 	bisectStage := getBisectStage(env)
 
 	if rusageEnabled {
diff --git a/compiler_wrapper/crash_builds.go b/compiler_wrapper/crash_builds.go
new file mode 100644
index 0000000..147fb36
--- /dev/null
+++ b/compiler_wrapper/crash_builds.go
@@ -0,0 +1,154 @@
+// Copyright 2022 The ChromiumOS Authors.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"regexp"
+)
+
+// ** HEY YOU, PERSON READING THIS! **
+//
+// Are you a dev who wants to make this work locally? Awesome! Please note that this **only** works
+// for Clang. If that's OK, here's a checklist for you:
+// [ ] Set `shouldUseCrashBuildsHeuristic = true` below.
+// [ ] If you want this heuristic to operate during `src_configure` (rare), also set
+// `allowAutoCrashInConfigure` to true.
+// [ ] Modify `shouldAutocrashPostExec` to return `true` when the compiler's output/flags match what
+// you want to crash on, and `false` otherwise.
+// [ ] Run `./install_compiler_wrapper.sh` to install the updated wrapper.
+// [ ] Run whatever command reproduces the error.
+//
+// If you need to make changes to your heuristic, repeat the above steps starting at
+// `./install_compiler_wrapper.sh` until things seem to do what you want.
+const (
+	// Set this to true to enable the autocrash logic, which only ever applies to Clang.
+	shouldUseCrashBuildsHeuristic = false
+	// Set this to true to let `shouldAutocrashPostExec` also check+crash `src_configure` steps.
+	allowAutoCrashInConfigure = false
+)
+
+// shouldAutocrashPostExec reports whether the compiler should be crashed. It is called after the
+// compiler has run, with `runInfo` holding that run's exit code and output; if it returns true,
+// the compiler is re-executed with the extra input needed to crash it. Panics until filled in.
+func shouldAutocrashPostExec(env env, cfg *config, originalCmd *command, runInfo compilerExecInfo) bool {
+	// ** TODO, DEAR READER: ** Fill this in. Below are a few `if false {` blocks that should
+	// work for common use-cases. You're encouraged to change them to `if true {` if they suit
+	// your needs.
+
+	// Return true if `error: some error message` is contained in the run's stderr.
+	if false {
+		return bytes.Contains(runInfo.stderr, []byte("error: some error message"))
+	}
+
+	// Return true if `foo.c:${line_number}: error: some error message` appears in the run's
+	// stderr. Otherwise, return false.
+	if false {
+		r := regexp.MustCompile(`foo\.c:\d+: error: some error message`)
+		return r.Match(runInfo.stderr)
+	}
+
+	// Return true if there's a `-fjust-give-up` flag in the compiler's invocation.
+	if false {
+		for _, flag := range originalCmd.Args {
+			if flag == "-fjust-give-up" {
+				return true
+			}
+		}
+
+		return false
+	}
+
+	panic("Please fill in `shouldAutocrashPostExec` with meaningful logic.")
+}
+
+type compilerExecInfo struct {
+	exitCode       int    // exit code of the compiler run
+	stdout, stderr []byte // what that run wrote to stdout and stderr
+}
+
+// ** Below here are implementation details. If all you want is autocrashing behavior, you don't
+// need to keep reading. **
+const (
+	autocrashProgramLine = "\n#pragma clang __debug parser_crash" // written to the rerun's stdin
+)
+
+// buildWithAutocrashPredicates holds the settings that decide whether a build gets crashed.
+type buildWithAutocrashPredicates struct {
+	allowInConfigure bool
+	shouldAutocrash  func(env, *config, *command, compilerExecInfo) bool
+}
+
+// buildWithAutocrash runs buildWithAutocrashImpl with this file's compile-time settings.
+func buildWithAutocrash(env env, cfg *config, originalCmd *command) (exitCode int, err error) {
+	preds := buildWithAutocrashPredicates{allowInConfigure: allowAutoCrashInConfigure, shouldAutocrash: shouldAutocrashPostExec}
+	return buildWithAutocrashImpl(env, cfg, originalCmd, preds)
+}
+
+// buildWithAutocrashImpl runs originalCmd once with its output buffered and hands the result to
+// preds.shouldAutocrash. If no crash is requested, the buffered output is forwarded and the exit
+// code returned. Otherwise originalCmd is re-run with autocrashProgramLine appended to its stdin;
+// if stdin was not already an input, `-o <file>` is dropped and `-x c -` is added to its args.
+func buildWithAutocrashImpl(env env, cfg *config, originalCmd *command, preds buildWithAutocrashPredicates) (exitCode int, err error) {
+	var stdinBuffer *bytes.Buffer
+	var subprocStdin io.Reader
+	if needStdinTee(originalCmd) {
+		stdinBuffer = &bytes.Buffer{}
+		if _, err := stdinBuffer.ReadFrom(env.stdin()); err != nil {
+			return 0, wrapErrorwithSourceLocf(err, "prebuffering stdin")
+		}
+		// Reads drain a bytes.Buffer, so give the first run its own copy. Otherwise the rerun
+		// below would only see the #pragma instead of the original source followed by it.
+		subprocStdin = bytes.NewBuffer(append([]byte(nil), stdinBuffer.Bytes()...))
+	} else {
+		subprocStdin = env.stdin()
+	}
+
+	stdoutBuffer := &bytes.Buffer{}
+	stderrBuffer := &bytes.Buffer{}
+	exitCode, err = wrapSubprocessErrorWithSourceLoc(originalCmd,
+		env.run(originalCmd, subprocStdin, stdoutBuffer, stderrBuffer))
+	if err != nil {
+		return 0, err
+	}
+
+	autocrashAllowed := preds.allowInConfigure || !isInConfigureStage(env)
+	crash := autocrashAllowed && preds.shouldAutocrash(env, cfg, originalCmd, compilerExecInfo{
+		exitCode: exitCode,
+		stdout:   stdoutBuffer.Bytes(),
+		stderr:   stderrBuffer.Bytes(),
+	})
+	if !crash {
+		stdoutBuffer.WriteTo(env.stdout())
+		stderrBuffer.WriteTo(env.stderr())
+		return exitCode, nil
+	}
+
+	fmt.Fprintln(env.stderr(), "** Autocrash requested; crashing the compiler...**")
+
+	// `stdinBuffer == nil` implies that `-` wasn't used as a flag, so clang would ignore stdin. We
+	// can't reasonably modify the files being compiled, so stdin is where our #pragma has to go.
+	if stdinBuffer == nil {
+		// Clang can't handle `-o ${target}` when handed multiple input files. Since
+		// we expect to crash before emitting anything, remove `-o ${file}` entirely.
+		newArgs := make([]string, 0, len(originalCmd.Args)+3)
+		for i := 0; i < len(originalCmd.Args); i++ {
+			if originalCmd.Args[i] == "-o" {
+				i++ // Skip `-o` here; the loop header then skips its argument.
+				continue
+			}
+			newArgs = append(newArgs, originalCmd.Args[i])
+		}
+		// Have clang read stdin as an extra input; `-x c` names its language (C is as good as any).
+		originalCmd.Args = append(newArgs, "-x", "c", "-")
+		stdinBuffer = &bytes.Buffer{}
+	}
+
+	stdinBuffer.WriteString(autocrashProgramLine)
+	return wrapSubprocessErrorWithSourceLoc(originalCmd,
+		env.run(originalCmd, stdinBuffer, env.stdout(), env.stderr()))
+}
diff --git a/compiler_wrapper/crash_builds_test.go b/compiler_wrapper/crash_builds_test.go
new file mode 100644
index 0000000..a4b2b99
--- /dev/null
+++ b/compiler_wrapper/crash_builds_test.go
@@ -0,0 +1,260 @@
+// Copyright 2022 The ChromiumOS Authors.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package main
+
+import (
+	"bytes"
+	"io"
+	"strings"
+	"testing"
+)
+
+// A predicate that never requests a crash should result in exactly one compiler invocation.
+func TestBuildWithAutoCrashDoesNothingIfCrashIsNotRequested(t *testing.T) {
+	withTestContext(t, func(ctx *testContext) {
+		preds := buildWithAutocrashPredicates{
+			allowInConfigure: true,
+			shouldAutocrash: func(env, *config, *command, compilerExecInfo) bool {
+				return false
+			},
+		}
+		exitCode, err := buildWithAutocrashImpl(ctx, ctx.cfg, ctx.newCommand(clangX86_64, mainCc), preds)
+		if err != nil {
+			t.Fatalf("unexpectedly failed with %v", err)
+		}
+		ctx.must(exitCode)
+		if runs := ctx.cmdCount; runs != 1 {
+			t.Errorf("expected 1 call. Got: %d", runs)
+		}
+	})
+}
+
+// In the configure phase with allowInConfigure unset, a crash-happy predicate must not cause a rerun.
+func TestBuildWithAutoCrashSkipsAutocrashLogicIfInConfigureAndConfigureChecksDisabled(t *testing.T) {
+	withTestContext(t, func(ctx *testContext) {
+		preds := buildWithAutocrashPredicates{
+			allowInConfigure: false,
+			shouldAutocrash: func(env, *config, *command, compilerExecInfo) bool {
+				return true
+			},
+		}
+		ctx.env = append(ctx.env, "EBUILD_PHASE=configure")
+		exitCode, err := buildWithAutocrashImpl(ctx, ctx.cfg, ctx.newCommand(clangX86_64, mainCc), preds)
+		if err != nil {
+			t.Fatalf("unexpectedly failed with %v", err)
+		}
+		ctx.must(exitCode)
+		if runs := ctx.cmdCount; runs != 1 {
+			t.Errorf("expected 1 call. Got: %d", runs)
+		}
+	})
+}
+
+// A requested crash should rerun the command with `-` in its args and the autocrash line on stdin.
+func TestBuildWithAutoCrashRerunsIfPredicateRequestsCrash(t *testing.T) {
+	withTestContext(t, func(ctx *testContext) {
+		alwaysCrash := buildWithAutocrashPredicates{
+			allowInConfigure: true,
+			shouldAutocrash: func(env, *config, *command, compilerExecInfo) bool {
+				return true
+			},
+		}
+		ctx.cmdMock = func(cmd *command, stdin io.Reader, stdout io.Writer, stderr io.Writer) error {
+			sawDash := false
+			for _, arg := range cmd.Args {
+				if arg == "-" {
+					sawDash = true
+					break
+				}
+			}
+
+			switch ctx.cmdCount {
+			case 1:
+				if sawDash {
+					t.Error("Got `-` on command 1; didn't want that.")
+				}
+				return nil
+			case 2:
+				if !sawDash {
+					t.Error("Didn't get `-` on command 2; wanted that.")
+					return nil
+				}
+				got := stdin.(*bytes.Buffer).String()
+				if !strings.Contains(got, autocrashProgramLine) {
+					t.Errorf("Input was %q; expected %q to be in it", got, autocrashProgramLine)
+				}
+				return nil
+			default:
+				t.Fatalf("Unexpected command count: %d", ctx.cmdCount)
+				panic("Unreachable")
+			}
+		}
+
+		exitCode, err := buildWithAutocrashImpl(ctx, ctx.cfg, ctx.newCommand(clangX86_64, mainCc), alwaysCrash)
+		if err != nil {
+			t.Fatalf("unexpectedly failed with %v", err)
+		}
+		ctx.must(exitCode)
+
+		if ctx.cmdCount != 2 {
+			t.Errorf("expected 2 calls. Got: %d", ctx.cmdCount)
+		}
+	})
+}
+
+// When the command doesn't take stdin as an input, the rerun should gain exactly one `-` argument
+// and be fed the autocrash line through stdin.
+func TestBuildWithAutoCrashAddsDashAndWritesToStdinIfInputFileIsNotStdin(t *testing.T) {
+	withTestContext(t, func(ctx *testContext) {
+		alwaysCrash := buildWithAutocrashPredicates{
+			allowInConfigure: true,
+			shouldAutocrash: func(env, *config, *command, compilerExecInfo) bool {
+				return true
+			},
+		}
+
+		ctx.cmdMock = func(cmd *command, stdin io.Reader, stdout io.Writer, stderr io.Writer) error {
+			dashes := 0
+			for _, arg := range cmd.Args {
+				if arg == "-" {
+					dashes++
+				}
+			}
+
+			switch ctx.cmdCount {
+			case 1:
+				if dashes != 0 {
+					t.Errorf("Got %d dashes on command 1; want 0", dashes)
+				}
+				return nil
+			case 2:
+				if dashes != 1 {
+					t.Errorf("Got %d dashes on command 2; want 1", dashes)
+				}
+				input := stdin.(*bytes.Buffer).String()
+				if !strings.Contains(input, autocrashProgramLine) {
+					t.Error("Got no autocrash line on the second command; wanted that")
+				}
+				return nil
+			default:
+				t.Fatalf("Unexpected command count: %d", ctx.cmdCount)
+				panic("Unreachable")
+			}
+		}
+
+		exitCode, err := buildWithAutocrashImpl(ctx, ctx.cfg, ctx.newCommand(clangX86_64, mainCc), alwaysCrash)
+		if err != nil {
+			t.Fatalf("unexpectedly failed with %v", err)
+		}
+		ctx.must(exitCode)
+
+		if ctx.cmdCount != 2 {
+			t.Errorf("expected 2 calls. Got: %d", ctx.cmdCount)
+		}
+	})
+}
+
+// With stdin as the only input file, both runs should carry exactly one `-`, and only the rerun's
+// stdin should contain the autocrash line.
+func TestBuildWithAutoCrashAppendsToStdinIfStdinIsTheOnlyInputFile(t *testing.T) {
+	withTestContext(t, func(ctx *testContext) {
+		alwaysCrash := buildWithAutocrashPredicates{
+			allowInConfigure: true,
+			shouldAutocrash: func(env, *config, *command, compilerExecInfo) bool {
+				return true
+			},
+		}
+		ctx.cmdMock = func(cmd *command, stdin io.Reader, stdout io.Writer, stderr io.Writer) error {
+			dashes := 0
+			for _, arg := range cmd.Args {
+				if arg == "-" {
+					dashes++
+				}
+			}
+
+			if dashes != 1 {
+				t.Errorf("Got %d dashes on command %d (args: %#v); want 1", dashes, ctx.cmdCount, cmd.Args)
+			}
+
+			hasCrashLine := strings.Contains(stdin.(*bytes.Buffer).String(), autocrashProgramLine)
+
+			switch ctx.cmdCount {
+			case 1:
+				if hasCrashLine {
+					t.Error("Got autocrash line on the first command; did not want that")
+				}
+				return nil
+			case 2:
+				if !hasCrashLine {
+					t.Error("Got no autocrash line on the second command; wanted that")
+				}
+				return nil
+			default:
+				t.Fatalf("Unexpected command count: %d", ctx.cmdCount)
+				panic("Unreachable")
+			}
+		}
+
+		exitCode, err := buildWithAutocrashImpl(ctx, ctx.cfg, ctx.newCommand(clangX86_64, "-x", "c", "-"), alwaysCrash)
+		if err != nil {
+			t.Fatalf("unexpectedly failed with %v", err)
+		}
+		ctx.must(exitCode)
+
+		if ctx.cmdCount != 2 {
+			t.Errorf("expected 2 calls. Got: %d", ctx.cmdCount)
+		}
+	})
+}
+
+// The first run should still carry `-o <file>`; the crashing rerun should have it filtered out.
+func TestCrashBuildFiltersObjectFileOptionOnCrashes(t *testing.T) {
+	withTestContext(t, func(ctx *testContext) {
+		alwaysCrash := buildWithAutocrashPredicates{
+			allowInConfigure: true,
+			shouldAutocrash: func(env, *config, *command, compilerExecInfo) bool {
+				return true
+			},
+		}
+
+		const outputFileName = "/path/to/foo.o"
+
+		ctx.cmdMock = func(cmd *command, stdin io.Reader, stdout io.Writer, stderr io.Writer) error {
+			var outArg *string
+			for i, arg := range cmd.Args {
+				if arg == "-o" {
+					// Assume something follows; if not, this panics and the test fails.
+					outArg = &cmd.Args[i+1]
+				}
+			}
+
+			switch ctx.cmdCount {
+			case 1:
+				if outArg == nil || *outArg != outputFileName {
+					t.Errorf("Got command args %q; want `-o %q` in them", cmd.Args, outputFileName)
+				}
+				return nil
+			case 2:
+				if outArg != nil {
+					t.Errorf("Got command args %q; want no mention of `-o %q` in them", cmd.Args, outputFileName)
+				}
+				return nil
+			default:
+				t.Fatalf("Unexpected command count: %d", ctx.cmdCount)
+				panic("Unreachable")
+			}
+		}
+
+		exitCode, err := buildWithAutocrashImpl(ctx, ctx.cfg, ctx.newCommand(clangX86_64, "-o", outputFileName, mainCc), alwaysCrash)
+		if err != nil {
+			t.Fatalf("unexpectedly failed with %v", err)
+		}
+		ctx.must(exitCode)
+
+		if ctx.cmdCount != 2 {
+			t.Errorf("expected 2 calls. Got: %d", ctx.cmdCount)
+		}
+	})
+}