From 84f0606557aa8e88332c23eb10da15a9c047a287 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Wed, 1 Dec 2021 17:55:14 +0100
Subject: [PATCH] Revert "tsan: new runtime (v3)"

This reverts commit 66d4ce7e26a5ab00f7e4946b6e1bac8f805010fa.

Chromium tests started failing:
https://bugs.chromium.org/p/chromium/issues/detail?id=1275581
---
.../sanitizer_thread_registry.h | 2 -
compiler-rt/lib/tsan/CMakeLists.txt | 1 +
compiler-rt/lib/tsan/check_analyze.sh | 12 +-
compiler-rt/lib/tsan/go/build.bat | 1 -
compiler-rt/lib/tsan/go/buildgo.sh | 1 -
compiler-rt/lib/tsan/go/tsan_go.cpp | 2 +-
compiler-rt/lib/tsan/rtl/tsan_defs.h | 23 +-
compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h | 9 -
compiler-rt/lib/tsan/rtl/tsan_flags.cpp | 6 +
compiler-rt/lib/tsan/rtl/tsan_flags.inc | 10 +-
.../lib/tsan/rtl/tsan_interceptors_posix.cpp | 3 +-
.../lib/tsan/rtl/tsan_interface_atomic.cpp | 87 +-
.../lib/tsan/rtl/tsan_interface_java.cpp | 4 +-
compiler-rt/lib/tsan/rtl/tsan_mman.cpp | 32 +-
compiler-rt/lib/tsan/rtl/tsan_mman.h | 2 -
compiler-rt/lib/tsan/rtl/tsan_mutexset.cpp | 54 +-
compiler-rt/lib/tsan/rtl/tsan_mutexset.h | 11 +-
compiler-rt/lib/tsan/rtl/tsan_platform.h | 173 +++-
.../lib/tsan/rtl/tsan_platform_linux.cpp | 48 +-
.../lib/tsan/rtl/tsan_platform_mac.cpp | 9 +-
.../lib/tsan/rtl/tsan_platform_posix.cpp | 16 +-
.../lib/tsan/rtl/tsan_platform_windows.cpp | 3 +
compiler-rt/lib/tsan/rtl/tsan_rtl.cpp | 626 ++++---------
compiler-rt/lib/tsan/rtl/tsan_rtl.h | 325 ++++---
compiler-rt/lib/tsan/rtl/tsan_rtl_access.cpp | 860 ++++++++----------
compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp | 642 +++++++------
compiler-rt/lib/tsan/rtl/tsan_rtl_proc.cpp | 1 +
compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp | 367 +++++---
compiler-rt/lib/tsan/rtl/tsan_rtl_thread.cpp | 191 ++--
compiler-rt/lib/tsan/rtl/tsan_shadow.h | 315 ++++---
compiler-rt/lib/tsan/rtl/tsan_sync.cpp | 82 +-
compiler-rt/lib/tsan/rtl/tsan_sync.h | 48 +-
compiler-rt/lib/tsan/rtl/tsan_trace.h | 73 +-
.../lib/tsan/rtl/tsan_update_shadow_word.inc | 59 ++
.../lib/tsan/tests/unit/tsan_flags_test.cpp | 4 +-
.../lib/tsan/tests/unit/tsan_shadow_test.cpp | 92 +-
.../lib/tsan/tests/unit/tsan_stack_test.cpp | 4 +-
.../lib/tsan/tests/unit/tsan_sync_test.cpp | 17 +-
.../lib/tsan/tests/unit/tsan_trace_test.cpp | 175 +---
compiler-rt/test/tsan/bench_threads.cpp | 5 +
compiler-rt/test/tsan/free_race2.c | 2 +-
compiler-rt/test/tsan/memcmp_race.cpp | 2 +-
compiler-rt/test/tsan/memcpy_race.cpp | 10 +-
compiler-rt/test/tsan/mutexset7.cpp | 6 +-
44 files changed, 2132 insertions(+), 2283 deletions(-)
create mode 100644 compiler-rt/lib/tsan/rtl/tsan_update_shadow_word.inc
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.h b/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.h
index 89e5fefa3408..a259b324220f 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.h
@@ -104,8 +104,6 @@ class MUTEX ThreadRegistry {
return threads_.empty() ? nullptr : threads_[tid];
}
- u32 NumThreadsLocked() const { return threads_.size(); }
-
u32 CreateThread(uptr user_id, bool detached, u32 parent_tid, void *arg);
typedef void (*ThreadCallback)(ThreadContextBase *tctx, void *arg);
diff --git a/compiler-rt/lib/tsan/CMakeLists.txt b/compiler-rt/lib/tsan/CMakeLists.txt
index aede54f689aa..c3284a5dc422 100644
--- a/compiler-rt/lib/tsan/CMakeLists.txt
+++ b/compiler-rt/lib/tsan/CMakeLists.txt
@@ -119,6 +119,7 @@ set(TSAN_HEADERS
rtl/tsan_symbolize.h
rtl/tsan_sync.h
rtl/tsan_trace.h
+ rtl/tsan_update_shadow_word.inc
rtl/tsan_vector_clock.h
)
diff --git a/compiler-rt/lib/tsan/check_analyze.sh b/compiler-rt/lib/tsan/check_analyze.sh
index f507ba0172f3..3bd817c13697 100755
--- a/compiler-rt/lib/tsan/check_analyze.sh
+++ b/compiler-rt/lib/tsan/check_analyze.sh
@@ -34,27 +34,21 @@ check() {
fi
}
-# All hot functions must contain no PUSH/POP
-# and no CALLs (everything is tail-called).
for f in write1 write2 write4 write8; do
check $f rsp 1
- check $f push 0
- check $f pop 0
- check $f call 0
+ check $f push 2
done
for f in read1 read2 read4 read8; do
check $f rsp 1
- check $f push 0
- check $f pop 0
- check $f call 0
+ check $f push 3
done
for f in func_entry func_exit; do
check $f rsp 0
check $f push 0
check $f pop 0
- check $f call 0
+ check $f call 1 # TraceSwitch()
done
echo LGTM
diff --git a/compiler-rt/lib/tsan/go/build.bat b/compiler-rt/lib/tsan/go/build.bat
index e83410044314..496e127d9581 100644
--- a/compiler-rt/lib/tsan/go/build.bat
+++ b/compiler-rt/lib/tsan/go/build.bat
@@ -14,7 +14,6 @@ type ^
..\rtl\tsan_suppressions.cpp ^
..\rtl\tsan_sync.cpp ^
..\rtl\tsan_stack_trace.cpp ^
- ..\rtl\tsan_vector_clock.cpp ^
..\..\sanitizer_common\sanitizer_allocator.cpp ^
..\..\sanitizer_common\sanitizer_common.cpp ^
..\..\sanitizer_common\sanitizer_flags.cpp ^
diff --git a/compiler-rt/lib/tsan/go/buildgo.sh b/compiler-rt/lib/tsan/go/buildgo.sh
index ab0db57b2783..8f6ffd4d34c5 100755
--- a/compiler-rt/lib/tsan/go/buildgo.sh
+++ b/compiler-rt/lib/tsan/go/buildgo.sh
@@ -19,7 +19,6 @@ SRCS="
../rtl/tsan_stack_trace.cpp
../rtl/tsan_suppressions.cpp
../rtl/tsan_sync.cpp
- ../rtl/tsan_vector_clock.cpp
../../sanitizer_common/sanitizer_allocator.cpp
../../sanitizer_common/sanitizer_common.cpp
../../sanitizer_common/sanitizer_common_libcdep.cpp
diff --git a/compiler-rt/lib/tsan/go/tsan_go.cpp b/compiler-rt/lib/tsan/go/tsan_go.cpp
index c689a51fb5e1..104c5b325aee 100644
--- a/compiler-rt/lib/tsan/go/tsan_go.cpp
+++ b/compiler-rt/lib/tsan/go/tsan_go.cpp
@@ -214,7 +214,7 @@ void __tsan_malloc(ThreadState *thr, uptr pc, uptr p, uptr sz) {
}
void __tsan_free(uptr p, uptr sz) {
- ctx->metamap.FreeRange(get_cur_proc(), p, sz, false);
+ ctx->metamap.FreeRange(get_cur_proc(), p, sz);
}
void __tsan_go_start(ThreadState *parent, ThreadState **pthr, void *pc) {
diff --git a/compiler-rt/lib/tsan/rtl/tsan_defs.h b/compiler-rt/lib/tsan/rtl/tsan_defs.h
index d9f20d14a92a..4712c2be1813 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_defs.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_defs.h
@@ -63,13 +63,6 @@ enum class Epoch : u16 {};
constexpr uptr kEpochBits = 14;
constexpr Epoch kEpochZero = static_cast<Epoch>(0);
constexpr Epoch kEpochOver = static_cast<Epoch>(1 << kEpochBits);
-constexpr Epoch kEpochLast = static_cast<Epoch>((1 << kEpochBits) - 1);
-
-inline Epoch EpochInc(Epoch epoch) {
- return static_cast<Epoch>(static_cast<u16>(epoch) + 1);
-}
-
-inline bool EpochOverflow(Epoch epoch) { return epoch == kEpochOver; }
const int kClkBits = 42;
const unsigned kMaxTidReuse = (1 << (64 - kClkBits)) - 1;
@@ -114,7 +107,7 @@ const uptr kShadowCnt = 4;
const uptr kShadowCell = 8;
// Single shadow value.
-enum class RawShadow : u32 {};
+typedef u64 RawShadow;
const uptr kShadowSize = sizeof(RawShadow);
// Shadow memory is kShadowMultiplier times larger than user memory.
@@ -191,13 +184,10 @@ MD5Hash md5_hash(const void *data, uptr size);
struct Processor;
struct ThreadState;
class ThreadContext;
-struct TidSlot;
struct Context;
struct ReportStack;
class ReportDesc;
class RegionAlloc;
-struct Trace;
-struct TracePart;
typedef uptr AccessType;
@@ -208,8 +198,6 @@ enum : AccessType {
kAccessVptr = 1 << 2, // read or write of an object virtual table pointer
kAccessFree = 1 << 3, // synthetic memory access during memory freeing
kAccessExternalPC = 1 << 4, // access PC can have kExternalPCBit set
- kAccessCheckOnly = 1 << 5, // check for races, but don't store
- kAccessNoRodata = 1 << 6, // don't check for .rodata marker
};
// Descriptor of user's memory block.
@@ -231,8 +219,9 @@ enum ExternalTag : uptr {
// as 16-bit values, see tsan_defs.h.
};
-enum {
- MutexTypeReport = MutexLastCommon,
+enum MutexType {
+ MutexTypeTrace = MutexLastCommon,
+ MutexTypeReport,
MutexTypeSyncVar,
MutexTypeAnnotations,
MutexTypeAtExit,
@@ -240,10 +229,6 @@ enum {
MutexTypeRacy,
MutexTypeGlobalProc,
MutexTypeInternalAlloc,
- MutexTypeTrace,
- MutexTypeSlot,
- MutexTypeSlots,
- MutexTypeMultiSlot,
};
} // namespace __tsan
diff --git a/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h b/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h
index 7a39a39d51de..9e15f74a0615 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h
@@ -104,15 +104,6 @@ class DenseSlabAlloc {
return atomic_load_relaxed(&fillpos_) * kL2Size * sizeof(T);
}
- template <typename Func>
- void ForEach(Func func) {
- SpinMutexLock lock(&mtx_);
- uptr fillpos = atomic_load_relaxed(&fillpos_);
- for (uptr l1 = 0; l1 < fillpos; l1++) {
- for (IndexT l2 = l1 == 0 ? 1 : 0; l2 < kL2Size; l2++) func(&map_[l1][l2]);
- }
- }
-
private:
T *map_[kL1Size];
SpinMutex mtx_;
diff --git a/compiler-rt/lib/tsan/rtl/tsan_flags.cpp b/compiler-rt/lib/tsan/rtl/tsan_flags.cpp
index 54bed9f9a6be..ee89862d17bd 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_flags.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_flags.cpp
@@ -110,6 +110,12 @@ void InitializeFlags(Flags *f, const char *env, const char *env_option_name) {
if (common_flags()->help) parser.PrintFlagDescriptions();
+ if (f->history_size < 0 || f->history_size > 7) {
+ Printf("ThreadSanitizer: incorrect value for history_size"
+ " (must be [0..7])\n");
+ Die();
+ }
+
if (f->io_sync < 0 || f->io_sync > 2) {
Printf("ThreadSanitizer: incorrect value for io_sync"
" (must be [0..2])\n");
diff --git a/compiler-rt/lib/tsan/rtl/tsan_flags.inc b/compiler-rt/lib/tsan/rtl/tsan_flags.inc
index 3df180ec68cc..7954a4307fa1 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_flags.inc
+++ b/compiler-rt/lib/tsan/rtl/tsan_flags.inc
@@ -59,10 +59,14 @@ TSAN_FLAG(bool, stop_on_start, false,
"Stops on start until __tsan_resume() is called (for debugging).")
TSAN_FLAG(bool, running_on_valgrind, false,
"Controls whether RunningOnValgrind() returns true or false.")
+// There are a lot of goroutines in Go, so we use smaller history.
TSAN_FLAG(
- uptr, history_size, 0,
- "Per-thread history size,"
- " controls how many extra previous memory accesses are remembered per thread.")
+ int, history_size, SANITIZER_GO ? 1 : 3,
+ "Per-thread history size, controls how many previous memory accesses "
+ "are remembered per thread. Possible values are [0..7]. "
+ "history_size=0 amounts to 32K memory accesses. Each next value doubles "
+ "the amount of memory accesses, up to history_size=7 that amounts to "
+ "4M memory accesses. The default value is 2 (128K memory accesses).")
TSAN_FLAG(int, io_sync, 1,
"Controls level of synchronization implied by IO operations. "
"0 - no synchronization "
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
index 280db4ae28e5..73df011b4212 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp
@@ -1981,7 +1981,6 @@ static void ReportErrnoSpoiling(ThreadState *thr, uptr pc) {
static void CallUserSignalHandler(ThreadState *thr, bool sync, bool acquire,
int sig, __sanitizer_siginfo *info,
void *uctx) {
- CHECK(thr->slot);
__sanitizer_sigaction *sigactions = interceptor_ctx()->sigactions;
if (acquire)
Acquire(thr, 0, (uptr)&sigactions[sig]);
@@ -2269,7 +2268,7 @@ struct dl_iterate_phdr_data {
};
static bool IsAppNotRodata(uptr addr) {
- return IsAppMem(addr) && *MemToShadow(addr) != Shadow::kRodata;
+ return IsAppMem(addr) && *MemToShadow(addr) != kShadowRodata;
}
static int dl_iterate_phdr_cb(__sanitizer_dl_phdr_info *info, SIZE_T size,
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interface_atomic.cpp b/compiler-rt/lib/tsan/rtl/tsan_interface_atomic.cpp
index f794a2fcdd0d..24ba3bb1f65d 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interface_atomic.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_interface_atomic.cpp
@@ -235,9 +235,8 @@ static T AtomicLoad(ThreadState *thr, uptr pc, const volatile T *a, morder mo) {
T v = NoTsanAtomicLoad(a, mo);
SyncVar *s = ctx->metamap.GetSyncIfExists((uptr)a);
if (s) {
- SlotLocker locker(thr);
- ReadLock lock(&s->mtx);
- thr->clock.Acquire(s->clock);
+ ReadLock l(&s->mtx);
+ AcquireImpl(thr, pc, &s->clock);
// Re-read under sync mutex because we need a consistent snapshot
// of the value and the clock we acquire.
v = NoTsanAtomicLoad(a, mo);
@@ -271,14 +270,14 @@ static void AtomicStore(ThreadState *thr, uptr pc, volatile T *a, T v,
NoTsanAtomicStore(a, v, mo);
return;
}
- SlotLocker locker(thr);
- {
- auto s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false);
- Lock lock(&s->mtx);
- thr->clock.ReleaseStore(&s->clock);
- NoTsanAtomicStore(a, v, mo);
- }
- IncrementEpoch(thr);
+ __sync_synchronize();
+ SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false);
+ Lock l(&s->mtx);
+ thr->fast_state.IncrementEpoch();
+ // Can't increment epoch w/o writing to the trace as well.
+ TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
+ ReleaseStoreImpl(thr, pc, &s->clock);
+ NoTsanAtomicStore(a, v, mo);
}
template <typename T, T (*F)(volatile T *v, T op)>
@@ -286,21 +285,18 @@ static T AtomicRMW(ThreadState *thr, uptr pc, volatile T *a, T v, morder mo) {
MemoryAccess(thr, pc, (uptr)a, AccessSize<T>(), kAccessWrite | kAccessAtomic);
if (LIKELY(mo == mo_relaxed))
return F(a, v);
- SlotLocker locker(thr);
- {
- auto s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false);
- RWLock lock(&s->mtx, IsReleaseOrder(mo));
- if (IsAcqRelOrder(mo))
- thr->clock.ReleaseAcquire(&s->clock);
- else if (IsReleaseOrder(mo))
- thr->clock.Release(&s->clock);
- else if (IsAcquireOrder(mo))
- thr->clock.Acquire(s->clock);
- v = F(a, v);
- }
- if (IsReleaseOrder(mo))
- IncrementEpoch(thr);
- return v;
+ SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false);
+ Lock l(&s->mtx);
+ thr->fast_state.IncrementEpoch();
+ // Can't increment epoch w/o writing to the trace as well.
+ TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
+ if (IsAcqRelOrder(mo))
+ AcquireReleaseImpl(thr, pc, &s->clock);
+ else if (IsReleaseOrder(mo))
+ ReleaseImpl(thr, pc, &s->clock);
+ else if (IsAcquireOrder(mo))
+ AcquireImpl(thr, pc, &s->clock);
+ return F(a, v);
}
template<typename T>
@@ -420,28 +416,27 @@ static bool AtomicCAS(ThreadState *thr, uptr pc, volatile T *a, T *c, T v,
*c = pr;
return false;
}
- SlotLocker locker(thr);
+
bool release = IsReleaseOrder(mo);
- bool success;
- {
- auto s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false);
- RWLock lock(&s->mtx, release);
- T cc = *c;
- T pr = func_cas(a, cc, v);
- success = pr == cc;
- if (!success) {
- *c = pr;
- mo = fmo;
- }
- if (success && IsAcqRelOrder(mo))
- thr->clock.ReleaseAcquire(&s->clock);
- else if (success && IsReleaseOrder(mo))
- thr->clock.Release(&s->clock);
- else if (IsAcquireOrder(mo))
- thr->clock.Acquire(s->clock);
+ SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false);
+ RWLock l(&s->mtx, release);
+ T cc = *c;
+ T pr = func_cas(a, cc, v);
+ bool success = pr == cc;
+ if (!success) {
+ *c = pr;
+ mo = fmo;
}
- if (success && release)
- IncrementEpoch(thr);
+ thr->fast_state.IncrementEpoch();
+ // Can't increment epoch w/o writing to the trace as well.
+ TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
+
+ if (success && IsAcqRelOrder(mo))
+ AcquireReleaseImpl(thr, pc, &s->clock);
+ else if (success && IsReleaseOrder(mo))
+ ReleaseImpl(thr, pc, &s->clock);
+ else if (IsAcquireOrder(mo))
+ AcquireImpl(thr, pc, &s->clock);
return success;
}
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interface_java.cpp b/compiler-rt/lib/tsan/rtl/tsan_interface_java.cpp
index 7c15a1638826..c090c1f08cbe 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interface_java.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_interface_java.cpp
@@ -106,7 +106,7 @@ void __tsan_java_free(jptr ptr, jptr size) {
DCHECK_GE(ptr, jctx->heap_begin);
DCHECK_LE(ptr + size, jctx->heap_begin + jctx->heap_size);
- ctx->metamap.FreeRange(thr->proc(), ptr, size, false);
+ ctx->metamap.FreeRange(thr->proc(), ptr, size);
}
void __tsan_java_move(jptr src, jptr dst, jptr size) {
@@ -133,7 +133,7 @@ void __tsan_java_move(jptr src, jptr dst, jptr size) {
// support that anymore as it contains addresses of accesses.
RawShadow *d = MemToShadow(dst);
RawShadow *dend = MemToShadow(dst + size);
- ShadowSet(d, dend, Shadow::kEmpty);
+ internal_memset(d, 0, (dend - d) * sizeof(*d));
}
jptr __tsan_java_find(jptr *from_ptr, jptr to) {
diff --git a/compiler-rt/lib/tsan/rtl/tsan_mman.cpp b/compiler-rt/lib/tsan/rtl/tsan_mman.cpp
index 18022d012bbc..a31bebcb6ba9 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_mman.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_mman.cpp
@@ -125,6 +125,7 @@ ScopedGlobalProcessor::~ScopedGlobalProcessor() {
}
void AllocatorLock() NO_THREAD_SAFETY_ANALYSIS {
+ global_proc()->mtx.Lock();
global_proc()->internal_alloc_mtx.Lock();
InternalAllocatorLock();
}
@@ -132,13 +133,6 @@ void AllocatorLock() NO_THREAD_SAFETY_ANALYSIS {
void AllocatorUnlock() NO_THREAD_SAFETY_ANALYSIS {
InternalAllocatorUnlock();
global_proc()->internal_alloc_mtx.Unlock();
-}
-
-void GlobalProcessorLock() NO_THREAD_SAFETY_ANALYSIS {
- global_proc()->mtx.Lock();
-}
-
-void GlobalProcessorUnlock() NO_THREAD_SAFETY_ANALYSIS {
global_proc()->mtx.Unlock();
}
@@ -251,17 +245,8 @@ void *user_reallocarray(ThreadState *thr, uptr pc, void *p, uptr size, uptr n) {
void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write) {
DPrintf("#%d: alloc(%zu) = 0x%zx\n", thr->tid, sz, p);
- // Note: this can run before thread initialization/after finalization.
- // As a result this is not necessarily synchronized with DoReset,
- // which iterates over and resets all sync objects,
- // but it is fine to create new MBlocks in this context.
ctx->metamap.AllocBlock(thr, pc, p, sz);
- // If this runs before thread initialization/after finalization
- // and we don't have trace initialized, we can't imitate writes.
- // In such case just reset the shadow range, it is fine since
- // it affects only a small fraction of special objects.
- if (write && thr->ignore_reads_and_writes == 0 &&
- atomic_load_relaxed(&thr->trace_pos))
+ if (write && thr->ignore_reads_and_writes == 0 && thr->is_inited)
MemoryRangeImitateWrite(thr, pc, (uptr)p, sz);
else
MemoryResetRange(thr, pc, (uptr)p, sz);
@@ -269,16 +254,9 @@ void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write) {
void OnUserFree(ThreadState *thr, uptr pc, uptr p, bool write) {
CHECK_NE(p, (void*)0);
- if (!thr->slot) {
- // Very early/late in thread lifetime, or during fork.
- UNUSED uptr sz = ctx->metamap.FreeBlock(thr->proc(), p, false);
- DPrintf("#%d: free(0x%zx, %zu) (no slot)\n", thr->tid, p, sz);
- return;
- }
- SlotLocker locker(thr);
- uptr sz = ctx->metamap.FreeBlock(thr->proc(), p, true);
+ uptr sz = ctx->metamap.FreeBlock(thr->proc(), p);
DPrintf("#%d: free(0x%zx, %zu)\n", thr->tid, p, sz);
- if (write && thr->ignore_reads_and_writes == 0)
+ if (write && thr->ignore_reads_and_writes == 0 && thr->is_inited)
MemoryRangeFreed(thr, pc, (uptr)p, sz);
}
@@ -443,6 +421,8 @@ uptr __sanitizer_get_allocated_size(const void *p) {
void __tsan_on_thread_idle() {
ThreadState *thr = cur_thread();
+ thr->clock.ResetCached(&thr->proc()->clock_cache);
+ thr->last_sleep_clock.ResetCached(&thr->proc()->clock_cache);
allocator()->SwallowCache(&thr->proc()->alloc_cache);
internal_allocator()->SwallowCache(&thr->proc()->internal_alloc_cache);
ctx->metamap.OnProcIdle(thr->proc());
diff --git a/compiler-rt/lib/tsan/rtl/tsan_mman.h b/compiler-rt/lib/tsan/rtl/tsan_mman.h
index 2095f28c0253..db8488eabbe2 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_mman.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_mman.h
@@ -26,8 +26,6 @@ void AllocatorProcFinish(Processor *proc);
void AllocatorPrintStats();
void AllocatorLock();
void AllocatorUnlock();
-void GlobalProcessorLock();
-void GlobalProcessorUnlock();
// For user allocations.
void *user_alloc_internal(ThreadState *thr, uptr pc, uptr sz,
diff --git a/compiler-rt/lib/tsan/rtl/tsan_mutexset.cpp b/compiler-rt/lib/tsan/rtl/tsan_mutexset.cpp
index 3a75b80ac30f..735179686ba9 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_mutexset.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_mutexset.cpp
@@ -19,7 +19,57 @@ namespace __tsan {
MutexSet::MutexSet() {
}
-void MutexSet::Reset() { internal_memset(this, 0, sizeof(*this)); }
+void MutexSet::Add(u64 id, bool write, u64 epoch) {
+ // Look up existing mutex with the same id.
+ for (uptr i = 0; i < size_; i++) {
+ if (descs_[i].id == id) {
+ descs_[i].count++;
+ descs_[i].epoch = epoch;
+ return;
+ }
+ }
+ // On overflow, find the oldest mutex and drop it.
+ if (size_ == kMaxSize) {
+ u64 minepoch = (u64)-1;
+ u64 mini = (u64)-1;
+ for (uptr i = 0; i < size_; i++) {
+ if (descs_[i].epoch < minepoch) {
+ minepoch = descs_[i].epoch;
+ mini = i;
+ }
+ }
+ RemovePos(mini);
+ CHECK_EQ(size_, kMaxSize - 1);
+ }
+ // Add new mutex descriptor.
+ descs_[size_].addr = 0;
+ descs_[size_].stack_id = kInvalidStackID;
+ descs_[size_].id = id;
+ descs_[size_].write = write;
+ descs_[size_].epoch = epoch;
+ descs_[size_].seq = seq_++;
+ descs_[size_].count = 1;
+ size_++;
+}
+
+void MutexSet::Del(u64 id, bool write) {
+ for (uptr i = 0; i < size_; i++) {
+ if (descs_[i].id == id) {
+ if (--descs_[i].count == 0)
+ RemovePos(i);
+ return;
+ }
+ }
+}
+
+void MutexSet::Remove(u64 id) {
+ for (uptr i = 0; i < size_; i++) {
+ if (descs_[i].id == id) {
+ RemovePos(i);
+ return;
+ }
+ }
+}
void MutexSet::AddAddr(uptr addr, StackID stack_id, bool write) {
// Look up existing mutex with the same id.
@@ -43,7 +93,9 @@ void MutexSet::AddAddr(uptr addr, StackID stack_id, bool write) {
// Add new mutex descriptor.
descs_[size_].addr = addr;
descs_[size_].stack_id = stack_id;
+ descs_[size_].id = 0;
descs_[size_].write = write;
+ descs_[size_].epoch = 0;
descs_[size_].seq = seq_++;
descs_[size_].count = 1;
size_++;
diff --git a/compiler-rt/lib/tsan/rtl/tsan_mutexset.h b/compiler-rt/lib/tsan/rtl/tsan_mutexset.h
index aabd361e6afd..93776a664135 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_mutexset.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_mutexset.h
@@ -25,6 +25,8 @@ class MutexSet {
struct Desc {
uptr addr;
StackID stack_id;
+ u64 id;
+ u64 epoch;
u32 seq;
u32 count;
bool write;
@@ -38,7 +40,10 @@ class MutexSet {
};
MutexSet();
- void Reset();
+ // The 'id' is obtained from SyncVar::GetId().
+ void Add(u64 id, bool write, u64 epoch);
+ void Del(u64 id, bool write);
+ void Remove(u64 id); // Removes the mutex completely (if it's destroyed).
void AddAddr(uptr addr, StackID stack_id, bool write);
void DelAddr(uptr addr, bool destroy = false);
uptr Size() const;
@@ -77,7 +82,9 @@ class DynamicMutexSet {
// in different goroutine).
#if SANITIZER_GO
MutexSet::MutexSet() {}
-void MutexSet::Reset() {}
+void MutexSet::Add(u64 id, bool write, u64 epoch) {}
+void MutexSet::Del(u64 id, bool write) {}
+void MutexSet::Remove(u64 id) {}
void MutexSet::AddAddr(uptr addr, StackID stack_id, bool write) {}
void MutexSet::DelAddr(uptr addr, bool destroy) {}
uptr MutexSet::Size() const { return 0; }
diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform.h b/compiler-rt/lib/tsan/rtl/tsan_platform.h
index e28bac2457aa..7ff0acace8f6 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_platform.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_platform.h
@@ -18,8 +18,8 @@
# error "Only 64-bit is supported"
#endif
-#include "sanitizer_common/sanitizer_common.h"
#include "tsan_defs.h"
+#include "tsan_trace.h"
namespace __tsan {
@@ -45,7 +45,9 @@ C/C++ on linux/x86_64 and freebsd/x86_64
3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects)
4000 0000 0000 - 5500 0000 0000: -
5500 0000 0000 - 5680 0000 0000: pie binaries without ASLR or on 4.1+ kernels
-5680 0000 0000 - 7d00 0000 0000: -
+5680 0000 0000 - 6000 0000 0000: -
+6000 0000 0000 - 6200 0000 0000: traces
+6200 0000 0000 - 7d00 0000 0000: -
7b00 0000 0000 - 7c00 0000 0000: heap
7c00 0000 0000 - 7e80 0000 0000: -
7e80 0000 0000 - 8000 0000 0000: modules and main thread stack
@@ -65,6 +67,8 @@ C/C++ on netbsd/amd64 can reuse the same mapping:
struct Mapping48AddressSpace {
static const uptr kMetaShadowBeg = 0x300000000000ull;
static const uptr kMetaShadowEnd = 0x340000000000ull;
+ static const uptr kTraceMemBeg = 0x600000000000ull;
+ static const uptr kTraceMemEnd = 0x620000000000ull;
static const uptr kShadowBeg = 0x010000000000ull;
static const uptr kShadowEnd = 0x200000000000ull;
static const uptr kHeapMemBeg = 0x7b0000000000ull;
@@ -85,12 +89,14 @@ struct Mapping48AddressSpace {
C/C++ on linux/mips64 (40-bit VMA)
0000 0000 00 - 0100 0000 00: - (4 GB)
0100 0000 00 - 0200 0000 00: main binary (4 GB)
-0200 0000 00 - 1200 0000 00: - (120 GB)
-1200 0000 00 - 4000 0000 00: shadow (128 GB)
+0200 0000 00 - 2000 0000 00: - (120 GB)
+2000 0000 00 - 4000 0000 00: shadow (128 GB)
4000 0000 00 - 5000 0000 00: metainfo (memory blocks and sync objects) (64 GB)
5000 0000 00 - aa00 0000 00: - (360 GB)
aa00 0000 00 - ab00 0000 00: main binary (PIE) (4 GB)
-ab00 0000 00 - fe00 0000 00: - (332 GB)
+ab00 0000 00 - b000 0000 00: - (20 GB)
+b000 0000 00 - b200 0000 00: traces (8 GB)
+b200 0000 00 - fe00 0000 00: - (304 GB)
fe00 0000 00 - ff00 0000 00: heap (4 GB)
ff00 0000 00 - ff80 0000 00: - (2 GB)
ff80 0000 00 - ffff ffff ff: modules and main thread stack (<2 GB)
@@ -98,7 +104,9 @@ ff80 0000 00 - ffff ffff ff: modules and main thread stack (<2 GB)
struct MappingMips64_40 {
static const uptr kMetaShadowBeg = 0x4000000000ull;
static const uptr kMetaShadowEnd = 0x5000000000ull;
- static const uptr kShadowBeg = 0x1200000000ull;
+ static const uptr kTraceMemBeg = 0xb000000000ull;
+ static const uptr kTraceMemEnd = 0xb200000000ull;
+ static const uptr kShadowBeg = 0x2000000000ull;
static const uptr kShadowEnd = 0x4000000000ull;
static const uptr kHeapMemBeg = 0xfe00000000ull;
static const uptr kHeapMemEnd = 0xff00000000ull;
@@ -123,7 +131,9 @@ C/C++ on Darwin/iOS/ARM64 (36-bit VMA, 64 GB VM)
0400 0000 00 - 0c00 0000 00: shadow memory (32 GB)
0c00 0000 00 - 0d00 0000 00: - (4 GB)
0d00 0000 00 - 0e00 0000 00: metainfo (4 GB)
-0e00 0000 00 - 1000 0000 00: -
+0e00 0000 00 - 0f00 0000 00: - (4 GB)
+0f00 0000 00 - 0fc0 0000 00: traces (3 GB)
+0fc0 0000 00 - 1000 0000 00: -
*/
struct MappingAppleAarch64 {
static const uptr kLoAppMemBeg = 0x0100000000ull;
@@ -134,11 +144,13 @@ struct MappingAppleAarch64 {
static const uptr kShadowEnd = 0x0c00000000ull;
static const uptr kMetaShadowBeg = 0x0d00000000ull;
static const uptr kMetaShadowEnd = 0x0e00000000ull;
+ static const uptr kTraceMemBeg = 0x0f00000000ull;
+ static const uptr kTraceMemEnd = 0x0fc0000000ull;
static const uptr kHiAppMemBeg = 0x0fc0000000ull;
static const uptr kHiAppMemEnd = 0x0fc0000000ull;
static const uptr kShadowMsk = 0x0ull;
static const uptr kShadowXor = 0x0ull;
- static const uptr kShadowAdd = 0x0200000000ull;
+ static const uptr kShadowAdd = 0x0ull;
static const uptr kVdsoBeg = 0x7000000000000000ull;
static const uptr kMidAppMemBeg = 0;
static const uptr kMidAppMemEnd = 0;
@@ -147,25 +159,29 @@ struct MappingAppleAarch64 {
/*
C/C++ on linux/aarch64 (39-bit VMA)
0000 0010 00 - 0100 0000 00: main binary
-0100 0000 00 - 0400 0000 00: -
-0400 0000 00 - 2000 0000 00: shadow memory
+0100 0000 00 - 0800 0000 00: -
+0800 0000 00 - 2000 0000 00: shadow memory
2000 0000 00 - 3100 0000 00: -
3100 0000 00 - 3400 0000 00: metainfo
3400 0000 00 - 5500 0000 00: -
5500 0000 00 - 5600 0000 00: main binary (PIE)
-5600 0000 00 - 7c00 0000 00: -
+5600 0000 00 - 6000 0000 00: -
+6000 0000 00 - 6200 0000 00: traces
+6200 0000 00 - 7d00 0000 00: -
7c00 0000 00 - 7d00 0000 00: heap
7d00 0000 00 - 7fff ffff ff: modules and main thread stack
*/
struct MappingAarch64_39 {
static const uptr kLoAppMemBeg = 0x0000001000ull;
static const uptr kLoAppMemEnd = 0x0100000000ull;
- static const uptr kShadowBeg = 0x0400000000ull;
+ static const uptr kShadowBeg = 0x0800000000ull;
static const uptr kShadowEnd = 0x2000000000ull;
static const uptr kMetaShadowBeg = 0x3100000000ull;
static const uptr kMetaShadowEnd = 0x3400000000ull;
static const uptr kMidAppMemBeg = 0x5500000000ull;
- static const uptr kMidAppMemEnd = 0x5600000000ull;
+ static const uptr kMidAppMemEnd = 0x5600000000ull;
+ static const uptr kTraceMemBeg = 0x6000000000ull;
+ static const uptr kTraceMemEnd = 0x6200000000ull;
static const uptr kHeapMemBeg = 0x7c00000000ull;
static const uptr kHeapMemEnd = 0x7d00000000ull;
static const uptr kHiAppMemBeg = 0x7e00000000ull;
@@ -179,13 +195,15 @@ struct MappingAarch64_39 {
/*
C/C++ on linux/aarch64 (42-bit VMA)
00000 0010 00 - 01000 0000 00: main binary
-01000 0000 00 - 08000 0000 00: -
-08000 0000 00 - 20000 0000 00: shadow memory
+01000 0000 00 - 10000 0000 00: -
+10000 0000 00 - 20000 0000 00: shadow memory
20000 0000 00 - 26000 0000 00: -
26000 0000 00 - 28000 0000 00: metainfo
28000 0000 00 - 2aa00 0000 00: -
2aa00 0000 00 - 2ab00 0000 00: main binary (PIE)
-2ab00 0000 00 - 3e000 0000 00: -
+2ab00 0000 00 - 36200 0000 00: -
+36200 0000 00 - 36240 0000 00: traces
+36240 0000 00 - 3e000 0000 00: -
3e000 0000 00 - 3f000 0000 00: heap
3f000 0000 00 - 3ffff ffff ff: modules and main thread stack
*/
@@ -193,12 +211,14 @@ struct MappingAarch64_42 {
static const uptr kBroken = kBrokenReverseMapping;
static const uptr kLoAppMemBeg = 0x00000001000ull;
static const uptr kLoAppMemEnd = 0x01000000000ull;
- static const uptr kShadowBeg = 0x08000000000ull;
+ static const uptr kShadowBeg = 0x10000000000ull;
static const uptr kShadowEnd = 0x20000000000ull;
static const uptr kMetaShadowBeg = 0x26000000000ull;
static const uptr kMetaShadowEnd = 0x28000000000ull;
static const uptr kMidAppMemBeg = 0x2aa00000000ull;
- static const uptr kMidAppMemEnd = 0x2ab00000000ull;
+ static const uptr kMidAppMemEnd = 0x2ab00000000ull;
+ static const uptr kTraceMemBeg = 0x36200000000ull;
+ static const uptr kTraceMemEnd = 0x36400000000ull;
static const uptr kHeapMemBeg = 0x3e000000000ull;
static const uptr kHeapMemEnd = 0x3f000000000ull;
static const uptr kHiAppMemBeg = 0x3f000000000ull;
@@ -212,12 +232,14 @@ struct MappingAarch64_42 {
struct MappingAarch64_48 {
static const uptr kLoAppMemBeg = 0x0000000001000ull;
static const uptr kLoAppMemEnd = 0x0000200000000ull;
- static const uptr kShadowBeg = 0x0001000000000ull;
+ static const uptr kShadowBeg = 0x0002000000000ull;
static const uptr kShadowEnd = 0x0004000000000ull;
static const uptr kMetaShadowBeg = 0x0005000000000ull;
static const uptr kMetaShadowEnd = 0x0006000000000ull;
static const uptr kMidAppMemBeg = 0x0aaaa00000000ull;
- static const uptr kMidAppMemEnd = 0x0aaaf00000000ull;
+ static const uptr kMidAppMemEnd = 0x0aaaf00000000ull;
+ static const uptr kTraceMemBeg = 0x0f06000000000ull;
+ static const uptr kTraceMemEnd = 0x0f06200000000ull;
static const uptr kHeapMemBeg = 0x0ffff00000000ull;
static const uptr kHeapMemEnd = 0x0ffff00000000ull;
static const uptr kHiAppMemBeg = 0x0ffff00000000ull;
@@ -235,7 +257,9 @@ C/C++ on linux/powerpc64 (44-bit VMA)
0001 0000 0000 - 0b00 0000 0000: shadow
0b00 0000 0000 - 0b00 0000 0000: -
0b00 0000 0000 - 0d00 0000 0000: metainfo (memory blocks and sync objects)
-0d00 0000 0000 - 0f00 0000 0000: -
+0d00 0000 0000 - 0d00 0000 0000: -
+0d00 0000 0000 - 0f00 0000 0000: traces
+0f00 0000 0000 - 0f00 0000 0000: -
0f00 0000 0000 - 0f50 0000 0000: heap
0f50 0000 0000 - 0f60 0000 0000: -
0f60 0000 0000 - 1000 0000 0000: modules and main thread stack
@@ -245,6 +269,8 @@ struct MappingPPC64_44 {
kBrokenMapping | kBrokenReverseMapping | kBrokenLinearity;
static const uptr kMetaShadowBeg = 0x0b0000000000ull;
static const uptr kMetaShadowEnd = 0x0d0000000000ull;
+ static const uptr kTraceMemBeg = 0x0d0000000000ull;
+ static const uptr kTraceMemEnd = 0x0f0000000000ull;
static const uptr kShadowBeg = 0x000100000000ull;
static const uptr kShadowEnd = 0x0b0000000000ull;
static const uptr kLoAppMemBeg = 0x000000000100ull;
@@ -269,7 +295,8 @@ C/C++ on linux/powerpc64 (46-bit VMA)
1000 0000 0000 - 1000 0000 0000: -
1000 0000 0000 - 2000 0000 0000: metainfo (memory blocks and sync objects)
2000 0000 0000 - 2000 0000 0000: -
-1200 0000 0000 - 3d00 0000 0000: -
+2000 0000 0000 - 2200 0000 0000: traces
+2200 0000 0000 - 3d00 0000 0000: -
3d00 0000 0000 - 3e00 0000 0000: heap
3e00 0000 0000 - 3e80 0000 0000: -
3e80 0000 0000 - 4000 0000 0000: modules and main thread stack
@@ -277,6 +304,8 @@ C/C++ on linux/powerpc64 (46-bit VMA)
struct MappingPPC64_46 {
static const uptr kMetaShadowBeg = 0x100000000000ull;
static const uptr kMetaShadowEnd = 0x200000000000ull;
+ static const uptr kTraceMemBeg = 0x200000000000ull;
+ static const uptr kTraceMemEnd = 0x220000000000ull;
static const uptr kShadowBeg = 0x010000000000ull;
static const uptr kShadowEnd = 0x100000000000ull;
static const uptr kHeapMemBeg = 0x3d0000000000ull;
@@ -300,7 +329,9 @@ C/C++ on linux/powerpc64 (47-bit VMA)
0100 0000 0000 - 1000 0000 0000: shadow
1000 0000 0000 - 1000 0000 0000: -
1000 0000 0000 - 2000 0000 0000: metainfo (memory blocks and sync objects)
-2000 0000 0000 - 7d00 0000 0000: -
+2000 0000 0000 - 2000 0000 0000: -
+2000 0000 0000 - 2200 0000 0000: traces
+2200 0000 0000 - 7d00 0000 0000: -
7d00 0000 0000 - 7e00 0000 0000: heap
7e00 0000 0000 - 7e80 0000 0000: -
7e80 0000 0000 - 8000 0000 0000: modules and main thread stack
@@ -308,6 +339,8 @@ C/C++ on linux/powerpc64 (47-bit VMA)
struct MappingPPC64_47 {
static const uptr kMetaShadowBeg = 0x100000000000ull;
static const uptr kMetaShadowEnd = 0x200000000000ull;
+ static const uptr kTraceMemBeg = 0x200000000000ull;
+ static const uptr kTraceMemEnd = 0x220000000000ull;
static const uptr kShadowBeg = 0x010000000000ull;
static const uptr kShadowEnd = 0x100000000000ull;
static const uptr kHeapMemBeg = 0x7d0000000000ull;
@@ -329,17 +362,21 @@ C/C++ on linux/s390x
While the kernel provides a 64-bit address space, we have to restrict ourselves
to 48 bits due to how e.g. SyncVar::GetId() works.
0000 0000 1000 - 0e00 0000 0000: binary, modules, stacks - 14 TiB
-0e00 0000 0000 - 2000 0000 0000: -
-2000 0000 0000 - 8000 0000 0000: shadow - 64TiB (4 * app)
+0e00 0000 0000 - 4000 0000 0000: -
+4000 0000 0000 - 8000 0000 0000: shadow - 64TiB (4 * app)
8000 0000 0000 - 9000 0000 0000: -
9000 0000 0000 - 9800 0000 0000: metainfo - 8TiB (0.5 * app)
-9800 0000 0000 - be00 0000 0000: -
+9800 0000 0000 - a000 0000 0000: -
+a000 0000 0000 - b000 0000 0000: traces - 16TiB (max history * 128k threads)
+b000 0000 0000 - be00 0000 0000: -
be00 0000 0000 - c000 0000 0000: heap - 2TiB (max supported by the allocator)
*/
struct MappingS390x {
static const uptr kMetaShadowBeg = 0x900000000000ull;
static const uptr kMetaShadowEnd = 0x980000000000ull;
- static const uptr kShadowBeg = 0x200000000000ull;
+ static const uptr kTraceMemBeg = 0xa00000000000ull;
+ static const uptr kTraceMemEnd = 0xb00000000000ull;
+ static const uptr kShadowBeg = 0x400000000000ull;
static const uptr kShadowEnd = 0x800000000000ull;
static const uptr kHeapMemBeg = 0xbe0000000000ull;
static const uptr kHeapMemEnd = 0xc00000000000ull;
@@ -363,12 +400,16 @@ struct MappingS390x {
2000 0000 0000 - 2380 0000 0000: shadow
2380 0000 0000 - 3000 0000 0000: -
3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects)
-4000 0000 0000 - 8000 0000 0000: -
+4000 0000 0000 - 6000 0000 0000: -
+6000 0000 0000 - 6200 0000 0000: traces
+6200 0000 0000 - 8000 0000 0000: -
*/
struct MappingGo48 {
static const uptr kMetaShadowBeg = 0x300000000000ull;
static const uptr kMetaShadowEnd = 0x400000000000ull;
+ static const uptr kTraceMemBeg = 0x600000000000ull;
+ static const uptr kTraceMemEnd = 0x620000000000ull;
static const uptr kShadowBeg = 0x200000000000ull;
static const uptr kShadowEnd = 0x238000000000ull;
static const uptr kLoAppMemBeg = 0x000000001000ull;
@@ -391,7 +432,7 @@ struct MappingGo48 {
00c0 0000 0000 - 00e0 0000 0000: heap
00e0 0000 0000 - 0100 0000 0000: -
0100 0000 0000 - 0500 0000 0000: shadow
-0500 0000 0000 - 0700 0000 0000: -
+0500 0000 0000 - 0700 0000 0000: traces
0700 0000 0000 - 0770 0000 0000: metainfo (memory blocks and sync objects)
07d0 0000 0000 - 8000 0000 0000: -
*/
@@ -399,6 +440,8 @@ struct MappingGo48 {
struct MappingGoWindows {
static const uptr kMetaShadowBeg = 0x070000000000ull;
static const uptr kMetaShadowEnd = 0x077000000000ull;
+ static const uptr kTraceMemBeg = 0x050000000000ull;
+ static const uptr kTraceMemEnd = 0x070000000000ull;
static const uptr kShadowBeg = 0x010000000000ull;
static const uptr kShadowEnd = 0x050000000000ull;
static const uptr kLoAppMemBeg = 0x000000001000ull;
@@ -423,12 +466,16 @@ struct MappingGoWindows {
2000 0000 0000 - 2380 0000 0000: shadow
2380 0000 0000 - 2400 0000 0000: -
2400 0000 0000 - 3400 0000 0000: metainfo (memory blocks and sync objects)
-3400 0000 0000 - 4000 0000 0000: -
+3400 0000 0000 - 3600 0000 0000: -
+3600 0000 0000 - 3800 0000 0000: traces
+3800 0000 0000 - 4000 0000 0000: -
*/
struct MappingGoPPC64_46 {
static const uptr kMetaShadowBeg = 0x240000000000ull;
static const uptr kMetaShadowEnd = 0x340000000000ull;
+ static const uptr kTraceMemBeg = 0x360000000000ull;
+ static const uptr kTraceMemEnd = 0x380000000000ull;
static const uptr kShadowBeg = 0x200000000000ull;
static const uptr kShadowEnd = 0x238000000000ull;
static const uptr kLoAppMemBeg = 0x000000001000ull;
@@ -453,12 +500,16 @@ struct MappingGoPPC64_46 {
2000 0000 0000 - 3000 0000 0000: shadow
3000 0000 0000 - 3000 0000 0000: -
3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects)
-4000 0000 0000 - 8000 0000 0000: -
+4000 0000 0000 - 6000 0000 0000: -
+6000 0000 0000 - 6200 0000 0000: traces
+6200 0000 0000 - 8000 0000 0000: -
*/
struct MappingGoPPC64_47 {
static const uptr kMetaShadowBeg = 0x300000000000ull;
static const uptr kMetaShadowEnd = 0x400000000000ull;
+ static const uptr kTraceMemBeg = 0x600000000000ull;
+ static const uptr kTraceMemEnd = 0x620000000000ull;
static const uptr kShadowBeg = 0x200000000000ull;
static const uptr kShadowEnd = 0x300000000000ull;
static const uptr kLoAppMemBeg = 0x000000001000ull;
@@ -483,11 +534,15 @@ struct MappingGoPPC64_47 {
2000 0000 0000 - 3000 0000 0000: shadow
3000 0000 0000 - 3000 0000 0000: -
3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects)
-4000 0000 0000 - 8000 0000 0000: -
+4000 0000 0000 - 6000 0000 0000: -
+6000 0000 0000 - 6200 0000 0000: traces
+6200 0000 0000 - 8000 0000 0000: -
*/
struct MappingGoAarch64 {
static const uptr kMetaShadowBeg = 0x300000000000ull;
static const uptr kMetaShadowEnd = 0x400000000000ull;
+ static const uptr kTraceMemBeg = 0x600000000000ull;
+ static const uptr kTraceMemEnd = 0x620000000000ull;
static const uptr kShadowBeg = 0x200000000000ull;
static const uptr kShadowEnd = 0x300000000000ull;
static const uptr kLoAppMemBeg = 0x000000001000ull;
@@ -513,11 +568,15 @@ Go on linux/mips64 (47-bit VMA)
2000 0000 0000 - 3000 0000 0000: shadow
3000 0000 0000 - 3000 0000 0000: -
3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects)
-3200 0000 0000 - 8000 0000 0000: -
+4000 0000 0000 - 6000 0000 0000: -
+6000 0000 0000 - 6200 0000 0000: traces
+6200 0000 0000 - 8000 0000 0000: -
*/
struct MappingGoMips64_47 {
static const uptr kMetaShadowBeg = 0x300000000000ull;
static const uptr kMetaShadowEnd = 0x400000000000ull;
+ static const uptr kTraceMemBeg = 0x600000000000ull;
+ static const uptr kTraceMemEnd = 0x620000000000ull;
static const uptr kShadowBeg = 0x200000000000ull;
static const uptr kShadowEnd = 0x300000000000ull;
static const uptr kLoAppMemBeg = 0x000000001000ull;
@@ -541,10 +600,14 @@ Go on linux/s390x
4000 0000 0000 - 8000 0000 0000: shadow - 64TiB (4 * app)
8000 0000 0000 - 9000 0000 0000: -
9000 0000 0000 - 9800 0000 0000: metainfo - 8TiB (0.5 * app)
+9800 0000 0000 - a000 0000 0000: -
+a000 0000 0000 - b000 0000 0000: traces - 16TiB (max history * 128k threads)
*/
struct MappingGoS390x {
static const uptr kMetaShadowBeg = 0x900000000000ull;
static const uptr kMetaShadowEnd = 0x980000000000ull;
+ static const uptr kTraceMemBeg = 0xa00000000000ull;
+ static const uptr kTraceMemEnd = 0xb00000000000ull;
static const uptr kShadowBeg = 0x400000000000ull;
static const uptr kShadowEnd = 0x800000000000ull;
static const uptr kLoAppMemBeg = 0x000000001000ull;
@@ -652,6 +715,8 @@ enum MappingType {
kShadowEnd,
kMetaShadowBeg,
kMetaShadowEnd,
+ kTraceMemBeg,
+ kTraceMemEnd,
kVdsoBeg,
};
@@ -685,6 +750,10 @@ struct MappingField {
return Mapping::kMetaShadowBeg;
case kMetaShadowEnd:
return Mapping::kMetaShadowEnd;
+ case kTraceMemBeg:
+ return Mapping::kTraceMemBeg;
+ case kTraceMemEnd:
+ return Mapping::kTraceMemEnd;
}
Die();
}
@@ -723,6 +792,11 @@ uptr MetaShadowBeg(void) { return SelectMapping<MappingField>(kMetaShadowBeg); }
ALWAYS_INLINE
uptr MetaShadowEnd(void) { return SelectMapping<MappingField>(kMetaShadowEnd); }
+ALWAYS_INLINE
+uptr TraceMemBeg(void) { return SelectMapping<MappingField>(kTraceMemBeg); }
+ALWAYS_INLINE
+uptr TraceMemEnd(void) { return SelectMapping<MappingField>(kTraceMemEnd); }
+
struct IsAppMemImpl {
template <typename Mapping>
static bool Apply(uptr mem) {
@@ -860,10 +934,43 @@ inline uptr RestoreAddr(uptr addr) {
return SelectMapping<RestoreAddrImpl>(addr);
}
+// The additional page is to catch shadow stack overflow as paging fault.
+// Windows wants 64K alignment for mmaps.
+const uptr kTotalTraceSize = (kTraceSize * sizeof(Event) + sizeof(Trace)
+ + (64 << 10) + (64 << 10) - 1) & ~((64 << 10) - 1);
+
+struct GetThreadTraceImpl {
+ template <typename Mapping>
+ static uptr Apply(uptr tid) {
+ uptr p = Mapping::kTraceMemBeg + tid * kTotalTraceSize;
+ DCHECK_LT(p, Mapping::kTraceMemEnd);
+ return p;
+ }
+};
+
+ALWAYS_INLINE
+uptr GetThreadTrace(int tid) { return SelectMapping<GetThreadTraceImpl>(tid); }
+
+struct GetThreadTraceHeaderImpl {
+ template <typename Mapping>
+ static uptr Apply(uptr tid) {
+ uptr p = Mapping::kTraceMemBeg + tid * kTotalTraceSize +
+ kTraceSize * sizeof(Event);
+ DCHECK_LT(p, Mapping::kTraceMemEnd);
+ return p;
+ }
+};
+
+ALWAYS_INLINE
+uptr GetThreadTraceHeader(int tid) {
+ return SelectMapping<GetThreadTraceHeaderImpl>(tid);
+}
+
void InitializePlatform();
void InitializePlatformEarly();
void CheckAndProtect();
void InitializeShadowMemoryPlatform();
+void FlushShadowMemory();
void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns);
int ExtractResolvFDs(void *state, int *fds, int nfd);
int ExtractRecvmsgFDs(void *msg, int *fds, int nfd);
diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp
index 17dbdff8a539..73ec14892d28 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp
@@ -94,6 +94,7 @@ enum {
MemMeta,
MemFile,
MemMmap,
+ MemTrace,
MemHeap,
MemOther,
MemCount,
@@ -111,6 +112,8 @@ void FillProfileCallback(uptr p, uptr rss, bool file, uptr *mem) {
mem[file ? MemFile : MemMmap] += rss;
else if (p >= HeapMemBeg() && p < HeapMemEnd())
mem[MemHeap] += rss;
+ else if (p >= TraceMemBeg() && p < TraceMemEnd())
+ mem[MemTrace] += rss;
else
mem[MemOther] += rss;
}
@@ -123,33 +126,42 @@ void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) {
StackDepotStats stacks = StackDepotGetStats();
uptr nthread, nlive;
ctx->thread_registry.GetNumberOfThreads(&nthread, &nlive);
- uptr trace_mem;
- {
- Lock l(&ctx->slot_mtx);
- trace_mem = ctx->trace_part_total_allocated * sizeof(TracePart);
- }
uptr internal_stats[AllocatorStatCount];
internal_allocator()->GetStats(internal_stats);
// All these are allocated from the common mmap region.
- mem[MemMmap] -= meta.mem_block + meta.sync_obj + trace_mem +
- stacks.allocated + internal_stats[AllocatorStatMapped];
+ mem[MemMmap] -= meta.mem_block + meta.sync_obj + stacks.allocated +
+ internal_stats[AllocatorStatMapped];
if (s64(mem[MemMmap]) < 0)
mem[MemMmap] = 0;
internal_snprintf(
buf, buf_size,
- "==%zu== %llus [%zu]: RSS %zd MB: shadow:%zd meta:%zd file:%zd"
- " mmap:%zd heap:%zd other:%zd intalloc:%zd memblocks:%zd syncobj:%zu"
- " trace:%zu stacks=%zd threads=%zu/%zu\n",
- internal_getpid(), uptime_ns / (1000 * 1000 * 1000), ctx->global_epoch,
- mem[MemTotal] >> 20, mem[MemShadow] >> 20, mem[MemMeta] >> 20,
- mem[MemFile] >> 20, mem[MemMmap] >> 20, mem[MemHeap] >> 20,
+ "%llus: RSS %zd MB: shadow:%zd meta:%zd file:%zd mmap:%zd"
+ " trace:%zd heap:%zd other:%zd intalloc:%zd memblocks:%zd syncobj:%zu"
+ " stacks=%zd[%zd] nthr=%zd/%zd\n",
+ uptime_ns / (1000 * 1000 * 1000), mem[MemTotal] >> 20,
+ mem[MemShadow] >> 20, mem[MemMeta] >> 20, mem[MemFile] >> 20,
+ mem[MemMmap] >> 20, mem[MemTrace] >> 20, mem[MemHeap] >> 20,
mem[MemOther] >> 20, internal_stats[AllocatorStatMapped] >> 20,
- meta.mem_block >> 20, meta.sync_obj >> 20, trace_mem >> 20,
- stacks.allocated >> 20, nlive, nthread);
+ meta.mem_block >> 20, meta.sync_obj >> 20, stacks.allocated >> 20,
+ stacks.n_uniq_ids, nlive, nthread);
+}
+
+# if SANITIZER_LINUX
+void FlushShadowMemoryCallback(
+ const SuspendedThreadsList &suspended_threads_list,
+ void *argument) {
+ ReleaseMemoryPagesToOS(ShadowBeg(), ShadowEnd());
+}
+#endif
+
+void FlushShadowMemory() {
+#if SANITIZER_LINUX
+ StopTheWorld(FlushShadowMemoryCallback, 0);
+#endif
}
#if !SANITIZER_GO
-// Mark shadow for .rodata sections with the special Shadow::kRodata marker.
+// Mark shadow for .rodata sections with the special kShadowRodata marker.
// Accesses to .rodata can't race, so this saves time, memory and trace space.
static void MapRodata() {
// First create temp file.
@@ -170,13 +182,13 @@ static void MapRodata() {
return;
internal_unlink(name); // Unlink it now, so that we can reuse the buffer.
fd_t fd = openrv;
- // Fill the file with Shadow::kRodata.
+ // Fill the file with kShadowRodata.
const uptr kMarkerSize = 512 * 1024 / sizeof(RawShadow);
InternalMmapVector<RawShadow> marker(kMarkerSize);
// volatile to prevent insertion of memset
for (volatile RawShadow *p = marker.data(); p < marker.data() + kMarkerSize;
p++)
- *p = Shadow::kRodata;
+ *p = kShadowRodata;
internal_write(fd, marker.data(), marker.size() * sizeof(RawShadow));
// Map the file into memory.
uptr page = internal_mmap(0, GetPageSizeCached(), PROT_READ | PROT_WRITE,
diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp
index 10e072559860..97ef9f7dfaab 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp
@@ -112,6 +112,9 @@ void cur_thread_finalize() {
}
#endif
+void FlushShadowMemory() {
+}
+
static void RegionMemUsage(uptr start, uptr end, uptr *res, uptr *dirty) {
vm_address_t address = start;
vm_address_t end_address = end;
@@ -139,10 +142,12 @@ static void RegionMemUsage(uptr start, uptr end, uptr *res, uptr *dirty) {
void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) {
uptr shadow_res, shadow_dirty;
uptr meta_res, meta_dirty;
+ uptr trace_res, trace_dirty;
RegionMemUsage(ShadowBeg(), ShadowEnd(), &shadow_res, &shadow_dirty);
RegionMemUsage(MetaShadowBeg(), MetaShadowEnd(), &meta_res, &meta_dirty);
+ RegionMemUsage(TraceMemBeg(), TraceMemEnd(), &trace_res, &trace_dirty);
-# if !SANITIZER_GO
+#if !SANITIZER_GO
uptr low_res, low_dirty;
uptr high_res, high_dirty;
uptr heap_res, heap_dirty;
@@ -161,6 +166,7 @@ void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) {
buf, buf_size,
"shadow (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n"
"meta (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n"
+ "traces (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n"
# if !SANITIZER_GO
"low app (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n"
"high app (0x%016zx-0x%016zx): resident %zd kB, dirty %zd kB\n"
@@ -173,6 +179,7 @@ void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) {
"------------------------------\n",
ShadowBeg(), ShadowEnd(), shadow_res / 1024, shadow_dirty / 1024,
MetaShadowBeg(), MetaShadowEnd(), meta_res / 1024, meta_dirty / 1024,
+ TraceMemBeg(), TraceMemEnd(), trace_res / 1024, trace_dirty / 1024,
# if !SANITIZER_GO
LoAppMemBeg(), LoAppMemEnd(), low_res / 1024, low_dirty / 1024,
HiAppMemBeg(), HiAppMemEnd(), high_res / 1024, high_dirty / 1024,
diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_posix.cpp
index 763a533de525..763ac444377e 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_platform_posix.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_platform_posix.cpp
@@ -113,20 +113,24 @@ void CheckAndProtect() {
# if defined(__aarch64__) && defined(__APPLE__) && SANITIZER_IOS
ProtectRange(HeapMemEnd(), ShadowBeg());
ProtectRange(ShadowEnd(), MetaShadowBeg());
- ProtectRange(MetaShadowEnd(), HeapMemBeg());
-# else
+ ProtectRange(MetaShadowEnd(), TraceMemBeg());
+#else
ProtectRange(LoAppMemEnd(), ShadowBeg());
ProtectRange(ShadowEnd(), MetaShadowBeg());
if (MidAppMemBeg()) {
ProtectRange(MetaShadowEnd(), MidAppMemBeg());
- ProtectRange(MidAppMemEnd(), HeapMemBeg());
+ ProtectRange(MidAppMemEnd(), TraceMemBeg());
} else {
- ProtectRange(MetaShadowEnd(), HeapMemBeg());
+ ProtectRange(MetaShadowEnd(), TraceMemBeg());
}
+ // Memory for traces is mapped lazily in MapThreadTrace.
+ // Protect the whole range for now, so that user does not map something here.
+ ProtectRange(TraceMemBeg(), TraceMemEnd());
+ ProtectRange(TraceMemEnd(), HeapMemBeg());
ProtectRange(HeapEnd(), HiAppMemBeg());
-# endif
+#endif
-# if defined(__s390x__)
+#if defined(__s390x__)
// Protect the rest of the address space.
const uptr user_addr_max_l4 = 0x0020000000000000ull;
const uptr user_addr_max_l5 = 0xfffffffffffff000ull;
diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_windows.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_windows.cpp
index eb8f354742f4..fea893768c79 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_platform_windows.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_platform_windows.cpp
@@ -20,6 +20,9 @@
namespace __tsan {
+void FlushShadowMemory() {
+}
+
void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns) {}
void InitializePlatformEarly() {
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
index 507f93e6a4cc..6ff52e34a2c6 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp
@@ -57,348 +57,110 @@ Context *ctx;
bool OnFinalize(bool failed);
void OnInitialize();
#else
+#include <dlfcn.h>
SANITIZER_WEAK_CXX_DEFAULT_IMPL
bool OnFinalize(bool failed) {
-# if !SANITIZER_GO
+#if !SANITIZER_GO
if (on_finalize)
return on_finalize(failed);
-# endif
+#endif
return failed;
}
-
SANITIZER_WEAK_CXX_DEFAULT_IMPL
void OnInitialize() {
-# if !SANITIZER_GO
+#if !SANITIZER_GO
if (on_initialize)
on_initialize();
-# endif
-}
#endif
-
-static TracePart* TracePartAlloc(ThreadState* thr) {
- TracePart* part = nullptr;
- {
- Lock lock(&ctx->slot_mtx);
- uptr max_parts = Trace::kMinParts + flags()->history_size;
- Trace* trace = &thr->tctx->trace;
- if (trace->parts_allocated == max_parts ||
- ctx->trace_part_finished_excess) {
- part = ctx->trace_part_recycle.PopFront();
- DPrintf("#%d: TracePartAlloc: part=%p\n", thr->tid, part);
- if (part && part->trace) {
- Trace* trace1 = part->trace;
- Lock trace_lock(&trace1->mtx);
- part->trace = nullptr;
- TracePart* part1 = trace1->parts.PopFront();
- CHECK_EQ(part, part1);
- if (trace1->parts_allocated > trace1->parts.Size()) {
- ctx->trace_part_finished_excess +=
- trace1->parts_allocated - trace1->parts.Size();
- trace1->parts_allocated = trace1->parts.Size();
- }
- }
- }
- if (trace->parts_allocated < max_parts) {
- trace->parts_allocated++;
- if (ctx->trace_part_finished_excess)
- ctx->trace_part_finished_excess--;
- }
- if (!part)
- ctx->trace_part_total_allocated++;
- else if (ctx->trace_part_recycle_finished)
- ctx->trace_part_recycle_finished--;
- }
- if (!part)
- part = new (MmapOrDie(sizeof(*part), "TracePart")) TracePart();
- return part;
-}
-
-static void TracePartFree(TracePart* part) REQUIRES(ctx->slot_mtx) {
- DCHECK(part->trace);
- part->trace = nullptr;
- ctx->trace_part_recycle.PushFront(part);
-}
-
-void TraceResetForTesting() {
- Lock lock(&ctx->slot_mtx);
- while (auto* part = ctx->trace_part_recycle.PopFront()) {
- if (auto trace = part->trace)
- CHECK_EQ(trace->parts.PopFront(), part);
- UnmapOrDie(part, sizeof(*part));
- }
- ctx->trace_part_total_allocated = 0;
- ctx->trace_part_recycle_finished = 0;
- ctx->trace_part_finished_excess = 0;
-}
-
-static void DoResetImpl(uptr epoch) {
- ThreadRegistryLock lock0(&ctx->thread_registry);
- Lock lock1(&ctx->slot_mtx);
- CHECK_EQ(ctx->global_epoch, epoch);
- ctx->global_epoch++;
- CHECK(!ctx->resetting);
- ctx->resetting = true;
- for (u32 i = ctx->thread_registry.NumThreadsLocked(); i--;) {
- ThreadContext* tctx = (ThreadContext*)ctx->thread_registry.GetThreadLocked(
- static_cast<Tid>(i));
- // Potentially we could purge all ThreadStatusDead threads from the
- // registry. Since we reset all shadow, they can't race with anything
- // anymore. However, their tid's can still be stored in some aux places
- // (e.g. tid of thread that created something).
- auto trace = &tctx->trace;
- Lock lock(&trace->mtx);
- bool attached = tctx->thr && tctx->thr->slot;
- auto parts = &trace->parts;
- bool local = false;
- while (!parts->Empty()) {
- auto part = parts->Front();
- local = local || part == trace->local_head;
- if (local)
- CHECK(!ctx->trace_part_recycle.Queued(part));
- else
- ctx->trace_part_recycle.Remove(part);
- if (attached && parts->Size() == 1) {
- // The thread is running and this is the last/current part.
- // Set the trace position to the end of the current part
- // to force the thread to call SwitchTracePart and re-attach
- // to a new slot and allocate a new trace part.
- // Note: the thread is concurrently modifying the position as well,
- // so this is only best-effort. The thread can only modify position
- // within this part, because switching parts is protected by
- // slot/trace mutexes that we hold here.
- atomic_store_relaxed(
- &tctx->thr->trace_pos,
- reinterpret_cast<uptr>(&part->events[TracePart::kSize]));
- break;
- }
- parts->Remove(part);
- TracePartFree(part);
- }
- CHECK_LE(parts->Size(), 1);
- trace->local_head = parts->Front();
- if (tctx->thr && !tctx->thr->slot) {
- atomic_store_relaxed(&tctx->thr->trace_pos, 0);
- tctx->thr->trace_prev_pc = 0;
- }
- if (trace->parts_allocated > trace->parts.Size()) {
- ctx->trace_part_finished_excess +=
- trace->parts_allocated - trace->parts.Size();
- trace->parts_allocated = trace->parts.Size();
- }
- }
- while (ctx->slot_queue.PopFront()) {
- }
- for (auto& slot : ctx->slots) {
- slot.SetEpoch(kEpochZero);
- slot.journal.Reset();
- slot.thr = nullptr;
- ctx->slot_queue.PushBack(&slot);
- }
-
- DPrintf("Resetting shadow...\n");
- if (!MmapFixedSuperNoReserve(ShadowBeg(), ShadowEnd() - ShadowBeg(),
- "shadow")) {
- Printf("failed to reset shadow memory\n");
- Die();
- }
- DPrintf("Resetting meta shadow...\n");
- ctx->metamap.ResetClocks();
- ctx->resetting = false;
-}
-
-// Clang does not understand locking all slots in the loop:
-// error: expecting mutex 'slot.mtx' to be held at start of each loop
-void DoReset(ThreadState* thr, uptr epoch) NO_THREAD_SAFETY_ANALYSIS {
- {
- Lock l(&ctx->multi_slot_mtx);
- for (auto& slot : ctx->slots) {
- slot.mtx.Lock();
- if (UNLIKELY(epoch == 0))
- epoch = ctx->global_epoch;
- if (UNLIKELY(epoch != ctx->global_epoch)) {
- // Epoch can't change once we've locked the first slot.
- CHECK_EQ(slot.sid, 0);
- slot.mtx.Unlock();
- return;
- }
- }
- }
- DPrintf("#%d: DoReset epoch=%lu\n", thr ? thr->tid : -1, epoch);
- DoResetImpl(epoch);
- for (auto& slot : ctx->slots) slot.mtx.Unlock();
}
+#endif
-void FlushShadowMemory() { DoReset(nullptr, 0); }
-
-static TidSlot* FindSlotAndLock(ThreadState* thr)
- ACQUIRE(thr->slot->mtx) NO_THREAD_SAFETY_ANALYSIS {
- CHECK(!thr->slot);
- TidSlot* slot = nullptr;
- for (;;) {
- uptr epoch;
- {
- Lock lock(&ctx->slot_mtx);
- epoch = ctx->global_epoch;
- if (slot) {
- // This is an exhausted slot from the previous iteration.
- if (ctx->slot_queue.Queued(slot))
- ctx->slot_queue.Remove(slot);
- thr->slot_locked = false;
- slot->mtx.Unlock();
- }
- for (;;) {
- slot = ctx->slot_queue.PopFront();
- if (!slot)
- break;
- if (slot->epoch() != kEpochLast) {
- ctx->slot_queue.PushBack(slot);
- break;
- }
- }
- }
- if (!slot) {
- DoReset(thr, epoch);
- continue;
+static ThreadContextBase *CreateThreadContext(Tid tid) {
+ // Map thread trace when context is created.
+ char name[50];
+ internal_snprintf(name, sizeof(name), "trace %u", tid);
+ MapThreadTrace(GetThreadTrace(tid), TraceSize() * sizeof(Event), name);
+ const uptr hdr = GetThreadTraceHeader(tid);
+ internal_snprintf(name, sizeof(name), "trace header %u", tid);
+ MapThreadTrace(hdr, sizeof(Trace), name);
+ new((void*)hdr) Trace();
+ // We are going to use only a small part of the trace with the default
+ // value of history_size. However, the constructor writes to the whole trace.
+ // Release the unused part.
+ uptr hdr_end = hdr + sizeof(Trace);
+ hdr_end -= sizeof(TraceHeader) * (kTraceParts - TraceParts());
+ hdr_end = RoundUp(hdr_end, GetPageSizeCached());
+ if (hdr_end < hdr + sizeof(Trace)) {
+ ReleaseMemoryPagesToOS(hdr_end, hdr + sizeof(Trace));
+ uptr unused = hdr + sizeof(Trace) - hdr_end;
+ if (hdr_end != (uptr)MmapFixedNoAccess(hdr_end, unused)) {
+ Report("ThreadSanitizer: failed to mprotect [0x%zx-0x%zx) \n", hdr_end,
+ unused);
+ CHECK("unable to mprotect" && 0);
}
- slot->mtx.Lock();
- CHECK(!thr->slot_locked);
- thr->slot_locked = true;
- if (slot->thr) {
- DPrintf("#%d: preempting sid=%d tid=%d\n", thr->tid, (u32)slot->sid,
- slot->thr->tid);
- slot->SetEpoch(slot->thr->fast_state.epoch());
- slot->thr = nullptr;
- }
- if (slot->epoch() != kEpochLast)
- return slot;
}
+ return New<ThreadContext>(tid);
}
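As an illustration of the page-release arithmetic in CreateThreadContext above: hdr_end is rounded up to a page boundary before the unused tail is returned to the OS, so no page that still holds in-use trace headers can be released. The standalone sketch below restates that computation; all addresses, sizes and the page size are assumed values for illustration, not the runtime's real constants.

#include <cstdint>
#include <cstdio>

// Round x up to the next multiple of align (align is a power of two).
static uint64_t RoundUp(uint64_t x, uint64_t align) {
  return (x + align - 1) & ~(align - 1);
}

int main() {
  // Hypothetical numbers, chosen only to illustrate the arithmetic:
  const uint64_t kPageSize = 4096;
  const uint64_t hdr = 0x10000000;         // start of the mapped Trace header
  const uint64_t full_size = 256 << 10;    // size of the header with all parts
  const uint64_t unused_tail = 192 << 10;  // headers for parts we will not use

  // Mirror of the logic above: keep the used prefix, round the cut point up
  // to a page boundary, and release everything after it.
  uint64_t hdr_end = hdr + full_size - unused_tail;
  hdr_end = RoundUp(hdr_end, kPageSize);
  if (hdr_end < hdr + full_size)
    std::printf("release [0x%llx, 0x%llx)\n", (unsigned long long)hdr_end,
                (unsigned long long)(hdr + full_size));
  return 0;
}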
-void SlotAttachAndLock(ThreadState* thr) {
- TidSlot* slot = FindSlotAndLock(thr);
- DPrintf("#%d: SlotAttach: slot=%u\n", thr->tid, static_cast<int>(slot->sid));
- CHECK(!slot->thr);
- CHECK(!thr->slot);
- slot->thr = thr;
- thr->slot = slot;
- Epoch epoch = EpochInc(slot->epoch());
- CHECK(!EpochOverflow(epoch));
- slot->SetEpoch(epoch);
- thr->fast_state.SetSid(slot->sid);
- thr->fast_state.SetEpoch(epoch);
- if (thr->slot_epoch != ctx->global_epoch) {
- thr->slot_epoch = ctx->global_epoch;
- thr->clock.Reset();
#if !SANITIZER_GO
- thr->last_sleep_stack_id = kInvalidStackID;
- thr->last_sleep_clock.Reset();
+static const u32 kThreadQuarantineSize = 16;
+#else
+static const u32 kThreadQuarantineSize = 64;
#endif
- }
- thr->clock.Set(slot->sid, epoch);
- slot->journal.PushBack({thr->tid, epoch});
-}
-
-static void SlotDetachImpl(ThreadState* thr, bool exiting) {
- TidSlot* slot = thr->slot;
- thr->slot = nullptr;
- if (thr != slot->thr) {
- slot = nullptr; // we don't own the slot anymore
- if (thr->slot_epoch != ctx->global_epoch) {
- TracePart* part = nullptr;
- auto* trace = &thr->tctx->trace;
- {
- Lock l(&trace->mtx);
- auto* parts = &trace->parts;
-        // The trace can be completely empty in the unlikely event that
-        // the thread is preempted right after it acquired the slot
-        // in ThreadStart and has not traced any events yet.
- CHECK_LE(parts->Size(), 1);
- part = parts->PopFront();
- thr->tctx->trace.local_head = nullptr;
- atomic_store_relaxed(&thr->trace_pos, 0);
- thr->trace_prev_pc = 0;
- }
- if (part) {
- Lock l(&ctx->slot_mtx);
- TracePartFree(part);
- }
- }
- return;
- }
- CHECK(exiting || thr->fast_state.epoch() == kEpochLast);
- slot->SetEpoch(thr->fast_state.epoch());
- slot->thr = nullptr;
-}
-
-void SlotDetach(ThreadState* thr) {
- Lock lock(&thr->slot->mtx);
- SlotDetachImpl(thr, true);
-}
-
-void SlotLock(ThreadState* thr) NO_THREAD_SAFETY_ANALYSIS {
- DCHECK(!thr->slot_locked);
- TidSlot* slot = thr->slot;
- slot->mtx.Lock();
- thr->slot_locked = true;
- if (LIKELY(thr == slot->thr && thr->fast_state.epoch() != kEpochLast))
- return;
- SlotDetachImpl(thr, false);
- thr->slot_locked = false;
- slot->mtx.Unlock();
- SlotAttachAndLock(thr);
-}
-
-void SlotUnlock(ThreadState* thr) {
- DCHECK(thr->slot_locked);
- thr->slot_locked = false;
- thr->slot->mtx.Unlock();
-}
Context::Context()
: initialized(),
report_mtx(MutexTypeReport),
nreported(),
- thread_registry([](Tid tid) -> ThreadContextBase* {
- return new (Alloc(sizeof(ThreadContext))) ThreadContext(tid);
- }),
+ thread_registry(CreateThreadContext, kMaxTid, kThreadQuarantineSize,
+ kMaxTidReuse),
racy_mtx(MutexTypeRacy),
racy_stacks(),
racy_addresses(),
fired_suppressions_mtx(MutexTypeFired),
- clock_alloc(LINKER_INITIALIZED, "clock allocator"),
- slot_mtx(MutexTypeSlots),
- multi_slot_mtx(MutexTypeMultiSlot),
- resetting() {
+ clock_alloc(LINKER_INITIALIZED, "clock allocator") {
fired_suppressions.reserve(8);
- for (uptr i = 0; i < ARRAY_SIZE(slots); i++) {
- TidSlot* slot = &slots[i];
- slot->sid = static_cast<Sid>(i);
- slot_queue.PushBack(slot);
- }
- global_epoch = 1;
}
-TidSlot::TidSlot() : mtx(MutexTypeSlot) {}
-
// The objects are allocated in TLS, so one may rely on zero-initialization.
-ThreadState::ThreadState(Tid tid)
- // Do not touch these, rely on zero initialization,
- // they may be accessed before the ctor.
- // ignore_reads_and_writes()
- // ignore_interceptors()
- : tid(tid) {
+ThreadState::ThreadState(Context *ctx, Tid tid, int unique_id, u64 epoch,
+ unsigned reuse_count, uptr stk_addr, uptr stk_size,
+ uptr tls_addr, uptr tls_size)
+ : fast_state(tid, epoch)
+ // Do not touch these, rely on zero initialization,
+ // they may be accessed before the ctor.
+ // , ignore_reads_and_writes()
+ // , ignore_interceptors()
+ ,
+ clock(tid, reuse_count)
+#if !SANITIZER_GO
+ ,
+ jmp_bufs()
+#endif
+ ,
+ tid(tid),
+ unique_id(unique_id),
+ stk_addr(stk_addr),
+ stk_size(stk_size),
+ tls_addr(tls_addr),
+ tls_size(tls_size)
+#if !SANITIZER_GO
+ ,
+ last_sleep_clock(tid)
+#endif
+{
CHECK_EQ(reinterpret_cast<uptr>(this) % SANITIZER_CACHE_LINE_SIZE, 0);
#if !SANITIZER_GO
// C/C++ uses fixed size shadow stack.
const int kInitStackSize = kShadowStackSize;
- shadow_stack = static_cast<uptr*>(
+ shadow_stack = static_cast<uptr *>(
MmapNoReserveOrDie(kInitStackSize * sizeof(uptr), "shadow stack"));
SetShadowRegionHugePageMode(reinterpret_cast<uptr>(shadow_stack),
kInitStackSize * sizeof(uptr));
#else
// Go uses malloc-allocated shadow stack with dynamic size.
const int kInitStackSize = 8;
- shadow_stack = static_cast<uptr*>(Alloc(kInitStackSize * sizeof(uptr)));
+ shadow_stack = static_cast<uptr *>(Alloc(kInitStackSize * sizeof(uptr)));
#endif
shadow_stack_pos = shadow_stack;
shadow_stack_end = shadow_stack + kInitStackSize;
@@ -516,8 +278,7 @@ void UnmapShadow(ThreadState *thr, uptr addr, uptr size) {
if (size == 0) return;
DontNeedShadowFor(addr, size);
ScopedGlobalProcessor sgp;
- SlotLocker locker(thr, true);
- ctx->metamap.ResetRange(thr->proc(), addr, size, true);
+ ctx->metamap.ResetRange(thr->proc(), addr, size);
}
#endif
@@ -563,6 +324,18 @@ void MapShadow(uptr addr, uptr size) {
addr + size, meta_begin, meta_end);
}
+void MapThreadTrace(uptr addr, uptr size, const char *name) {
+ DPrintf("#0: Mapping trace at 0x%zx-0x%zx(0x%zx)\n", addr, addr + size, size);
+ CHECK_GE(addr, TraceMemBeg());
+ CHECK_LE(addr + size, TraceMemEnd());
+ CHECK_EQ(addr, addr & ~((64 << 10) - 1)); // windows wants 64K alignment
+ if (!MmapFixedSuperNoReserve(addr, size, name)) {
+ Printf("FATAL: ThreadSanitizer can not mmap thread trace (0x%zx/0x%zx)\n",
+ addr, size);
+ Die();
+ }
+}
+
#if !SANITIZER_GO
static void OnStackUnwind(const SignalContext &sig, const void *,
BufferedStackTrace *stack) {
@@ -581,11 +354,8 @@ void CheckUnwind() {
// since we are going to die soon.
ScopedIgnoreInterceptors ignore;
#if !SANITIZER_GO
- ThreadState* thr = cur_thread();
- thr->nomalloc = false;
- thr->ignore_sync++;
- thr->ignore_reads_and_writes++;
- atomic_store_relaxed(&thr->in_signal_handler, 0);
+ cur_thread()->ignore_sync++;
+ cur_thread()->ignore_reads_and_writes++;
#endif
PrintCurrentStackSlow(StackTrace::GetCurrentPc());
}
@@ -640,22 +410,22 @@ void Initialize(ThreadState *thr) {
Symbolizer::GetOrInit()->AddHooks(EnterSymbolizer, ExitSymbolizer);
#endif
- VPrintf(1, "***** Running under ThreadSanitizer v3 (pid %d) *****\n",
+ VPrintf(1, "***** Running under ThreadSanitizer v2 (pid %d) *****\n",
(int)internal_getpid());
// Initialize thread 0.
- Tid tid = ThreadCreate(nullptr, 0, 0, true);
+ Tid tid = ThreadCreate(thr, 0, 0, true);
CHECK_EQ(tid, kMainTid);
ThreadStart(thr, tid, GetTid(), ThreadType::Regular);
#if TSAN_CONTAINS_UBSAN
__ubsan::InitAsPlugin();
#endif
+ ctx->initialized = true;
#if !SANITIZER_GO
Symbolizer::LateInitialize();
InitializeMemoryProfiler();
#endif
- ctx->initialized = true;
if (flags()->stop_on_start) {
Printf("ThreadSanitizer is suspended at startup (pid %d)."
@@ -681,6 +451,7 @@ void MaybeSpawnBackgroundThread() {
#endif
}
+
int Finalize(ThreadState *thr) {
bool failed = false;
@@ -688,12 +459,12 @@ int Finalize(ThreadState *thr) {
DumpProcessMap();
if (flags()->atexit_sleep_ms > 0 && ThreadCount(thr) > 1)
- internal_usleep(u64(flags()->atexit_sleep_ms) * 1000);
+ SleepForMillis(flags()->atexit_sleep_ms);
- {
- // Wait for pending reports.
- ScopedErrorReportLock lock;
- }
+ // Wait for pending reports.
+ ctx->report_mtx.Lock();
+ { ScopedErrorReportLock l; }
+ ctx->report_mtx.Unlock();
#if !SANITIZER_GO
if (Verbosity()) AllocatorPrintStats();
@@ -720,14 +491,8 @@ int Finalize(ThreadState *thr) {
#if !SANITIZER_GO
void ForkBefore(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS {
- GlobalProcessorLock();
- // Detaching from the slot makes OnUserFree skip writing to the shadow.
- // The slot will be locked so any attempts to use it will deadlock anyway.
- SlotDetach(thr);
- ctx->multi_slot_mtx.Lock();
- for (auto& slot : ctx->slots) slot.mtx.Lock();
ctx->thread_registry.Lock();
- ctx->slot_mtx.Lock();
+ ctx->report_mtx.Lock();
ScopedErrorReportLock::Lock();
AllocatorLock();
// Suppress all reports in the pthread_atfork callbacks.
@@ -747,30 +512,30 @@ void ForkBefore(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS {
__tsan_test_only_on_fork();
}
-static void ForkAfter(ThreadState* thr) NO_THREAD_SAFETY_ANALYSIS {
+void ForkParentAfter(ThreadState *thr, uptr pc) NO_THREAD_SAFETY_ANALYSIS {
thr->suppress_reports--; // Enabled in ForkBefore.
thr->ignore_interceptors--;
thr->ignore_reads_and_writes--;
AllocatorUnlock();
ScopedErrorReportLock::Unlock();
- ctx->slot_mtx.Unlock();
+ ctx->report_mtx.Unlock();
ctx->thread_registry.Unlock();
- for (auto& slot : ctx->slots) slot.mtx.Unlock();
- ctx->multi_slot_mtx.Unlock();
- SlotAttachAndLock(thr);
- SlotUnlock(thr);
- GlobalProcessorUnlock();
}
-void ForkParentAfter(ThreadState* thr, uptr pc) { ForkAfter(thr); }
+void ForkChildAfter(ThreadState *thr, uptr pc,
+ bool start_thread) NO_THREAD_SAFETY_ANALYSIS {
+ thr->suppress_reports--; // Enabled in ForkBefore.
+ thr->ignore_interceptors--;
+ thr->ignore_reads_and_writes--;
+ AllocatorUnlock();
+ ScopedErrorReportLock::Unlock();
+ ctx->report_mtx.Unlock();
+ ctx->thread_registry.Unlock();
-void ForkChildAfter(ThreadState* thr, uptr pc, bool start_thread) {
- ForkAfter(thr);
- u32 nthread = ThreadCount(thr);
- VPrintf(1,
- "ThreadSanitizer: forked new process with pid %d,"
- " parent had %d threads\n",
- (int)internal_getpid(), (int)nthread);
+ uptr nthread = 0;
+ ctx->thread_registry.GetNumberOfThreads(0, 0, &nthread /* alive threads */);
+ VPrintf(1, "ThreadSanitizer: forked new process with pid %d,"
+ " parent had %d threads\n", (int)internal_getpid(), (int)nthread);
if (nthread == 1) {
if (start_thread)
StartBackgroundThread();
@@ -780,7 +545,6 @@ void ForkChildAfter(ThreadState* thr, uptr pc, bool start_thread) {
// ignores for everything in the hope that we will exec soon.
ctx->after_multithreaded_fork = true;
thr->ignore_interceptors++;
- thr->suppress_reports++;
ThreadIgnoreBegin(thr, pc);
ThreadIgnoreSyncBegin(thr, pc);
}
@@ -802,10 +566,8 @@ void GrowShadowStack(ThreadState *thr) {
#endif
StackID CurrentStackId(ThreadState *thr, uptr pc) {
-#if !SANITIZER_GO
if (!thr->is_inited) // May happen during bootstrap.
return kInvalidStackID;
-#endif
if (pc != 0) {
#if !SANITIZER_GO
DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end);
@@ -823,72 +585,53 @@ StackID CurrentStackId(ThreadState *thr, uptr pc) {
return id;
}
-static bool TraceSkipGap(ThreadState* thr) {
+namespace v3 {
+
+NOINLINE
+void TraceSwitchPart(ThreadState *thr) {
Trace *trace = &thr->tctx->trace;
Event *pos = reinterpret_cast<Event *>(atomic_load_relaxed(&thr->trace_pos));
DCHECK_EQ(reinterpret_cast<uptr>(pos + 1) & TracePart::kAlignment, 0);
auto *part = trace->parts.Back();
- DPrintf("#%d: TraceSwitchPart enter trace=%p parts=%p-%p pos=%p\n", thr->tid,
- trace, trace->parts.Front(), part, pos);
- if (!part)
- return false;
- // We can get here when we still have space in the current trace part.
- // The fast-path check in TraceAcquire has false positives in the middle of
- // the part. Check if we are indeed at the end of the current part or not,
- // and fill any gaps with NopEvent's.
- Event* end = &part->events[TracePart::kSize];
- DCHECK_GE(pos, &part->events[0]);
- DCHECK_LE(pos, end);
- if (pos + 1 < end) {
- if ((reinterpret_cast<uptr>(pos) & TracePart::kAlignment) ==
- TracePart::kAlignment)
+ DPrintf("TraceSwitchPart part=%p pos=%p\n", part, pos);
+ if (part) {
+ // We can get here when we still have space in the current trace part.
+ // The fast-path check in TraceAcquire has false positives in the middle of
+ // the part. Check if we are indeed at the end of the current part or not,
+ // and fill any gaps with NopEvent's.
+ Event *end = &part->events[TracePart::kSize];
+ DCHECK_GE(pos, &part->events[0]);
+ DCHECK_LE(pos, end);
+ if (pos + 1 < end) {
+ if ((reinterpret_cast<uptr>(pos) & TracePart::kAlignment) ==
+ TracePart::kAlignment)
+ *pos++ = NopEvent;
*pos++ = NopEvent;
- *pos++ = NopEvent;
- DCHECK_LE(pos + 2, end);
- atomic_store_relaxed(&thr->trace_pos, reinterpret_cast<uptr>(pos));
- return true;
+ DCHECK_LE(pos + 2, end);
+ atomic_store_relaxed(&thr->trace_pos, reinterpret_cast<uptr>(pos));
+      // Ensure we set up the trace so that the next TraceAcquire
+      // won't detect trace part end.

+ Event *ev;
+ CHECK(TraceAcquire(thr, &ev));
+ return;
+ }
+ // We are indeed at the end.
+ for (; pos < end; pos++) *pos = NopEvent;
}
- // We are indeed at the end.
- for (; pos < end; pos++) *pos = NopEvent;
- return false;
-}
-
-NOINLINE
-void TraceSwitchPart(ThreadState* thr) {
- if (TraceSkipGap(thr))
- return;
#if !SANITIZER_GO
if (ctx->after_multithreaded_fork) {
// We just need to survive till exec.
- TracePart* part = thr->tctx->trace.parts.Back();
- if (part) {
- atomic_store_relaxed(&thr->trace_pos,
- reinterpret_cast<uptr>(&part->events[0]));
- return;
- }
+ CHECK(part);
+ atomic_store_relaxed(&thr->trace_pos,
+ reinterpret_cast<uptr>(&part->events[0]));
+ return;
}
#endif
- TraceSwitchPartImpl(thr);
-}
-
-void TraceSwitchPartImpl(ThreadState* thr) {
- SlotLocker locker(thr, true);
- Trace* trace = &thr->tctx->trace;
- TracePart* part = TracePartAlloc(thr);
+ part = new (MmapOrDie(sizeof(TracePart), "TracePart")) TracePart();
part->trace = trace;
thr->trace_prev_pc = 0;
- TracePart* recycle = nullptr;
- // Keep roughly half of parts local to the thread
- // (not queued into the recycle queue).
- uptr local_parts = (Trace::kMinParts + flags()->history_size + 1) / 2;
{
Lock lock(&trace->mtx);
- if (trace->parts.Empty())
- trace->local_head = part;
- if (trace->parts.Size() >= local_parts) {
- recycle = trace->local_head;
- trace->local_head = trace->parts.Next(recycle);
- }
trace->parts.PushBack(part);
atomic_store_relaxed(&thr->trace_pos,
reinterpret_cast<uptr>(&part->events[0]));
@@ -896,45 +639,60 @@ void TraceSwitchPartImpl(ThreadState* thr) {
// Make this part self-sufficient by restoring the current stack
// and mutex set in the beginning of the trace.
TraceTime(thr);
- {
- // Pathologically large stacks may not fit into the part.
-    // In these cases we log only a fixed number of top frames.
- const uptr kMaxFrames = 1000;
- // Sanity check that kMaxFrames won't consume the whole part.
- static_assert(kMaxFrames < TracePart::kSize / 2, "kMaxFrames is too big");
- uptr* pos = Max(&thr->shadow_stack[0], thr->shadow_stack_pos - kMaxFrames);
- for (; pos < thr->shadow_stack_pos; pos++) {
- if (TryTraceFunc(thr, *pos))
- continue;
- CHECK(TraceSkipGap(thr));
- CHECK(TryTraceFunc(thr, *pos));
- }
- }
+ for (uptr *pos = &thr->shadow_stack[0]; pos < thr->shadow_stack_pos; pos++)
+ CHECK(TryTraceFunc(thr, *pos));
for (uptr i = 0; i < thr->mset.Size(); i++) {
MutexSet::Desc d = thr->mset.Get(i);
- for (uptr i = 0; i < d.count; i++)
- TraceMutexLock(thr, d.write ? EventType::kLock : EventType::kRLock, 0,
- d.addr, d.stack_id);
- }
- {
- Lock lock(&ctx->slot_mtx);
- ctx->slot_queue.Remove(thr->slot);
- ctx->slot_queue.PushBack(thr->slot);
- if (recycle)
- ctx->trace_part_recycle.PushBack(recycle);
+ TraceMutexLock(thr, d.write ? EventType::kLock : EventType::kRLock, 0,
+ d.addr, d.stack_id);
}
- DPrintf("#%d: TraceSwitchPart exit parts=%p-%p pos=0x%zx\n", thr->tid,
- trace->parts.Front(), trace->parts.Back(),
- atomic_load_relaxed(&thr->trace_pos));
+}
+
+} // namespace v3
+
+void TraceSwitch(ThreadState *thr) {
+#if !SANITIZER_GO
+ if (ctx->after_multithreaded_fork)
+ return;
+#endif
+ thr->nomalloc++;
+ Trace *thr_trace = ThreadTrace(thr->tid);
+ Lock l(&thr_trace->mtx);
+ unsigned trace = (thr->fast_state.epoch() / kTracePartSize) % TraceParts();
+ TraceHeader *hdr = &thr_trace->headers[trace];
+ hdr->epoch0 = thr->fast_state.epoch();
+ ObtainCurrentStack(thr, 0, &hdr->stack0);
+ hdr->mset0 = thr->mset;
+ thr->nomalloc--;
+}
+
+Trace *ThreadTrace(Tid tid) { return (Trace *)GetThreadTraceHeader(tid); }
+
+uptr TraceTopPC(ThreadState *thr) {
+ Event *events = (Event*)GetThreadTrace(thr->tid);
+ uptr pc = events[thr->fast_state.GetTracePos()];
+ return pc;
+}
+
+uptr TraceSize() {
+ return (uptr)(1ull << (kTracePartSizeBits + flags()->history_size + 1));
+}
+
+uptr TraceParts() {
+ return TraceSize() / kTracePartSize;
}
#if !SANITIZER_GO
-extern "C" void __tsan_trace_switch() {}
+extern "C" void __tsan_trace_switch() {
+ TraceSwitch(cur_thread());
+}
-extern "C" void __tsan_report_race() {}
+extern "C" void __tsan_report_race() {
+ ReportRace(cur_thread());
+}
#endif
-void ThreadIgnoreBegin(ThreadState* thr, uptr pc) {
+void ThreadIgnoreBegin(ThreadState *thr, uptr pc) {
DPrintf("#%d: ThreadIgnoreBegin\n", thr->tid);
thr->ignore_reads_and_writes++;
CHECK_GT(thr->ignore_reads_and_writes, 0);
@@ -994,6 +752,7 @@ void build_consistency_debug() {}
#else
void build_consistency_release() {}
#endif
+
} // namespace __tsan
#if SANITIZER_CHECK_DEADLOCKS
@@ -1001,30 +760,21 @@ namespace __sanitizer {
using namespace __tsan;
MutexMeta mutex_meta[] = {
{MutexInvalid, "Invalid", {}},
- {MutexThreadRegistry,
- "ThreadRegistry",
- {MutexTypeSlots, MutexTypeTrace, MutexTypeReport}},
- {MutexTypeReport, "Report", {MutexTypeTrace}},
- {MutexTypeSyncVar, "SyncVar", {MutexTypeReport, MutexTypeTrace}},
+ {MutexThreadRegistry, "ThreadRegistry", {}},
+ {MutexTypeTrace, "Trace", {}},
+ {MutexTypeReport,
+ "Report",
+ {MutexTypeSyncVar, MutexTypeGlobalProc, MutexTypeTrace}},
+ {MutexTypeSyncVar, "SyncVar", {MutexTypeTrace}},
{MutexTypeAnnotations, "Annotations", {}},
- {MutexTypeAtExit, "AtExit", {}},
+ {MutexTypeAtExit, "AtExit", {MutexTypeSyncVar}},
{MutexTypeFired, "Fired", {MutexLeaf}},
{MutexTypeRacy, "Racy", {MutexLeaf}},
- {MutexTypeGlobalProc,
- "GlobalProc",
- {MutexTypeSlot, MutexTypeSlots, MutexTypeMultiSlot}},
+ {MutexTypeGlobalProc, "GlobalProc", {}},
{MutexTypeInternalAlloc, "InternalAlloc", {MutexLeaf}},
- {MutexTypeTrace, "Trace", {}},
- {MutexTypeSlot,
- "Slot",
- {MutexMulti, MutexTypeTrace, MutexTypeSyncVar, MutexThreadRegistry,
- MutexTypeSlots}},
- {MutexTypeSlots, "Slots", {MutexTypeTrace, MutexTypeReport}},
- {MutexTypeMultiSlot, "MultiSlot", {MutexTypeSlot, MutexTypeSlots}},
{},
};
void PrintMutexPC(uptr pc) { StackTrace(&pc, 1).Print(); }
-
} // namespace __sanitizer
#endif
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.h b/compiler-rt/lib/tsan/rtl/tsan_rtl.h
index 3175847a880a..c71b27e1cbf5 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.h
@@ -38,7 +38,6 @@
#include "tsan_defs.h"
#include "tsan_flags.h"
#include "tsan_ignoreset.h"
-#include "tsan_ilist.h"
#include "tsan_mman.h"
#include "tsan_mutexset.h"
#include "tsan_platform.h"
@@ -47,7 +46,6 @@
#include "tsan_stack_trace.h"
#include "tsan_sync.h"
#include "tsan_trace.h"
-#include "tsan_vector_clock.h"
#if SANITIZER_WORDSIZE != 64
# error "ThreadSanitizer is supported only on 64-bit platforms"
@@ -118,6 +116,7 @@ struct Processor {
#endif
DenseSlabAllocCache block_cache;
DenseSlabAllocCache sync_cache;
+ DenseSlabAllocCache clock_cache;
DDPhysicalThread *dd_pt;
};
@@ -131,56 +130,30 @@ struct ScopedGlobalProcessor {
};
#endif
-struct TidEpoch {
- Tid tid;
- Epoch epoch;
-};
-
-struct TidSlot {
- Mutex mtx;
- Sid sid;
- atomic_uint32_t raw_epoch;
- ThreadState *thr;
- Vector<TidEpoch> journal;
- INode node;
-
- Epoch epoch() const {
- return static_cast<Epoch>(atomic_load(&raw_epoch, memory_order_relaxed));
- }
-
- void SetEpoch(Epoch v) {
- atomic_store(&raw_epoch, static_cast<u32>(v), memory_order_relaxed);
- }
-
- TidSlot();
-} ALIGNED(SANITIZER_CACHE_LINE_SIZE);
-
// This struct is stored in TLS.
struct ThreadState {
FastState fast_state;
- int ignore_sync;
-#if !SANITIZER_GO
- int ignore_interceptors;
-#endif
- uptr *shadow_stack_pos;
-
- // Current position in tctx->trace.Back()->events (Event*).
- atomic_uintptr_t trace_pos;
- // PC of the last memory access, used to compute PC deltas in the trace.
- uptr trace_prev_pc;
-
+  // Synch epoch represents the thread's epoch before the last synchronization
+  // action. It allows us to reduce the number of shadow state updates.
+  // For example, fast_synch_epoch=100, last write to addr X was at epoch=150,
+  // if we are processing write to X from the same thread at epoch=200,
+  // we do nothing, because both writes happen in the same 'synch epoch'.
+  // That is, if another memory access does not race with the former write,
+  // it does not race with the latter as well.
+  // QUESTION: can we squeeze this into ThreadState::Fast?
+  // E.g. ThreadState::Fast is 44 bits: 32 are taken by synch_epoch and 12 are
+  // taken by epoch between synchs.
+  // This way we can save one load from tls.
+  u64 fast_synch_epoch;
// Technically `current` should be a separate THREADLOCAL variable;
// but it is placed here in order to share cache line with previous fields.
ThreadState* current;
-
- atomic_sint32_t pending_signals;
-
- VectorClock clock;
-
// This is a slow path flag. On fast path, fast_state.GetIgnoreBit() is read.
   // We do not distinguish between ignoring reads and writes
// for better performance.
int ignore_reads_and_writes;
+ atomic_sint32_t pending_signals;
+ int ignore_sync;
int suppress_reports;
// Go does not support ignores.
#if !SANITIZER_GO
@@ -189,27 +162,31 @@ struct ThreadState {
#endif
uptr *shadow_stack;
uptr *shadow_stack_end;
+ uptr *shadow_stack_pos;
+ RawShadow *racy_shadow_addr;
+ RawShadow racy_state[2];
+ MutexSet mset;
+ ThreadClock clock;
#if !SANITIZER_GO
Vector<JmpBuf> jmp_bufs;
- int in_symbolizer;
+ int ignore_interceptors;
+#endif
+ const Tid tid;
+ const int unique_id;
+ bool in_symbolizer;
bool in_ignored_lib;
bool is_inited;
-#endif
- MutexSet mset;
bool is_dead;
- const Tid tid;
- uptr stk_addr;
- uptr stk_size;
- uptr tls_addr;
- uptr tls_size;
+ bool is_freeing;
+ bool is_vptr_access;
+ const uptr stk_addr;
+ const uptr stk_size;
+ const uptr tls_addr;
+ const uptr tls_size;
ThreadContext *tctx;
DDLogicalThread *dd_lt;
- TidSlot *slot;
- uptr slot_epoch;
- bool slot_locked;
-
// Current wired Processor, or nullptr. Required to handle any events.
Processor *proc1;
#if !SANITIZER_GO
@@ -223,7 +200,7 @@ struct ThreadState {
#if !SANITIZER_GO
StackID last_sleep_stack_id;
- VectorClock last_sleep_clock;
+ ThreadClock last_sleep_clock;
#endif
// Set in regions of runtime that must be signal-safe and fork-safe.
@@ -232,7 +209,16 @@ struct ThreadState {
const ReportDesc *current_report;
- explicit ThreadState(Tid tid);
+ // Current position in tctx->trace.Back()->events (Event*).
+ atomic_uintptr_t trace_pos;
+ // PC of the last memory access, used to compute PC deltas in the trace.
+ uptr trace_prev_pc;
+ Sid sid;
+ Epoch epoch;
+
+ explicit ThreadState(Context *ctx, Tid tid, int unique_id, u64 epoch,
+ unsigned reuse_count, uptr stk_addr, uptr stk_size,
+ uptr tls_addr, uptr tls_size);
} ALIGNED(SANITIZER_CACHE_LINE_SIZE);
#if !SANITIZER_GO
@@ -266,9 +252,14 @@ class ThreadContext final : public ThreadContextBase {
~ThreadContext();
ThreadState *thr;
StackID creation_stack_id;
- VectorClock *sync;
- uptr sync_epoch;
- Trace trace;
+ SyncClock sync;
+ // Epoch at which the thread had started.
+ // If we see an event from the thread stamped by an older epoch,
+ // the event is from a dead thread that shared tid with this thread.
+ u64 epoch0;
+ u64 epoch1;
+
+ v3::Trace trace;
// Override superclass callbacks.
void OnDead() override;
@@ -328,21 +319,7 @@ struct Context {
Flags flags;
fd_t memprof_fd;
- // The last slot index (kFreeSid) is used to denote freed memory.
- TidSlot slots[kThreadSlotCount - 1];
-
- // Protects global_epoch, slot_queue, trace_part_recycle.
Mutex slot_mtx;
- // Prevents lock order inversions when we lock more than 1 slot.
- Mutex multi_slot_mtx;
- uptr global_epoch; // guarded by slot_mtx and by all slot mutexes
- bool resetting; // global reset is in progress
- IList<TidSlot, &TidSlot::node> slot_queue GUARDED_BY(slot_mtx);
- IList<TraceHeader, &TraceHeader::global, TracePart> trace_part_recycle
- GUARDED_BY(slot_mtx);
- uptr trace_part_total_allocated GUARDED_BY(slot_mtx);
- uptr trace_part_recycle_finished GUARDED_BY(slot_mtx);
- uptr trace_part_finished_excess GUARDED_BY(slot_mtx);
};
extern Context *ctx; // The one and the only global runtime context.
@@ -371,13 +348,14 @@ uptr TagFromShadowStackFrame(uptr pc);
class ScopedReportBase {
public:
- void AddMemoryAccess(uptr addr, uptr external_tag, Shadow s, Tid tid,
- StackTrace stack, const MutexSet *mset);
+ void AddMemoryAccess(uptr addr, uptr external_tag, Shadow s, StackTrace stack,
+ const MutexSet *mset);
void AddStack(StackTrace stack, bool suppressable = false);
void AddThread(const ThreadContext *tctx, bool suppressable = false);
- void AddThread(Tid tid, bool suppressable = false);
+ void AddThread(Tid unique_tid, bool suppressable = false);
void AddUniqueTid(Tid unique_tid);
- int AddMutex(uptr addr, StackID creation_stack_id);
+ void AddMutex(const SyncVar *s);
+ u64 AddMutex(u64 id);
void AddLocation(uptr addr, uptr size);
void AddSleep(StackID stack_id);
void SetCount(int count);
@@ -394,6 +372,8 @@ class ScopedReportBase {
// at best it will cause deadlocks on internal mutexes.
ScopedIgnoreInterceptors ignore_interceptors_;
+ void AddDeadMutex(u64 id);
+
ScopedReportBase(const ScopedReportBase &) = delete;
void operator=(const ScopedReportBase &) = delete;
};
@@ -409,6 +389,8 @@ class ScopedReport : public ScopedReportBase {
bool ShouldReport(ThreadState *thr, ReportType typ);
ThreadContext *IsThreadStackOrTls(uptr addr, bool *is_stack);
+void RestoreStack(Tid tid, const u64 epoch, VarSizeStackTrace *stk,
+ MutexSet *mset, uptr *tag = nullptr);
// The stack could look like:
// <start> | <main> | <foo> | tag | <bar>
@@ -456,8 +438,7 @@ void ForkBefore(ThreadState *thr, uptr pc);
void ForkParentAfter(ThreadState *thr, uptr pc);
void ForkChildAfter(ThreadState *thr, uptr pc, bool start_thread);
-void ReportRace(ThreadState *thr, RawShadow *shadow_mem, Shadow cur, Shadow old,
- AccessType typ);
+void ReportRace(ThreadState *thr);
bool OutputReport(ThreadState *thr, const ScopedReport &srep);
bool IsFiredSuppression(Context *ctx, ReportType type, StackTrace trace);
bool IsExpectedReport(uptr addr, uptr size);
@@ -487,28 +468,55 @@ int Finalize(ThreadState *thr);
void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write);
void OnUserFree(ThreadState *thr, uptr pc, uptr p, bool write);
-void MemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size,
- AccessType typ);
+void MemoryAccess(ThreadState *thr, uptr pc, uptr addr,
+ int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic);
+void MemoryAccessImpl(ThreadState *thr, uptr addr,
+ int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic,
+ u64 *shadow_mem, Shadow cur);
+void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr,
+ uptr size, bool is_write);
void UnalignedMemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size,
AccessType typ);
-// This creates 2 non-inlined specialized versions of MemoryAccessRange.
-template <bool is_read>
-void MemoryAccessRangeT(ThreadState *thr, uptr pc, uptr addr, uptr size);
+
+const int kSizeLog1 = 0;
+const int kSizeLog2 = 1;
+const int kSizeLog4 = 2;
+const int kSizeLog8 = 3;
ALWAYS_INLINE
-void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size,
- bool is_write) {
- if (size == 0)
- return;
- if (is_write)
- MemoryAccessRangeT<false>(thr, pc, addr, size);
- else
- MemoryAccessRangeT<true>(thr, pc, addr, size);
+void MemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size,
+ AccessType typ) {
+ int size_log;
+ switch (size) {
+ case 1:
+ size_log = kSizeLog1;
+ break;
+ case 2:
+ size_log = kSizeLog2;
+ break;
+ case 4:
+ size_log = kSizeLog4;
+ break;
+ default:
+ DCHECK_EQ(size, 8);
+ size_log = kSizeLog8;
+ break;
+ }
+ bool is_write = !(typ & kAccessRead);
+ bool is_atomic = typ & kAccessAtomic;
+ if (typ & kAccessVptr)
+ thr->is_vptr_access = true;
+ if (typ & kAccessFree)
+ thr->is_freeing = true;
+ MemoryAccess(thr, pc, addr, size_log, is_write, is_atomic);
+ if (typ & kAccessVptr)
+ thr->is_vptr_access = false;
+ if (typ & kAccessFree)
+ thr->is_freeing = false;
}
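The inline shim above translates the newer flag-based MemoryAccess signature into the older size-log/boolean one. The standalone sketch below restates that mapping with local stand-in constants (the enum values here are illustrative, not the runtime's definitions), just to make the size-to-size-log and flag-to-boolean conversion explicit.

#include <cassert>

// Local stand-ins for the runtime's constants; values are illustrative only.
const int kSizeLog1 = 0, kSizeLog2 = 1, kSizeLog4 = 2, kSizeLog8 = 3;
const int kAccessRead = 1 << 0, kAccessAtomic = 1 << 1;

// Mirrors the switch in the shim: sizes 1/2/4 map directly; everything else
// is expected to be 8 (the shim DCHECKs that).
static int SizeLog(unsigned size) {
  switch (size) {
    case 1: return kSizeLog1;
    case 2: return kSizeLog2;
    case 4: return kSizeLog4;
    default: return kSizeLog8;
  }
}

int main() {
  // A plain 4-byte write: size_log = 2, is_write = true, is_atomic = false.
  int typ = 0;
  assert(SizeLog(4) == kSizeLog4 && !(typ & kAccessRead) && !(typ & kAccessAtomic));
  // An 8-byte atomic read: size_log = 3, is_write = false, is_atomic = true.
  typ = kAccessRead | kAccessAtomic;
  assert(SizeLog(8) == kSizeLog8 && (typ & kAccessRead) && (typ & kAccessAtomic));
  return 0;
}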
-void ShadowSet(RawShadow *p, RawShadow *end, RawShadow v);
-void MemoryRangeFreed(ThreadState *thr, uptr pc, uptr addr, uptr size);
void MemoryResetRange(ThreadState *thr, uptr pc, uptr addr, uptr size);
+void MemoryRangeFreed(ThreadState *thr, uptr pc, uptr addr, uptr size);
void MemoryRangeImitateWrite(ThreadState *thr, uptr pc, uptr addr, uptr size);
void MemoryRangeImitateWriteOrResetRange(ThreadState *thr, uptr pc, uptr addr,
uptr size);
@@ -518,6 +526,9 @@ void ThreadIgnoreEnd(ThreadState *thr);
void ThreadIgnoreSyncBegin(ThreadState *thr, uptr pc);
void ThreadIgnoreSyncEnd(ThreadState *thr);
+void FuncEntry(ThreadState *thr, uptr pc);
+void FuncExit(ThreadState *thr);
+
Tid ThreadCreate(ThreadState *thr, uptr pc, uptr uid, bool detached);
void ThreadStart(ThreadState *thr, Tid tid, tid_t os_id,
ThreadType thread_type);
@@ -563,7 +574,11 @@ void Release(ThreadState *thr, uptr pc, uptr addr);
void ReleaseStoreAcquire(ThreadState *thr, uptr pc, uptr addr);
void ReleaseStore(ThreadState *thr, uptr pc, uptr addr);
void AfterSleep(ThreadState *thr, uptr pc);
-void IncrementEpoch(ThreadState *thr);
+void AcquireImpl(ThreadState *thr, uptr pc, SyncClock *c);
+void ReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c);
+void ReleaseStoreAcquireImpl(ThreadState *thr, uptr pc, SyncClock *c);
+void ReleaseStoreImpl(ThreadState *thr, uptr pc, SyncClock *c);
+void AcquireReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c);
 // The hacky call uses a custom calling convention and an assembly thunk.
 // It is considerably faster than a normal call for the caller
@@ -586,19 +601,43 @@ void IncrementEpoch(ThreadState *thr);
#define HACKY_CALL(f) f()
#endif
+void TraceSwitch(ThreadState *thr);
+uptr TraceTopPC(ThreadState *thr);
+uptr TraceSize();
+uptr TraceParts();
+Trace *ThreadTrace(Tid tid);
+
+extern "C" void __tsan_trace_switch();
+void ALWAYS_INLINE TraceAddEvent(ThreadState *thr, FastState fs,
+ EventType typ, u64 addr) {
+ if (!kCollectHistory)
+ return;
+ // TraceSwitch accesses shadow_stack, but it's called infrequently,
+ // so we check it here proactively.
+ DCHECK(thr->shadow_stack);
+ DCHECK_GE((int)typ, 0);
+ DCHECK_LE((int)typ, 7);
+ DCHECK_EQ(GetLsb(addr, kEventPCBits), addr);
+ u64 pos = fs.GetTracePos();
+ if (UNLIKELY((pos % kTracePartSize) == 0)) {
+#if !SANITIZER_GO
+ HACKY_CALL(__tsan_trace_switch);
+#else
+ TraceSwitch(thr);
+#endif
+ }
+ Event *trace = (Event*)GetThreadTrace(fs.tid());
+ Event *evp = &trace[pos];
+ Event ev = (u64)addr | ((u64)typ << kEventPCBits);
+ *evp = ev;
+}
+
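TraceAddEvent above packs the event type into the top bits of a 64-bit word and the PC (or address) into the low kEventPCBits bits. The sketch below shows that packing and the corresponding unpacking; the 61/3 split is inferred from the DCHECK_LE((int)typ, 7) above, and the constant here is a local stand-in rather than the runtime's definition.

#include <cassert>
#include <cstdint>

const int kEventPCBits = 61;  // assumed: 64-bit word minus 3 type bits

static uint64_t EncodeEvent(uint64_t typ, uint64_t addr) {
  // Same shape as "Event ev = (u64)addr | ((u64)typ << kEventPCBits)" above.
  return addr | (typ << kEventPCBits);
}

int main() {
  const uint64_t typ = 5;                   // some event type in [0, 7]
  const uint64_t addr = 0x7f1234567890ull;  // fits in the low kEventPCBits bits
  uint64_t ev = EncodeEvent(typ, addr);
  // Decoding simply reverses the packing.
  assert(ev >> kEventPCBits == typ);
  assert((ev & ((1ull << kEventPCBits) - 1)) == addr);
  return 0;
}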
#if !SANITIZER_GO
uptr ALWAYS_INLINE HeapEnd() {
return HeapMemEnd() + PrimaryAllocator::AdditionalSize();
}
#endif
-void SlotAttachAndLock(ThreadState *thr) ACQUIRE(thr->slot->mtx);
-void SlotDetach(ThreadState *thr);
-void SlotLock(ThreadState *thr) ACQUIRE(thr->slot->mtx);
-void SlotUnlock(ThreadState *thr) RELEASE(thr->slot->mtx);
-void DoReset(ThreadState *thr, uptr epoch);
-void FlushShadowMemory();
-
ThreadState *FiberCreate(ThreadState *thr, uptr pc, unsigned flags);
void FiberDestroy(ThreadState *thr, uptr pc, ThreadState *fiber);
void FiberSwitch(ThreadState *thr, uptr pc, ThreadState *fiber, unsigned flags);
@@ -609,53 +648,6 @@ enum FiberSwitchFlags {
FiberSwitchFlagNoSync = 1 << 0, // __tsan_switch_to_fiber_no_sync
};
-class SlotPairLocker {
- public:
- SlotPairLocker(ThreadState *thr, Sid sid);
- ~SlotPairLocker();
-
- private:
- ThreadState *thr_;
- TidSlot *slot_;
-};
-
-class SlotLocker {
- public:
- ALWAYS_INLINE
- SlotLocker(ThreadState *thr, bool recursive = false)
- : thr_(thr), locked_(recursive ? thr->slot_locked : false) {
- if (!locked_)
- SlotLock(thr_);
- }
-
- ALWAYS_INLINE
- ~SlotLocker() {
- if (!locked_)
- SlotUnlock(thr_);
- }
-
- private:
- ThreadState *thr_;
- bool locked_;
-};
-
-class SlotUnlocker {
- public:
- SlotUnlocker(ThreadState *thr) : thr_(thr), locked_(thr->slot_locked) {
- if (locked_)
- SlotUnlock(thr_);
- }
-
- ~SlotUnlocker() {
- if (locked_)
- SlotLock(thr_);
- }
-
- private:
- ThreadState *thr_;
- bool locked_;
-};
-
ALWAYS_INLINE void ProcessPendingSignals(ThreadState *thr) {
if (UNLIKELY(atomic_load_relaxed(&thr->pending_signals)))
ProcessPendingSignalsImpl(thr);
@@ -674,19 +666,16 @@ void LazyInitialize(ThreadState *thr) {
#endif
}
-void TraceResetForTesting();
+namespace v3 {
+
void TraceSwitchPart(ThreadState *thr);
-void TraceSwitchPartImpl(ThreadState *thr);
-bool RestoreStack(EventType type, Sid sid, Epoch epoch, uptr addr, uptr size,
- AccessType typ, Tid *ptid, VarSizeStackTrace *pstk,
+bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr,
+ uptr size, AccessType typ, VarSizeStackTrace *pstk,
MutexSet *pmset, uptr *ptag);
template <typename EventT>
ALWAYS_INLINE WARN_UNUSED_RESULT bool TraceAcquire(ThreadState *thr,
EventT **ev) {
- // TraceSwitchPart accesses shadow_stack, but it's called infrequently,
- // so we check it here proactively.
- DCHECK(thr->shadow_stack);
Event *pos = reinterpret_cast<Event *>(atomic_load_relaxed(&thr->trace_pos));
#if SANITIZER_DEBUG
// TraceSwitch acquires these mutexes,
@@ -757,16 +746,20 @@ void TraceMutexLock(ThreadState *thr, EventType type, uptr pc, uptr addr,
void TraceMutexUnlock(ThreadState *thr, uptr addr);
void TraceTime(ThreadState *thr);
-void TraceRestartFuncExit(ThreadState *thr);
-void TraceRestartFuncEntry(ThreadState *thr, uptr pc);
+} // namespace v3
void GrowShadowStack(ThreadState *thr);
ALWAYS_INLINE
void FuncEntry(ThreadState *thr, uptr pc) {
- DPrintf2("#%d: FuncEntry %p\n", (int)thr->fast_state.sid(), (void *)pc);
- if (UNLIKELY(!TryTraceFunc(thr, pc)))
- return TraceRestartFuncEntry(thr, pc);
+ DPrintf2("#%d: FuncEntry %p\n", (int)thr->fast_state.tid(), (void *)pc);
+ if (kCollectHistory) {
+ thr->fast_state.IncrementEpoch();
+ TraceAddEvent(thr, thr->fast_state, EventTypeFuncEnter, pc);
+ }
+
+ // Shadow stack maintenance can be replaced with
+  // stack unwinding during trace switch (which should presumably be faster).
DCHECK_GE(thr->shadow_stack_pos, thr->shadow_stack);
#if !SANITIZER_GO
DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end);
@@ -780,9 +773,12 @@ void FuncEntry(ThreadState *thr, uptr pc) {
ALWAYS_INLINE
void FuncExit(ThreadState *thr) {
- DPrintf2("#%d: FuncExit\n", (int)thr->fast_state.sid());
- if (UNLIKELY(!TryTraceFunc(thr, 0)))
- return TraceRestartFuncExit(thr);
+ DPrintf2("#%d: FuncExit\n", (int)thr->fast_state.tid());
+ if (kCollectHistory) {
+ thr->fast_state.IncrementEpoch();
+ TraceAddEvent(thr, thr->fast_state, EventTypeFuncExit, 0);
+ }
+
DCHECK_GT(thr->shadow_stack_pos, thr->shadow_stack);
#if !SANITIZER_GO
DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end);
@@ -794,6 +790,7 @@ void FuncExit(ThreadState *thr) {
extern void (*on_initialize)(void);
extern int (*on_finalize)(int);
#endif
+
} // namespace __tsan
#endif // TSAN_RTL_H
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_access.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl_access.cpp
index 76e269e2ed2a..7365fdaa3038 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl_access.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_access.cpp
@@ -15,13 +15,15 @@
namespace __tsan {
-ALWAYS_INLINE USED bool TryTraceMemoryAccess(ThreadState* thr, uptr pc,
+namespace v3 {
+
+ALWAYS_INLINE USED bool TryTraceMemoryAccess(ThreadState *thr, uptr pc,
uptr addr, uptr size,
AccessType typ) {
DCHECK(size == 1 || size == 2 || size == 4 || size == 8);
if (!kCollectHistory)
return true;
- EventAccess* ev;
+ EventAccess *ev;
if (UNLIKELY(!TraceAcquire(thr, &ev)))
return false;
u64 size_log = size == 1 ? 0 : size == 2 ? 1 : size == 4 ? 2 : 3;
@@ -38,27 +40,25 @@ ALWAYS_INLINE USED bool TryTraceMemoryAccess(ThreadState* thr, uptr pc,
TraceRelease(thr, ev);
return true;
}
- auto* evex = reinterpret_cast<EventAccessExt*>(ev);
+ auto *evex = reinterpret_cast<EventAccessExt *>(ev);
evex->is_access = 0;
evex->is_func = 0;
evex->type = EventType::kAccessExt;
evex->is_read = !!(typ & kAccessRead);
evex->is_atomic = !!(typ & kAccessAtomic);
evex->size_log = size_log;
- // Note: this is important, see comment in EventAccessExt.
- evex->_ = 0;
evex->addr = CompressAddr(addr);
evex->pc = pc;
TraceRelease(thr, evex);
return true;
}
-ALWAYS_INLINE
-bool TryTraceMemoryAccessRange(ThreadState* thr, uptr pc, uptr addr, uptr size,
- AccessType typ) {
+ALWAYS_INLINE USED bool TryTraceMemoryAccessRange(ThreadState *thr, uptr pc,
+ uptr addr, uptr size,
+ AccessType typ) {
if (!kCollectHistory)
return true;
- EventAccessRange* ev;
+ EventAccessRange *ev;
if (UNLIKELY(!TraceAcquire(thr, &ev)))
return false;
thr->trace_prev_pc = pc;
@@ -75,7 +75,7 @@ bool TryTraceMemoryAccessRange(ThreadState* thr, uptr pc, uptr addr, uptr size,
return true;
}
-void TraceMemoryAccessRange(ThreadState* thr, uptr pc, uptr addr, uptr size,
+void TraceMemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size,
AccessType typ) {
if (LIKELY(TryTraceMemoryAccessRange(thr, pc, addr, size, typ)))
return;
@@ -84,7 +84,7 @@ void TraceMemoryAccessRange(ThreadState* thr, uptr pc, uptr addr, uptr size,
DCHECK(res);
}
-void TraceFunc(ThreadState* thr, uptr pc) {
+void TraceFunc(ThreadState *thr, uptr pc) {
if (LIKELY(TryTraceFunc(thr, pc)))
return;
TraceSwitchPart(thr);
@@ -92,17 +92,7 @@ void TraceFunc(ThreadState* thr, uptr pc) {
DCHECK(res);
}
-NOINLINE void TraceRestartFuncEntry(ThreadState* thr, uptr pc) {
- TraceSwitchPart(thr);
- FuncEntry(thr, pc);
-}
-
-NOINLINE void TraceRestartFuncExit(ThreadState* thr) {
- TraceSwitchPart(thr);
- FuncExit(thr);
-}
-
-void TraceMutexLock(ThreadState* thr, EventType type, uptr pc, uptr addr,
+void TraceMutexLock(ThreadState *thr, EventType type, uptr pc, uptr addr,
StackID stk) {
DCHECK(type == EventType::kLock || type == EventType::kRLock);
if (!kCollectHistory)
@@ -119,7 +109,7 @@ void TraceMutexLock(ThreadState* thr, EventType type, uptr pc, uptr addr,
TraceEvent(thr, ev);
}
-void TraceMutexUnlock(ThreadState* thr, uptr addr) {
+void TraceMutexUnlock(ThreadState *thr, uptr addr) {
if (!kCollectHistory)
return;
EventUnlock ev;
@@ -131,485 +121,396 @@ void TraceMutexUnlock(ThreadState* thr, uptr addr) {
TraceEvent(thr, ev);
}
-void TraceTime(ThreadState* thr) {
+void TraceTime(ThreadState *thr) {
if (!kCollectHistory)
return;
- FastState fast_state = thr->fast_state;
EventTime ev;
ev.is_access = 0;
ev.is_func = 0;
ev.type = EventType::kTime;
- ev.sid = static_cast<u64>(fast_state.sid());
- ev.epoch = static_cast<u64>(fast_state.epoch());
+ ev.sid = static_cast<u64>(thr->sid);
+ ev.epoch = static_cast<u64>(thr->epoch);
ev._ = 0;
TraceEvent(thr, ev);
}
-ALWAYS_INLINE RawShadow LoadShadow(RawShadow* p) {
- return static_cast<RawShadow>(
- atomic_load((atomic_uint32_t*)p, memory_order_relaxed));
-}
+} // namespace v3
-ALWAYS_INLINE void StoreShadow(RawShadow* sp, RawShadow s) {
- atomic_store((atomic_uint32_t*)sp, static_cast<u32>(s), memory_order_relaxed);
+ALWAYS_INLINE
+Shadow LoadShadow(u64 *p) {
+ u64 raw = atomic_load((atomic_uint64_t *)p, memory_order_relaxed);
+ return Shadow(raw);
}
-NOINLINE void DoReportRace(ThreadState* thr, RawShadow* shadow_mem, Shadow cur,
- Shadow old,
- AccessType typ) NO_THREAD_SAFETY_ANALYSIS {
- // For the free shadow markers the first element (that contains kFreeSid)
- // triggers the race, but the second element contains info about the freeing
- // thread, take it.
- if (old.sid() == kFreeSid)
- old = Shadow(LoadShadow(&shadow_mem[1]));
- // This prevents trapping on this address in future.
- for (uptr i = 0; i < kShadowCnt; i++)
- StoreShadow(&shadow_mem[i], i == 0 ? Shadow::kRodata : Shadow::kEmpty);
- // See the comment in MemoryRangeFreed as to why the slot is locked
- // for free memory accesses. ReportRace must not be called with
- // the slot locked because of the fork. But MemoryRangeFreed is not
- // called during fork because fork sets ignore_reads_and_writes,
- // so simply unlocking the slot should be fine.
- if (typ & kAccessFree)
- SlotUnlock(thr);
- ReportRace(thr, shadow_mem, cur, Shadow(old), typ);
- if (typ & kAccessFree)
- SlotLock(thr);
+ALWAYS_INLINE
+void StoreShadow(u64 *sp, u64 s) {
+ atomic_store((atomic_uint64_t *)sp, s, memory_order_relaxed);
}
-#if !TSAN_VECTORIZE
ALWAYS_INLINE
-bool ContainsSameAccess(RawShadow* s, Shadow cur, int unused0, int unused1,
- AccessType typ) {
- for (uptr i = 0; i < kShadowCnt; i++) {
- auto old = LoadShadow(&s[i]);
- if (!(typ & kAccessRead)) {
- if (old == cur.raw())
- return true;
- continue;
- }
- auto masked = static_cast<RawShadow>(static_cast<u32>(old) |
- static_cast<u32>(Shadow::kRodata));
- if (masked == cur.raw())
- return true;
- if (!(typ & kAccessNoRodata) && !SANITIZER_GO) {
- if (old == Shadow::kRodata)
- return true;
- }
- }
- return false;
+void StoreIfNotYetStored(u64 *sp, u64 *s) {
+ StoreShadow(sp, *s);
+ *s = 0;
}
+extern "C" void __tsan_report_race();
+
ALWAYS_INLINE
-bool CheckRaces(ThreadState* thr, RawShadow* shadow_mem, Shadow cur,
- int unused0, int unused1, AccessType typ) {
- bool stored = false;
- for (uptr idx = 0; idx < kShadowCnt; idx++) {
- RawShadow* sp = &shadow_mem[idx];
- Shadow old(LoadShadow(sp));
- if (LIKELY(old.raw() == Shadow::kEmpty)) {
- if (!(typ & kAccessCheckOnly) && !stored)
- StoreShadow(sp, cur.raw());
- return false;
- }
- if (LIKELY(!(cur.access() & old.access())))
- continue;
- if (LIKELY(cur.sid() == old.sid())) {
- if (!(typ & kAccessCheckOnly) &&
- LIKELY(cur.access() == old.access() && old.IsRWWeakerOrEqual(typ))) {
- StoreShadow(sp, cur.raw());
- stored = true;
- }
- continue;
- }
- if (LIKELY(old.IsBothReadsOrAtomic(typ)))
- continue;
- if (LIKELY(thr->clock.Get(old.sid()) >= old.epoch()))
- continue;
- DoReportRace(thr, shadow_mem, cur, old, typ);
- return true;
- }
- // We did not find any races and had already stored
- // the current access info, so we are done.
- if (LIKELY(stored))
- return false;
- // Choose a random candidate slot and replace it.
- uptr index =
- atomic_load_relaxed(&thr->trace_pos) / sizeof(Event) % kShadowCnt;
- StoreShadow(&shadow_mem[index], cur.raw());
- return false;
+void HandleRace(ThreadState *thr, u64 *shadow_mem, Shadow cur, Shadow old) {
+ thr->racy_state[0] = cur.raw();
+ thr->racy_state[1] = old.raw();
+ thr->racy_shadow_addr = shadow_mem;
+#if !SANITIZER_GO
+ HACKY_CALL(__tsan_report_race);
+#else
+ ReportRace(thr);
+#endif
}
-# define LOAD_CURRENT_SHADOW(cur, shadow_mem) UNUSED int access = 0, shadow = 0
-
-#else /* !TSAN_VECTORIZE */
+static inline bool HappensBefore(Shadow old, ThreadState *thr) {
+ return thr->clock.get(old.TidWithIgnore()) >= old.epoch();
+}
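HappensBefore above says an old shadow access is ordered before the current one exactly when the current thread's vector-clock entry for the old access's thread has reached that access's epoch. A tiny numeric sketch with made-up values:

#include <cassert>

int main() {
  // Hypothetical vector clock of the current thread, indexed by tid.
  unsigned clock[8] = {0};
  clock[5] = 120;  // we last synchronized with tid 5 at its epoch 120

  // Old shadow value written by tid 5 at epoch 100: the clock has already
  // reached 100, so the old write happens-before the current access.
  assert(clock[5] >= 100);

  // Had the old write been at epoch 150, the check would fail and the two
  // accesses would be treated as concurrent (a potential race).
  assert(!(clock[5] >= 150));
  return 0;
}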
ALWAYS_INLINE
-bool ContainsSameAccess(RawShadow* unused0, Shadow unused1, m128 shadow,
- m128 access, AccessType typ) {
- // Note: we could check if there is a larger access of the same type,
- // e.g. we just allocated/memset-ed a block (so it contains 8 byte writes)
-  // and now do smaller reads/writes; these can also be considered as "same
-  // access". However, it would make the check more expensive, so it's unclear
-  // if it's worth it. But this would conserve trace space, so it's useful
-  // besides the potential speedup.
- if (!(typ & kAccessRead)) {
- const m128 same = _mm_cmpeq_epi32(shadow, access);
- return _mm_movemask_epi8(same);
+void MemoryAccessImpl1(ThreadState *thr, uptr addr, int kAccessSizeLog,
+ bool kAccessIsWrite, bool kIsAtomic, u64 *shadow_mem,
+ Shadow cur) {
+ // This potentially can live in an MMX/SSE scratch register.
+ // The required intrinsics are:
+ // __m128i _mm_move_epi64(__m128i*);
+ // _mm_storel_epi64(u64*, __m128i);
+ u64 store_word = cur.raw();
+ bool stored = false;
+
+  // Scan all the shadow values and dispatch to 4 categories:
+  // same, replace, candidate and race (see comments below).
+  // We consider only 3 cases regarding access sizes:
+  // equal, intersect and not intersect. Initially I considered
+  // larger and smaller as well; it allowed replacing some
+  // 'candidates' with 'same' or 'replace', but I think
+  // it's just not worth it (performance- and complexity-wise).
+
+ Shadow old(0);
+
+  // In release mode we manually unroll the loop,
+ // because empirically gcc generates better code this way.
+ // However, we can't afford unrolling in debug mode, because the function
+ // consumes almost 4K of stack. Gtest gives only 4K of stack to death test
+ // threads, which is not enough for the unrolled loop.
+#if SANITIZER_DEBUG
+ for (int idx = 0; idx < 4; idx++) {
+# include "tsan_update_shadow_word.inc"
+ }
+#else
+ int idx = 0;
+# include "tsan_update_shadow_word.inc"
+ idx = 1;
+ if (stored) {
+# include "tsan_update_shadow_word.inc"
+ } else {
+# include "tsan_update_shadow_word.inc"
}
- // For reads we need to reset read bit in the shadow,
- // because we need to match read with both reads and writes.
-  // Shadow::kRodata has only the read bit set, so it does what we want.
-  // We also abuse it for the rodata check to save a few cycles
- // since we already loaded Shadow::kRodata into a register.
- // Reads from rodata can't race.
- // Measurements show that they can be 10-20% of all memory accesses.
- // Shadow::kRodata has epoch 0 which cannot appear in shadow normally
- // (thread epochs start from 1). So the same read bit mask
- // serves as rodata indicator.
- const m128 read_mask = _mm_set1_epi32(static_cast<u32>(Shadow::kRodata));
- const m128 masked_shadow = _mm_or_si128(shadow, read_mask);
- m128 same = _mm_cmpeq_epi32(masked_shadow, access);
- // Range memory accesses check Shadow::kRodata before calling this,
-  // Shadow::kRodata is not possible for free memory access
- // and Go does not use Shadow::kRodata.
- if (!(typ & kAccessNoRodata) && !SANITIZER_GO) {
- const m128 ro = _mm_cmpeq_epi32(shadow, read_mask);
- same = _mm_or_si128(ro, same);
+ idx = 2;
+ if (stored) {
+# include "tsan_update_shadow_word.inc"
+ } else {
+# include "tsan_update_shadow_word.inc"
}
- return _mm_movemask_epi8(same);
-}
+ idx = 3;
+ if (stored) {
+# include "tsan_update_shadow_word.inc"
+ } else {
+# include "tsan_update_shadow_word.inc"
+ }
+#endif
-NOINLINE void DoReportRaceV(ThreadState* thr, RawShadow* shadow_mem, Shadow cur,
- u32 race_mask, m128 shadow, AccessType typ) {
- // race_mask points which of the shadow elements raced with the current
- // access. Extract that element.
- CHECK_NE(race_mask, 0);
- u32 old;
- // Note: _mm_extract_epi32 index must be a constant value.
- switch (__builtin_ffs(race_mask) / 4) {
- case 0:
- old = _mm_extract_epi32(shadow, 0);
- break;
- case 1:
- old = _mm_extract_epi32(shadow, 1);
- break;
- case 2:
- old = _mm_extract_epi32(shadow, 2);
- break;
- case 3:
- old = _mm_extract_epi32(shadow, 3);
- break;
+ // we did not find any races and had already stored
+ // the current access info, so we are done
+ if (LIKELY(stored))
+ return;
+ // choose a random candidate slot and replace it
+ StoreShadow(shadow_mem + (cur.epoch() % kShadowCnt), store_word);
+ return;
+RACE:
+ HandleRace(thr, shadow_mem, cur, old);
+ return;
+}
+
+void UnalignedMemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size,
+ AccessType typ) {
+ DCHECK(!(typ & kAccessAtomic));
+ const bool kAccessIsWrite = !(typ & kAccessRead);
+ const bool kIsAtomic = false;
+ while (size) {
+ int size1 = 1;
+ int kAccessSizeLog = kSizeLog1;
+ if (size >= 8 && (addr & ~7) == ((addr + 7) & ~7)) {
+ size1 = 8;
+ kAccessSizeLog = kSizeLog8;
+ } else if (size >= 4 && (addr & ~7) == ((addr + 3) & ~7)) {
+ size1 = 4;
+ kAccessSizeLog = kSizeLog4;
+ } else if (size >= 2 && (addr & ~7) == ((addr + 1) & ~7)) {
+ size1 = 2;
+ kAccessSizeLog = kSizeLog2;
+ }
+ MemoryAccess(thr, pc, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic);
+ addr += size1;
+ size -= size1;
}
- Shadow prev(static_cast<RawShadow>(old));
- // For the free shadow markers the first element (that contains kFreeSid)
- // triggers the race, but the second element contains info about the freeing
- // thread, take it.
- if (prev.sid() == kFreeSid)
- prev = Shadow(static_cast<RawShadow>(_mm_extract_epi32(shadow, 1)));
- DoReportRace(thr, shadow_mem, cur, prev, typ);
}
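The restored UnalignedMemoryAccess above splits an unaligned range so that every sub-access stays inside one 8-byte shadow cell. The standalone sketch below copies that chunking rule and shows, with a worked example, how a 6-byte access at 0x1003 is split into 4 bytes at 0x1003, 1 byte at 0x1007 and 1 byte at 0x1008 (addresses and sizes here are only illustrative).

#include <cstdio>

int main() {
  unsigned long addr = 0x1003, size = 6;
  while (size) {
    unsigned long size1 = 1;
    // Prefer the largest chunk that does not cross an 8-byte cell boundary.
    if (size >= 8 && (addr & ~7ul) == ((addr + 7) & ~7ul))
      size1 = 8;
    else if (size >= 4 && (addr & ~7ul) == ((addr + 3) & ~7ul))
      size1 = 4;
    else if (size >= 2 && (addr & ~7ul) == ((addr + 1) & ~7ul))
      size1 = 2;
    std::printf("access %lu bytes at 0x%lx\n", size1, addr);
    addr += size1;
    size -= size1;
  }
  return 0;
}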
ALWAYS_INLINE
-bool CheckRaces(ThreadState* thr, RawShadow* shadow_mem, Shadow cur,
- m128 shadow, m128 access, AccessType typ) {
- // Note: empty/zero slots don't intersect with any access.
- const m128 zero = _mm_setzero_si128();
- const m128 mask_access = _mm_set1_epi32(0x000000ff);
- const m128 mask_sid = _mm_set1_epi32(0x0000ff00);
- const m128 mask_read_atomic = _mm_set1_epi32(0xc0000000);
- const m128 access_and = _mm_and_si128(access, shadow);
- const m128 access_xor = _mm_xor_si128(access, shadow);
- const m128 intersect = _mm_and_si128(access_and, mask_access);
- const m128 not_intersect = _mm_cmpeq_epi32(intersect, zero);
- const m128 not_same_sid = _mm_and_si128(access_xor, mask_sid);
- const m128 same_sid = _mm_cmpeq_epi32(not_same_sid, zero);
- const m128 both_read_or_atomic = _mm_and_si128(access_and, mask_read_atomic);
- const m128 no_race =
- _mm_or_si128(_mm_or_si128(not_intersect, same_sid), both_read_or_atomic);
- const int race_mask = _mm_movemask_epi8(_mm_cmpeq_epi32(no_race, zero));
- if (UNLIKELY(race_mask))
- goto SHARED;
-
-STORE : {
- if (typ & kAccessCheckOnly)
- return false;
-  // We could also replace different sids if the access is the same,
- // rw weaker and happens before. However, just checking access below
- // is not enough because we also need to check that !both_read_or_atomic
- // (reads from different sids can be concurrent).
- // Theoretically we could replace smaller accesses with larger accesses,
- // but it's unclear if it's worth doing.
- const m128 mask_access_sid = _mm_set1_epi32(0x0000ffff);
- const m128 not_same_sid_access = _mm_and_si128(access_xor, mask_access_sid);
- const m128 same_sid_access = _mm_cmpeq_epi32(not_same_sid_access, zero);
- const m128 access_read_atomic =
- _mm_set1_epi32((typ & (kAccessRead | kAccessAtomic)) << 30);
- const m128 rw_weaker =
- _mm_cmpeq_epi32(_mm_max_epu32(shadow, access_read_atomic), shadow);
- const m128 rewrite = _mm_and_si128(same_sid_access, rw_weaker);
- const int rewrite_mask = _mm_movemask_epi8(rewrite);
- int index = __builtin_ffs(rewrite_mask);
- if (UNLIKELY(index == 0)) {
- const m128 empty = _mm_cmpeq_epi32(shadow, zero);
- const int empty_mask = _mm_movemask_epi8(empty);
- index = __builtin_ffs(empty_mask);
- if (UNLIKELY(index == 0))
- index = (atomic_load_relaxed(&thr->trace_pos) / 2) % 16;
+bool ContainsSameAccessSlow(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
+ Shadow cur(a);
+ for (uptr i = 0; i < kShadowCnt; i++) {
+ Shadow old(LoadShadow(&s[i]));
+ if (Shadow::Addr0AndSizeAreEqual(cur, old) &&
+ old.TidWithIgnore() == cur.TidWithIgnore() &&
+ old.epoch() > sync_epoch && old.IsAtomic() == cur.IsAtomic() &&
+ old.IsRead() <= cur.IsRead())
+ return true;
}
- StoreShadow(&shadow_mem[index / 4], cur.raw());
- // We could zero other slots determined by rewrite_mask.
- // That would help other threads to evict better slots,
- // but it's unclear if it's worth it.
return false;
}
-SHARED:
- m128 thread_epochs = _mm_set1_epi32(0x7fffffff);
-  // Need to unroll this manually because _mm_extract_epi8/_mm_insert_epi32
- // indexes must be constants.
-# define LOAD_EPOCH(idx) \
- if (LIKELY(race_mask & (1 << (idx * 4)))) { \
- u8 sid = _mm_extract_epi8(shadow, idx * 4 + 1); \
- u16 epoch = static_cast<u16>(thr->clock.Get(static_cast<Sid>(sid))); \
- thread_epochs = _mm_insert_epi32(thread_epochs, u32(epoch) << 16, idx); \
- }
- LOAD_EPOCH(0);
- LOAD_EPOCH(1);
- LOAD_EPOCH(2);
- LOAD_EPOCH(3);
-# undef LOAD_EPOCH
- const m128 mask_epoch = _mm_set1_epi32(0x3fff0000);
- const m128 shadow_epochs = _mm_and_si128(shadow, mask_epoch);
- const m128 concurrent = _mm_cmplt_epi32(thread_epochs, shadow_epochs);
- const int concurrent_mask = _mm_movemask_epi8(concurrent);
- if (LIKELY(concurrent_mask == 0))
- goto STORE;
-
- DoReportRaceV(thr, shadow_mem, cur, concurrent_mask, shadow, typ);
- return true;
+#if TSAN_VECTORIZE
+# define SHUF(v0, v1, i0, i1, i2, i3) \
+ _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(v0), \
+ _mm_castsi128_ps(v1), \
+ (i0)*1 + (i1)*4 + (i2)*16 + (i3)*64))
+ALWAYS_INLINE
+bool ContainsSameAccessFast(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
+ // This is an optimized version of ContainsSameAccessSlow.
+ // load current access into access[0:63]
+ const m128 access = _mm_cvtsi64_si128(a);
+ // duplicate high part of access in addr0:
+ // addr0[0:31] = access[32:63]
+ // addr0[32:63] = access[32:63]
+ // addr0[64:95] = access[32:63]
+ // addr0[96:127] = access[32:63]
+ const m128 addr0 = SHUF(access, access, 1, 1, 1, 1);
+ // load 4 shadow slots
+ const m128 shadow0 = _mm_load_si128((__m128i *)s);
+ const m128 shadow1 = _mm_load_si128((__m128i *)s + 1);
+ // load high parts of 4 shadow slots into addr_vect:
+ // addr_vect[0:31] = shadow0[32:63]
+ // addr_vect[32:63] = shadow0[96:127]
+ // addr_vect[64:95] = shadow1[32:63]
+ // addr_vect[96:127] = shadow1[96:127]
+ m128 addr_vect = SHUF(shadow0, shadow1, 1, 3, 1, 3);
+ if (!is_write) {
+ // set IsRead bit in addr_vect
+ const m128 rw_mask1 = _mm_cvtsi64_si128(1 << 15);
+ const m128 rw_mask = SHUF(rw_mask1, rw_mask1, 0, 0, 0, 0);
+ addr_vect = _mm_or_si128(addr_vect, rw_mask);
+ }
+ // addr0 == addr_vect?
+ const m128 addr_res = _mm_cmpeq_epi32(addr0, addr_vect);
+ // epoch1[0:63] = sync_epoch
+ const m128 epoch1 = _mm_cvtsi64_si128(sync_epoch);
+ // epoch[0:31] = sync_epoch[0:31]
+ // epoch[32:63] = sync_epoch[0:31]
+ // epoch[64:95] = sync_epoch[0:31]
+ // epoch[96:127] = sync_epoch[0:31]
+ const m128 epoch = SHUF(epoch1, epoch1, 0, 0, 0, 0);
+ // load low parts of shadow cell epochs into epoch_vect:
+ // epoch_vect[0:31] = shadow0[0:31]
+ // epoch_vect[32:63] = shadow0[64:95]
+ // epoch_vect[64:95] = shadow1[0:31]
+ // epoch_vect[96:127] = shadow1[64:95]
+ const m128 epoch_vect = SHUF(shadow0, shadow1, 0, 2, 0, 2);
+ // epoch_vect >= sync_epoch?
+ const m128 epoch_res = _mm_cmpgt_epi32(epoch_vect, epoch);
+ // addr_res & epoch_res
+ const m128 res = _mm_and_si128(addr_res, epoch_res);
+ // mask[0] = res[7]
+ // mask[1] = res[15]
+ // ...
+ // mask[15] = res[127]
+ const int mask = _mm_movemask_epi8(res);
+ return mask != 0;
}
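The SHUF macro above is _mm_shuffle_ps applied to integer vectors, and the immediate i0 + 4*i1 + 16*i2 + 64*i3 means the first two 32-bit lanes of the result come from v0 and the last two from v1. A scalar model of that lane selection, with made-up lane values, is sketched below.

#include <array>
#include <cassert>
#include <cstdint>

using Lanes = std::array<uint32_t, 4>;

// Scalar equivalent of SHUF(v0, v1, i0, i1, i2, i3):
//   r[0] = v0[i0], r[1] = v0[i1], r[2] = v1[i2], r[3] = v1[i3]
static Lanes Shuf(const Lanes &v0, const Lanes &v1, int i0, int i1, int i2,
                  int i3) {
  return {v0[i0], v0[i1], v1[i2], v1[i3]};
}

int main() {
  // SHUF(shadow0, shadow1, 1, 3, 1, 3) gathers the high halves of the four
  // 64-bit shadow slots, exactly as the "load high parts" comments describe.
  Lanes shadow0 = {10, 11, 12, 13};  // two 64-bit slots as four 32-bit lanes
  Lanes shadow1 = {20, 21, 22, 23};
  Lanes addr_vect = Shuf(shadow0, shadow1, 1, 3, 1, 3);
  assert((addr_vect == Lanes{11, 13, 21, 23}));
  return 0;
}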
-
-# define LOAD_CURRENT_SHADOW(cur, shadow_mem) \
- const m128 access = _mm_set1_epi32(static_cast<u32>((cur).raw())); \
- const m128 shadow = _mm_load_si128(reinterpret_cast<m128*>(shadow_mem))
#endif
-char* DumpShadow(char* buf, RawShadow raw) {
- if (raw == Shadow::kEmpty) {
- internal_snprintf(buf, 64, "0");
- return buf;
- }
- Shadow s(raw);
- AccessType typ;
- s.GetAccess(nullptr, nullptr, &typ);
- internal_snprintf(buf, 64, "{tid=%u@%u access=0x%x typ=%x}",
- static_cast<u32>(s.sid()), static_cast<u32>(s.epoch()),
- s.access(), static_cast<u32>(typ));
- return buf;
+ALWAYS_INLINE
+bool ContainsSameAccess(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
+#if TSAN_VECTORIZE
+ bool res = ContainsSameAccessFast(s, a, sync_epoch, is_write);
+ // NOTE: this check can fail if the shadow is concurrently mutated
+ // by other threads. But it still can be useful if you modify
+ // ContainsSameAccessFast and want to ensure that it's not completely broken.
+ // DCHECK_EQ(res, ContainsSameAccessSlow(s, a, sync_epoch, is_write));
+ return res;
+#else
+ return ContainsSameAccessSlow(s, a, sync_epoch, is_write);
+#endif
}
-// TryTrace* and TraceRestart* functions allow us to turn memory access and func
-// entry/exit callbacks into leaf functions with all associated performance
-// benefits. These hottest callbacks do only 2 slow path calls: race reporting
-// and trace part switching. Race reporting is easy to turn into a tail call: we
-// just always return from the runtime after reporting a race. But trace part
-// switching is harder because it needs to happen in the middle of callbacks. To
-// turn it into a tail call we immediately return after TraceRestart* functions,
-// but TraceRestart* functions themselves recurse into the callback after
-// switching the trace part. As a result the hottest callbacks contain only tail
-// calls, which effectively makes them leaf functions (can use all registers,
-// no frame setup, etc).
-NOINLINE void TraceRestartMemoryAccess(ThreadState* thr, uptr pc, uptr addr,
- uptr size, AccessType typ) {
- TraceSwitchPart(thr);
- MemoryAccess(thr, pc, addr, size, typ);
-}
+ALWAYS_INLINE USED void MemoryAccess(ThreadState *thr, uptr pc, uptr addr,
+ int kAccessSizeLog, bool kAccessIsWrite,
+ bool kIsAtomic) {
+ RawShadow *shadow_mem = MemToShadow(addr);
+ DPrintf2(
+ "#%d: MemoryAccess: @%p %p size=%d"
+ " is_write=%d shadow_mem=%p {%zx, %zx, %zx, %zx}\n",
+ (int)thr->fast_state.tid(), (void *)pc, (void *)addr,
+ (int)(1 << kAccessSizeLog), kAccessIsWrite, shadow_mem,
+ (uptr)shadow_mem[0], (uptr)shadow_mem[1], (uptr)shadow_mem[2],
+ (uptr)shadow_mem[3]);
+#if SANITIZER_DEBUG
+ if (!IsAppMem(addr)) {
+ Printf("Access to non app mem %zx\n", addr);
+ DCHECK(IsAppMem(addr));
+ }
+ if (!IsShadowMem(shadow_mem)) {
+ Printf("Bad shadow addr %p (%zx)\n", shadow_mem, addr);
+ DCHECK(IsShadowMem(shadow_mem));
+ }
+#endif
-ALWAYS_INLINE USED void MemoryAccess(ThreadState* thr, uptr pc, uptr addr,
- uptr size, AccessType typ) {
- RawShadow* shadow_mem = MemToShadow(addr);
- UNUSED char memBuf[4][64];
- DPrintf2("#%d: Access: %d@%d %p/%zd typ=0x%x {%s, %s, %s, %s}\n", thr->tid,
- static_cast<int>(thr->fast_state.sid()),
- static_cast<int>(thr->fast_state.epoch()), (void*)addr, size,
- static_cast<int>(typ), DumpShadow(memBuf[0], shadow_mem[0]),
- DumpShadow(memBuf[1], shadow_mem[1]),
- DumpShadow(memBuf[2], shadow_mem[2]),
- DumpShadow(memBuf[3], shadow_mem[3]));
+ if (!SANITIZER_GO && !kAccessIsWrite && *shadow_mem == kShadowRodata) {
+ // Access to .rodata section, no races here.
+ // Measurements show that it can be 10-20% of all memory accesses.
+ return;
+ }
FastState fast_state = thr->fast_state;
- Shadow cur(fast_state, addr, size, typ);
-
- LOAD_CURRENT_SHADOW(cur, shadow_mem);
- if (LIKELY(ContainsSameAccess(shadow_mem, cur, shadow, access, typ)))
+ if (UNLIKELY(fast_state.GetIgnoreBit())) {
return;
- if (UNLIKELY(fast_state.GetIgnoreBit()))
- return;
- if (!TryTraceMemoryAccess(thr, pc, addr, size, typ))
- return TraceRestartMemoryAccess(thr, pc, addr, size, typ);
- CheckRaces(thr, shadow_mem, cur, shadow, access, typ);
-}
+ }
-NOINLINE
-void RestartUnalignedMemoryAccess(ThreadState* thr, uptr pc, uptr addr,
- uptr size, AccessType typ) {
- TraceSwitchPart(thr);
- UnalignedMemoryAccess(thr, pc, addr, size, typ);
-}
+ Shadow cur(fast_state);
+ cur.SetAddr0AndSizeLog(addr & 7, kAccessSizeLog);
+ cur.SetWrite(kAccessIsWrite);
+ cur.SetAtomic(kIsAtomic);
-ALWAYS_INLINE USED void UnalignedMemoryAccess(ThreadState* thr, uptr pc,
- uptr addr, uptr size,
- AccessType typ) {
- DCHECK_LE(size, 8);
- FastState fast_state = thr->fast_state;
- if (UNLIKELY(fast_state.GetIgnoreBit()))
+ if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(), thr->fast_synch_epoch,
+ kAccessIsWrite))) {
return;
- RawShadow* shadow_mem = MemToShadow(addr);
- bool traced = false;
- uptr size1 = Min<uptr>(size, RoundUp(addr + 1, kShadowCell) - addr);
- {
- Shadow cur(fast_state, addr, size1, typ);
- LOAD_CURRENT_SHADOW(cur, shadow_mem);
- if (LIKELY(ContainsSameAccess(shadow_mem, cur, shadow, access, typ)))
- goto SECOND;
- if (!TryTraceMemoryAccessRange(thr, pc, addr, size, typ))
- return RestartUnalignedMemoryAccess(thr, pc, addr, size, typ);
- traced = true;
- if (UNLIKELY(CheckRaces(thr, shadow_mem, cur, shadow, access, typ)))
- return;
}
-SECOND:
- uptr size2 = size - size1;
- if (LIKELY(size2 == 0))
- return;
- shadow_mem += kShadowCnt;
- Shadow cur(fast_state, 0, size2, typ);
- LOAD_CURRENT_SHADOW(cur, shadow_mem);
- if (LIKELY(ContainsSameAccess(shadow_mem, cur, shadow, access, typ)))
- return;
- if (!traced && !TryTraceMemoryAccessRange(thr, pc, addr, size, typ))
- return RestartUnalignedMemoryAccess(thr, pc, addr, size, typ);
- CheckRaces(thr, shadow_mem, cur, shadow, access, typ);
+
+ if (kCollectHistory) {
+ fast_state.IncrementEpoch();
+ thr->fast_state = fast_state;
+ TraceAddEvent(thr, fast_state, EventTypeMop, pc);
+ cur.IncrementEpoch();
+ }
+
+ MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
+ shadow_mem, cur);
}
-void ShadowSet(RawShadow* p, RawShadow* end, RawShadow v) {
- DCHECK_LE(p, end);
- DCHECK(IsShadowMem(p));
- DCHECK(IsShadowMem(end));
- UNUSED const uptr kAlign = kShadowCnt * kShadowSize;
- DCHECK_EQ(reinterpret_cast<uptr>(p) % kAlign, 0);
- DCHECK_EQ(reinterpret_cast<uptr>(end) % kAlign, 0);
-#if !TSAN_VECTORIZE
- for (; p < end; p += kShadowCnt) {
- p[0] = v;
- for (uptr i = 1; i < kShadowCnt; i++) p[i] = Shadow::kEmpty;
+// Called by MemoryAccessRange in tsan_rtl_thread.cpp
+ALWAYS_INLINE USED void MemoryAccessImpl(ThreadState *thr, uptr addr,
+ int kAccessSizeLog,
+ bool kAccessIsWrite, bool kIsAtomic,
+ u64 *shadow_mem, Shadow cur) {
+ if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(), thr->fast_synch_epoch,
+ kAccessIsWrite))) {
+ return;
}
-#else
- m128 vv = _mm_setr_epi32(
- static_cast<u32>(v), static_cast<u32>(Shadow::kEmpty),
- static_cast<u32>(Shadow::kEmpty), static_cast<u32>(Shadow::kEmpty));
- m128* vp = reinterpret_cast<m128*>(p);
- m128* vend = reinterpret_cast<m128*>(end);
- for (; vp < vend; vp++) _mm_store_si128(vp, vv);
-#endif
+
+ MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
+ shadow_mem, cur);
}
-static void MemoryRangeSet(uptr addr, uptr size, RawShadow val) {
+static void MemoryRangeSet(ThreadState *thr, uptr pc, uptr addr, uptr size,
+ u64 val) {
+ (void)thr;
+ (void)pc;
if (size == 0)
return;
- DCHECK_EQ(addr % kShadowCell, 0);
- DCHECK_EQ(size % kShadowCell, 0);
+ // FIXME: fix me.
+ uptr offset = addr % kShadowCell;
+ if (offset) {
+ offset = kShadowCell - offset;
+ if (size <= offset)
+ return;
+ addr += offset;
+ size -= offset;
+ }
+ DCHECK_EQ(addr % 8, 0);
// If a user passes some insane arguments (memset(0)),
// let it just crash as usual.
if (!IsAppMem(addr) || !IsAppMem(addr + size - 1))
return;
- RawShadow* begin = MemToShadow(addr);
- RawShadow* end = begin + size / kShadowCell * kShadowCnt;
// Don't want to touch lots of shadow memory.
// If a program maps a 10MB stack, there is no need to reset the whole range.
+ size = (size + (kShadowCell - 1)) & ~(kShadowCell - 1);
// UnmapOrDie/MmapFixedNoReserve does not work on Windows.
- if (SANITIZER_WINDOWS ||
- size <= common_flags()->clear_shadow_mmap_threshold) {
- ShadowSet(begin, end, val);
- return;
- }
- // The region is big, reset only beginning and end.
- const uptr kPageSize = GetPageSizeCached();
- // Set at least first kPageSize/2 to page boundary.
- RawShadow* mid1 =
- Min(end, reinterpret_cast<RawShadow*>(RoundUp(
- reinterpret_cast<uptr>(begin) + kPageSize / 2, kPageSize)));
- ShadowSet(begin, mid1, val);
- // Reset middle part.
- RawShadow* mid2 = RoundDown(end, kPageSize);
- if (mid2 > mid1) {
- if (!MmapFixedSuperNoReserve((uptr)mid1, (uptr)mid2 - (uptr)mid1))
+ if (SANITIZER_WINDOWS || size < common_flags()->clear_shadow_mmap_threshold) {
+ RawShadow *p = MemToShadow(addr);
+ CHECK(IsShadowMem(p));
+ CHECK(IsShadowMem(p + size * kShadowCnt / kShadowCell - 1));
+ // FIXME: may overwrite a part outside the region
+ for (uptr i = 0; i < size / kShadowCell * kShadowCnt;) {
+ p[i++] = val;
+ for (uptr j = 1; j < kShadowCnt; j++) p[i++] = 0;
+ }
+ } else {
+ // The region is big, reset only beginning and end.
+ const uptr kPageSize = GetPageSizeCached();
+ RawShadow *begin = MemToShadow(addr);
+ RawShadow *end = begin + size / kShadowCell * kShadowCnt;
+ RawShadow *p = begin;
+ // Set at least first kPageSize/2 to page boundary.
+ while ((p < begin + kPageSize / kShadowSize / 2) || ((uptr)p % kPageSize)) {
+ *p++ = val;
+ for (uptr j = 1; j < kShadowCnt; j++) *p++ = 0;
+ }
+ // Reset middle part.
+ RawShadow *p1 = p;
+ p = RoundDown(end, kPageSize);
+ if (!MmapFixedSuperNoReserve((uptr)p1, (uptr)p - (uptr)p1))
Die();
+ // Set the ending.
+ while (p < end) {
+ *p++ = val;
+ for (uptr j = 1; j < kShadowCnt; j++) *p++ = 0;
+ }
}
- // Set the ending.
- ShadowSet(mid2, end, val);
}
-void MemoryResetRange(ThreadState* thr, uptr pc, uptr addr, uptr size) {
- uptr addr1 = RoundDown(addr, kShadowCell);
- uptr size1 = RoundUp(size + addr - addr1, kShadowCell);
- MemoryRangeSet(addr1, size1, Shadow::kEmpty);
+void MemoryResetRange(ThreadState *thr, uptr pc, uptr addr, uptr size) {
+ MemoryRangeSet(thr, pc, addr, size, 0);
}
-void MemoryRangeFreed(ThreadState* thr, uptr pc, uptr addr, uptr size) {
- // Callers must lock the slot to ensure synchronization with the reset.
- // The problem with "freed" memory is that it's not "monotonic"
- // with respect to bug detection: freed memory is bad to access,
- // but then if the heap block is reallocated later, it's good to access.
-  // As a result, a garbage "freed" shadow can lead to a false positive
- // if it happens to match a real free in the thread trace,
- // but the heap block was reallocated before the current memory access,
- // so it's still good to access. It's not the case with data races.
- DCHECK(thr->slot_locked);
- DCHECK_EQ(addr % kShadowCell, 0);
- size = RoundUp(size, kShadowCell);
- // Processing more than 1k (2k of shadow) is expensive,
+void MemoryRangeFreed(ThreadState *thr, uptr pc, uptr addr, uptr size) {
+ // Processing more than 1k (4k of shadow) is expensive,
// can cause excessive memory consumption (the user does not necessarily touch
// the whole range), and is most likely unnecessary.
- size = Min<uptr>(size, 1024);
- const AccessType typ =
- kAccessWrite | kAccessFree | kAccessCheckOnly | kAccessNoRodata;
- TraceMemoryAccessRange(thr, pc, addr, size, typ);
- RawShadow* shadow_mem = MemToShadow(addr);
- Shadow cur(thr->fast_state, 0, kShadowCell, typ);
-#if TSAN_VECTORIZE
- const m128 access = _mm_set1_epi32(static_cast<u32>(cur.raw()));
- const m128 freed = _mm_setr_epi32(
- static_cast<u32>(Shadow::FreedMarker()),
- static_cast<u32>(Shadow::FreedInfo(cur.sid(), cur.epoch())), 0, 0);
- for (; size; size -= kShadowCell, shadow_mem += kShadowCnt) {
- const m128 shadow = _mm_load_si128((m128*)shadow_mem);
- if (UNLIKELY(CheckRaces(thr, shadow_mem, cur, shadow, access, typ)))
- return;
- _mm_store_si128((m128*)shadow_mem, freed);
+ if (size > 1024)
+ size = 1024;
+ CHECK_EQ(thr->is_freeing, false);
+ thr->is_freeing = true;
+ MemoryAccessRange(thr, pc, addr, size, true);
+ thr->is_freeing = false;
+ if (kCollectHistory) {
+ thr->fast_state.IncrementEpoch();
+ TraceAddEvent(thr, thr->fast_state, EventTypeMop, pc);
}
-#else
- for (; size; size -= kShadowCell, shadow_mem += kShadowCnt) {
- if (UNLIKELY(CheckRaces(thr, shadow_mem, cur, 0, 0, typ)))
- return;
- StoreShadow(&shadow_mem[0], Shadow::FreedMarker());
- StoreShadow(&shadow_mem[1], Shadow::FreedInfo(cur.sid(), cur.epoch()));
- StoreShadow(&shadow_mem[2], Shadow::kEmpty);
- StoreShadow(&shadow_mem[3], Shadow::kEmpty);
+ Shadow s(thr->fast_state);
+ s.ClearIgnoreBit();
+ s.MarkAsFreed();
+ s.SetWrite(true);
+ s.SetAddr0AndSizeLog(0, 3);
+ MemoryRangeSet(thr, pc, addr, size, s.raw());
+}
+
+void MemoryRangeImitateWrite(ThreadState *thr, uptr pc, uptr addr, uptr size) {
+ if (kCollectHistory) {
+ thr->fast_state.IncrementEpoch();
+ TraceAddEvent(thr, thr->fast_state, EventTypeMop, pc);
}
-#endif
-}
-
-void MemoryRangeImitateWrite(ThreadState* thr, uptr pc, uptr addr, uptr size) {
- DCHECK_EQ(addr % kShadowCell, 0);
- size = RoundUp(size, kShadowCell);
- TraceMemoryAccessRange(thr, pc, addr, size, kAccessWrite);
- Shadow cur(thr->fast_state, 0, 8, kAccessWrite);
- MemoryRangeSet(addr, size, cur.raw());
+ Shadow s(thr->fast_state);
+ s.ClearIgnoreBit();
+ s.SetWrite(true);
+ s.SetAddr0AndSizeLog(0, 3);
+ MemoryRangeSet(thr, pc, addr, size, s.raw());
}
-void MemoryRangeImitateWriteOrResetRange(ThreadState* thr, uptr pc, uptr addr,
+void MemoryRangeImitateWriteOrResetRange(ThreadState *thr, uptr pc, uptr addr,
uptr size) {
if (thr->ignore_reads_and_writes == 0)
MemoryRangeImitateWrite(thr, pc, addr, size);
@@ -617,29 +518,14 @@ void MemoryRangeImitateWriteOrResetRange(ThreadState* thr, uptr pc, uptr addr,
MemoryResetRange(thr, pc, addr, size);
}
-ALWAYS_INLINE
-bool MemoryAccessRangeOne(ThreadState* thr, RawShadow* shadow_mem, Shadow cur,
- AccessType typ) {
- LOAD_CURRENT_SHADOW(cur, shadow_mem);
- if (LIKELY(ContainsSameAccess(shadow_mem, cur, shadow, access, typ)))
- return false;
- return CheckRaces(thr, shadow_mem, cur, shadow, access, typ);
-}
-
-template <bool is_read>
-NOINLINE void RestartMemoryAccessRange(ThreadState* thr, uptr pc, uptr addr,
- uptr size) {
- TraceSwitchPart(thr);
- MemoryAccessRangeT<is_read>(thr, pc, addr, size);
-}
+void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size,
+ bool is_write) {
+ if (size == 0)
+ return;
-template <bool is_read>
-void MemoryAccessRangeT(ThreadState* thr, uptr pc, uptr addr, uptr size) {
- const AccessType typ =
- (is_read ? kAccessRead : kAccessWrite) | kAccessNoRodata;
- RawShadow* shadow_mem = MemToShadow(addr);
- DPrintf2("#%d: MemoryAccessRange: @%p %p size=%d is_read=%d\n", thr->tid,
- (void*)pc, (void*)addr, (int)size, is_read);
+ RawShadow *shadow_mem = MemToShadow(addr);
+ DPrintf2("#%d: MemoryAccessRange: @%p %p size=%d is_write=%d\n", thr->tid,
+ (void *)pc, (void *)addr, (int)size, is_write);
#if SANITIZER_DEBUG
if (!IsAppMem(addr)) {
@@ -651,57 +537,65 @@ void MemoryAccessRangeT(ThreadState* thr, uptr pc, uptr addr, uptr size) {
DCHECK(IsAppMem(addr + size - 1));
}
if (!IsShadowMem(shadow_mem)) {
- Printf("Bad shadow addr %p (%zx)\n", static_cast<void*>(shadow_mem), addr);
+ Printf("Bad shadow addr %p (%zx)\n", shadow_mem, addr);
DCHECK(IsShadowMem(shadow_mem));
}
- if (!IsShadowMem(shadow_mem + size * kShadowCnt - 1)) {
- Printf("Bad shadow addr %p (%zx)\n",
- static_cast<void*>(shadow_mem + size * kShadowCnt - 1),
+ if (!IsShadowMem(shadow_mem + size * kShadowCnt / 8 - 1)) {
+ Printf("Bad shadow addr %p (%zx)\n", shadow_mem + size * kShadowCnt / 8 - 1,
addr + size - 1);
- DCHECK(IsShadowMem(shadow_mem + size * kShadowCnt - 1));
+ DCHECK(IsShadowMem(shadow_mem + size * kShadowCnt / 8 - 1));
}
#endif
- // Access to .rodata section, no races here.
- // Measurements show that it can be 10-20% of all memory accesses.
- if (is_read && *shadow_mem == Shadow::kRodata)
+ if (*shadow_mem == kShadowRodata) {
+ DCHECK(!is_write);
+ // Access to .rodata section, no races here.
+ // Measurements show that it can be 10-20% of all memory accesses.
return;
+ }
FastState fast_state = thr->fast_state;
- if (UNLIKELY(fast_state.GetIgnoreBit()))
+ if (fast_state.GetIgnoreBit())
return;
- if (!TryTraceMemoryAccessRange(thr, pc, addr, size, typ))
- return RestartMemoryAccessRange<is_read>(thr, pc, addr, size);
+ fast_state.IncrementEpoch();
+ thr->fast_state = fast_state;
+ TraceAddEvent(thr, fast_state, EventTypeMop, pc);
- if (UNLIKELY(addr % kShadowCell)) {
- // Handle unaligned beginning, if any.
- uptr size1 = Min(size, RoundUp(addr, kShadowCell) - addr);
- size -= size1;
- Shadow cur(fast_state, addr, size1, typ);
- if (UNLIKELY(MemoryAccessRangeOne(thr, shadow_mem, cur, typ)))
- return;
- shadow_mem += kShadowCnt;
+ bool unaligned = (addr % kShadowCell) != 0;
+
+ // Handle unaligned beginning, if any.
+ for (; addr % kShadowCell && size; addr++, size--) {
+ int const kAccessSizeLog = 0;
+ Shadow cur(fast_state);
+ cur.SetWrite(is_write);
+ cur.SetAddr0AndSizeLog(addr & (kShadowCell - 1), kAccessSizeLog);
+ MemoryAccessImpl(thr, addr, kAccessSizeLog, is_write, false, shadow_mem,
+ cur);
}
+ if (unaligned)
+ shadow_mem += kShadowCnt;
// Handle middle part, if any.
- Shadow cur(fast_state, 0, kShadowCell, typ);
- for (; size >= kShadowCell; size -= kShadowCell, shadow_mem += kShadowCnt) {
- if (UNLIKELY(MemoryAccessRangeOne(thr, shadow_mem, cur, typ)))
- return;
+ for (; size >= kShadowCell; addr += kShadowCell, size -= kShadowCell) {
+ int const kAccessSizeLog = 3;
+ Shadow cur(fast_state);
+ cur.SetWrite(is_write);
+ cur.SetAddr0AndSizeLog(0, kAccessSizeLog);
+ MemoryAccessImpl(thr, addr, kAccessSizeLog, is_write, false, shadow_mem,
+ cur);
+ shadow_mem += kShadowCnt;
}
// Handle ending, if any.
- if (UNLIKELY(size)) {
- Shadow cur(fast_state, 0, size, typ);
- if (UNLIKELY(MemoryAccessRangeOne(thr, shadow_mem, cur, typ)))
- return;
+ for (; size; addr++, size--) {
+ int const kAccessSizeLog = 0;
+ Shadow cur(fast_state);
+ cur.SetWrite(is_write);
+ cur.SetAddr0AndSizeLog(addr & (kShadowCell - 1), kAccessSizeLog);
+ MemoryAccessImpl(thr, addr, kAccessSizeLog, is_write, false, shadow_mem,
+ cur);
}
}
-template void MemoryAccessRangeT<true>(ThreadState* thr, uptr pc, uptr addr,
- uptr size);
-template void MemoryAccessRangeT<false>(ThreadState* thr, uptr pc, uptr addr,
- uptr size);
-
} // namespace __tsan
#if !SANITIZER_GO
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp
index 5ca2e4fca827..7d6b41116aa6 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp
@@ -23,8 +23,6 @@
namespace __tsan {
void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r);
-void ReportDestroyLocked(ThreadState *thr, uptr pc, uptr addr,
- FastState last_lock, StackID creation_stack_id);
struct Callback final : public DDCallback {
ThreadState *thr;
@@ -38,17 +36,17 @@ struct Callback final : public DDCallback {
}
StackID Unwind() override { return CurrentStackId(thr, pc); }
- int UniqueTid() override { return thr->tid; }
+ int UniqueTid() override { return thr->unique_id; }
};
void DDMutexInit(ThreadState *thr, uptr pc, SyncVar *s) {
Callback cb(thr, pc);
ctx->dd->MutexInit(&cb, &s->dd);
- s->dd.ctx = s->addr;
+ s->dd.ctx = s->GetId();
}
static void ReportMutexMisuse(ThreadState *thr, uptr pc, ReportType typ,
- uptr addr, StackID creation_stack_id) {
+ uptr addr, u64 mid) {
// In Go, these misuses are either impossible, or detected by std lib,
// or false positives (e.g. unlock in a different thread).
if (SANITIZER_GO)
@@ -57,7 +55,7 @@ static void ReportMutexMisuse(ThreadState *thr, uptr pc, ReportType typ,
return;
ThreadRegistryLock l(&ctx->thread_registry);
ScopedReport rep(typ);
- rep.AddMutex(addr, creation_stack_id);
+ rep.AddMutex(mid);
VarSizeStackTrace trace;
ObtainCurrentStack(thr, pc, &trace);
rep.AddStack(trace, true);
@@ -65,93 +63,95 @@ static void ReportMutexMisuse(ThreadState *thr, uptr pc, ReportType typ,
OutputReport(thr, rep);
}
-static void RecordMutexLock(ThreadState *thr, uptr pc, uptr addr,
- StackID stack_id, bool write) {
- auto typ = write ? EventType::kLock : EventType::kRLock;
-  // Note: it's important to trace before modifying the mutex set
-  // because tracing can switch the trace part and we write the current
-  // mutex set at the beginning of each part.
-  // If we do it in the opposite order, we will write the already reduced
-  // mutex set at the beginning of the part and then trace the unlock again.
- TraceMutexLock(thr, typ, pc, addr, stack_id);
- thr->mset.AddAddr(addr, stack_id, write);
-}
-
-static void RecordMutexUnlock(ThreadState *thr, uptr addr) {
- // See the comment in RecordMutexLock re order of operations.
- TraceMutexUnlock(thr, addr);
- thr->mset.DelAddr(addr);
-}
-
void MutexCreate(ThreadState *thr, uptr pc, uptr addr, u32 flagz) {
DPrintf("#%d: MutexCreate %zx flagz=0x%x\n", thr->tid, addr, flagz);
- if (!(flagz & MutexFlagLinkerInit) && pc && IsAppMem(addr))
+ if (!(flagz & MutexFlagLinkerInit) && IsAppMem(addr)) {
+ CHECK(!thr->is_freeing);
+ thr->is_freeing = true;
MemoryAccess(thr, pc, addr, 1, kAccessWrite);
- SlotLocker locker(thr);
- auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+ thr->is_freeing = false;
+ }
+ SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+ Lock l(&s->mtx);
s->SetFlags(flagz & MutexCreationFlagMask);
// Save the stack in case the sync object was created earlier as an atomic.
- if (!SANITIZER_GO && s->creation_stack_id == kInvalidStackID)
+ if (!SANITIZER_GO && s->creation_stack_id == 0)
s->creation_stack_id = CurrentStackId(thr, pc);
}
void MutexDestroy(ThreadState *thr, uptr pc, uptr addr, u32 flagz) {
DPrintf("#%d: MutexDestroy %zx\n", thr->tid, addr);
bool unlock_locked = false;
- StackID creation_stack_id;
- FastState last_lock;
+ u64 mid = 0;
+ u64 last_lock = 0;
{
- auto s = ctx->metamap.GetSyncIfExists(addr);
- if (!s)
+ SyncVar *s = ctx->metamap.GetSyncIfExists(addr);
+ if (s == 0)
return;
- SlotLocker locker(thr);
- {
- Lock lock(&s->mtx);
- creation_stack_id = s->creation_stack_id;
- last_lock = s->last_lock;
- if ((flagz & MutexFlagLinkerInit) || s->IsFlagSet(MutexFlagLinkerInit) ||
- ((flagz & MutexFlagNotStatic) && !s->IsFlagSet(MutexFlagNotStatic))) {
- // Destroy is no-op for linker-initialized mutexes.
- return;
- }
- if (common_flags()->detect_deadlocks) {
- Callback cb(thr, pc);
- ctx->dd->MutexDestroy(&cb, &s->dd);
- ctx->dd->MutexInit(&cb, &s->dd);
- }
- if (flags()->report_destroy_locked && s->owner_tid != kInvalidTid &&
- !s->IsFlagSet(MutexFlagBroken)) {
- s->SetFlags(MutexFlagBroken);
- unlock_locked = true;
- }
- s->Reset();
+ Lock l(&s->mtx);
+ if ((flagz & MutexFlagLinkerInit) || s->IsFlagSet(MutexFlagLinkerInit) ||
+ ((flagz & MutexFlagNotStatic) && !s->IsFlagSet(MutexFlagNotStatic))) {
+ // Destroy is no-op for linker-initialized mutexes.
+ return;
+ }
+ if (common_flags()->detect_deadlocks) {
+ Callback cb(thr, pc);
+ ctx->dd->MutexDestroy(&cb, &s->dd);
+ ctx->dd->MutexInit(&cb, &s->dd);
+ }
+ if (flags()->report_destroy_locked && s->owner_tid != kInvalidTid &&
+ !s->IsFlagSet(MutexFlagBroken)) {
+ s->SetFlags(MutexFlagBroken);
+ unlock_locked = true;
+ }
+ mid = s->GetId();
+ last_lock = s->last_lock;
+ if (!unlock_locked)
+ s->Reset(thr->proc()); // must not reset it before the report is printed
+ }
+ if (unlock_locked && ShouldReport(thr, ReportTypeMutexDestroyLocked)) {
+ ThreadRegistryLock l(&ctx->thread_registry);
+ ScopedReport rep(ReportTypeMutexDestroyLocked);
+ rep.AddMutex(mid);
+ VarSizeStackTrace trace;
+ ObtainCurrentStack(thr, pc, &trace);
+ rep.AddStack(trace, true);
+ FastState last(last_lock);
+ RestoreStack(last.tid(), last.epoch(), &trace, 0);
+ rep.AddStack(trace, true);
+ rep.AddLocation(addr, 1);
+ OutputReport(thr, rep);
+
+ SyncVar *s = ctx->metamap.GetSyncIfExists(addr);
+ if (s != 0) {
+ Lock l(&s->mtx);
+ s->Reset(thr->proc());
}
- // Imitate a memory write to catch unlock-destroy races.
- if (pc && IsAppMem(addr))
- MemoryAccess(thr, pc, addr, 1, kAccessWrite | kAccessFree);
}
- if (unlock_locked && ShouldReport(thr, ReportTypeMutexDestroyLocked))
- ReportDestroyLocked(thr, pc, addr, last_lock, creation_stack_id);
- thr->mset.DelAddr(addr, true);
+ thr->mset.Remove(mid);
+ // Imitate a memory write to catch unlock-destroy races.
+ // Do this outside of sync mutex, because it can report a race which locks
+ // sync mutexes.
+ if (IsAppMem(addr))
+ MemoryAccess(thr, pc, addr, 1, kAccessWrite | kAccessFree);
// s will be destroyed and freed in MetaMap::FreeBlock.
}
void MutexPreLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) {
DPrintf("#%d: MutexPreLock %zx flagz=0x%x\n", thr->tid, addr, flagz);
- if (flagz & MutexFlagTryLock)
- return;
- if (!common_flags()->detect_deadlocks)
- return;
- Callback cb(thr, pc);
- {
- SlotLocker locker(thr);
- auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
- ReadLock lock(&s->mtx);
- s->UpdateFlags(flagz);
- if (s->owner_tid != thr->tid)
- ctx->dd->MutexBeforeLock(&cb, &s->dd, true);
+ if (!(flagz & MutexFlagTryLock) && common_flags()->detect_deadlocks) {
+ SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+ {
+ ReadLock l(&s->mtx);
+ s->UpdateFlags(flagz);
+ if (s->owner_tid != thr->tid) {
+ Callback cb(thr, pc);
+ ctx->dd->MutexBeforeLock(&cb, &s->dd, true);
+ }
+ }
+ Callback cb(thr, pc);
+ ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
}
- ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
}
void MutexPostLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz, int rec) {
@@ -161,51 +161,48 @@ void MutexPostLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz, int rec) {
CHECK_GT(rec, 0);
else
rec = 1;
- if (pc && IsAppMem(addr))
+ if (IsAppMem(addr))
MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic);
- bool report_double_lock = false;
+ u64 mid = 0;
bool pre_lock = false;
bool first = false;
- StackID creation_stack_id = kInvalidStackID;
+ bool report_double_lock = false;
{
- SlotLocker locker(thr);
- auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
- creation_stack_id = s->creation_stack_id;
- RecordMutexLock(thr, pc, addr, creation_stack_id, true);
- {
- Lock lock(&s->mtx);
- first = s->recursion == 0;
- s->UpdateFlags(flagz);
- if (s->owner_tid == kInvalidTid) {
- CHECK_EQ(s->recursion, 0);
- s->owner_tid = thr->tid;
- s->last_lock = thr->fast_state;
- } else if (s->owner_tid == thr->tid) {
- CHECK_GT(s->recursion, 0);
- } else if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) {
- s->SetFlags(MutexFlagBroken);
- report_double_lock = true;
- }
- s->recursion += rec;
- if (first) {
- if (!thr->ignore_sync) {
- thr->clock.Acquire(s->clock);
- thr->clock.Acquire(s->read_clock);
- }
- }
- if (first && common_flags()->detect_deadlocks) {
- pre_lock = (flagz & MutexFlagDoPreLockOnPostLock) &&
- !(flagz & MutexFlagTryLock);
- Callback cb(thr, pc);
- if (pre_lock)
- ctx->dd->MutexBeforeLock(&cb, &s->dd, true);
- ctx->dd->MutexAfterLock(&cb, &s->dd, true, flagz & MutexFlagTryLock);
- }
+ SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+ Lock l(&s->mtx);
+ s->UpdateFlags(flagz);
+ thr->fast_state.IncrementEpoch();
+ TraceAddEvent(thr, thr->fast_state, EventTypeLock, s->GetId());
+ if (s->owner_tid == kInvalidTid) {
+ CHECK_EQ(s->recursion, 0);
+ s->owner_tid = thr->tid;
+ s->last_lock = thr->fast_state.raw();
+ } else if (s->owner_tid == thr->tid) {
+ CHECK_GT(s->recursion, 0);
+ } else if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) {
+ s->SetFlags(MutexFlagBroken);
+ report_double_lock = true;
+ }
+ first = s->recursion == 0;
+ s->recursion += rec;
+ if (first) {
+ AcquireImpl(thr, pc, &s->clock);
+ AcquireImpl(thr, pc, &s->read_clock);
+ } else if (!s->IsFlagSet(MutexFlagWriteReentrant)) {
}
+ thr->mset.Add(s->GetId(), true, thr->fast_state.epoch());
+ if (first && common_flags()->detect_deadlocks) {
+ pre_lock =
+ (flagz & MutexFlagDoPreLockOnPostLock) && !(flagz & MutexFlagTryLock);
+ Callback cb(thr, pc);
+ if (pre_lock)
+ ctx->dd->MutexBeforeLock(&cb, &s->dd, true);
+ ctx->dd->MutexAfterLock(&cb, &s->dd, true, flagz & MutexFlagTryLock);
+ }
+ mid = s->GetId();
}
if (report_double_lock)
- ReportMutexMisuse(thr, pc, ReportTypeMutexDoubleLock, addr,
- creation_stack_id);
+ ReportMutexMisuse(thr, pc, ReportTypeMutexDoubleLock, addr, mid);
if (first && pre_lock && common_flags()->detect_deadlocks) {
Callback cb(thr, pc);
ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
@@ -214,47 +211,40 @@ void MutexPostLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz, int rec) {
int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) {
DPrintf("#%d: MutexUnlock %zx flagz=0x%x\n", thr->tid, addr, flagz);
- if (pc && IsAppMem(addr))
+ if (IsAppMem(addr))
MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic);
- StackID creation_stack_id;
- RecordMutexUnlock(thr, addr);
+ u64 mid = 0;
bool report_bad_unlock = false;
int rec = 0;
{
- SlotLocker locker(thr);
- auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
- bool released = false;
- {
- Lock lock(&s->mtx);
- creation_stack_id = s->creation_stack_id;
- if (!SANITIZER_GO && (s->recursion == 0 || s->owner_tid != thr->tid)) {
- if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) {
- s->SetFlags(MutexFlagBroken);
- report_bad_unlock = true;
- }
- } else {
- rec = (flagz & MutexFlagRecursiveUnlock) ? s->recursion : 1;
- s->recursion -= rec;
- if (s->recursion == 0) {
- s->owner_tid = kInvalidTid;
- if (!thr->ignore_sync) {
- thr->clock.ReleaseStore(&s->clock);
- released = true;
- }
- }
+ SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+ Lock l(&s->mtx);
+ thr->fast_state.IncrementEpoch();
+ TraceAddEvent(thr, thr->fast_state, EventTypeUnlock, s->GetId());
+ if (!SANITIZER_GO && (s->recursion == 0 || s->owner_tid != thr->tid)) {
+ if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) {
+ s->SetFlags(MutexFlagBroken);
+ report_bad_unlock = true;
}
- if (common_flags()->detect_deadlocks && s->recursion == 0 &&
- !report_bad_unlock) {
- Callback cb(thr, pc);
- ctx->dd->MutexBeforeUnlock(&cb, &s->dd, true);
+ } else {
+ rec = (flagz & MutexFlagRecursiveUnlock) ? s->recursion : 1;
+ s->recursion -= rec;
+ if (s->recursion == 0) {
+ s->owner_tid = kInvalidTid;
+ ReleaseStoreImpl(thr, pc, &s->clock);
+ } else {
}
}
- if (released)
- IncrementEpoch(thr);
+ thr->mset.Del(s->GetId(), true);
+ if (common_flags()->detect_deadlocks && s->recursion == 0 &&
+ !report_bad_unlock) {
+ Callback cb(thr, pc);
+ ctx->dd->MutexBeforeUnlock(&cb, &s->dd, true);
+ }
+ mid = s->GetId();
}
if (report_bad_unlock)
- ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr,
- creation_stack_id);
+ ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr, mid);
if (common_flags()->detect_deadlocks && !report_bad_unlock) {
Callback cb(thr, pc);
ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
@@ -264,56 +254,53 @@ int MutexUnlock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) {
void MutexPreReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) {
DPrintf("#%d: MutexPreReadLock %zx flagz=0x%x\n", thr->tid, addr, flagz);
- if ((flagz & MutexFlagTryLock) || !common_flags()->detect_deadlocks)
- return;
- Callback cb(thr, pc);
- {
- SlotLocker locker(thr);
- auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
- ReadLock lock(&s->mtx);
- s->UpdateFlags(flagz);
- ctx->dd->MutexBeforeLock(&cb, &s->dd, false);
+ if (!(flagz & MutexFlagTryLock) && common_flags()->detect_deadlocks) {
+ {
+ SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+ ReadLock l(&s->mtx);
+ s->UpdateFlags(flagz);
+ Callback cb(thr, pc);
+ ctx->dd->MutexBeforeLock(&cb, &s->dd, false);
+ }
+ Callback cb(thr, pc);
+ ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
}
- ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
}
void MutexPostReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) {
DPrintf("#%d: MutexPostReadLock %zx flagz=0x%x\n", thr->tid, addr, flagz);
- if (pc && IsAppMem(addr))
+ if (IsAppMem(addr))
MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic);
+ u64 mid = 0;
bool report_bad_lock = false;
bool pre_lock = false;
- StackID creation_stack_id = kInvalidStackID;
{
- SlotLocker locker(thr);
- auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
- creation_stack_id = s->creation_stack_id;
- RecordMutexLock(thr, pc, addr, creation_stack_id, false);
- {
- ReadLock lock(&s->mtx);
- s->UpdateFlags(flagz);
- if (s->owner_tid != kInvalidTid) {
- if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) {
- s->SetFlags(MutexFlagBroken);
- report_bad_lock = true;
- }
- }
- if (!thr->ignore_sync)
- thr->clock.Acquire(s->clock);
- s->last_lock = thr->fast_state;
- if (common_flags()->detect_deadlocks) {
- pre_lock = (flagz & MutexFlagDoPreLockOnPostLock) &&
- !(flagz & MutexFlagTryLock);
- Callback cb(thr, pc);
- if (pre_lock)
- ctx->dd->MutexBeforeLock(&cb, &s->dd, false);
- ctx->dd->MutexAfterLock(&cb, &s->dd, false, flagz & MutexFlagTryLock);
+ SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+ ReadLock l(&s->mtx);
+ s->UpdateFlags(flagz);
+ thr->fast_state.IncrementEpoch();
+ TraceAddEvent(thr, thr->fast_state, EventTypeRLock, s->GetId());
+ if (s->owner_tid != kInvalidTid) {
+ if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) {
+ s->SetFlags(MutexFlagBroken);
+ report_bad_lock = true;
}
}
+ AcquireImpl(thr, pc, &s->clock);
+ s->last_lock = thr->fast_state.raw();
+ thr->mset.Add(s->GetId(), false, thr->fast_state.epoch());
+ if (common_flags()->detect_deadlocks) {
+ pre_lock =
+ (flagz & MutexFlagDoPreLockOnPostLock) && !(flagz & MutexFlagTryLock);
+ Callback cb(thr, pc);
+ if (pre_lock)
+ ctx->dd->MutexBeforeLock(&cb, &s->dd, false);
+ ctx->dd->MutexAfterLock(&cb, &s->dd, false, flagz & MutexFlagTryLock);
+ }
+ mid = s->GetId();
}
if (report_bad_lock)
- ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadLock, addr,
- creation_stack_id);
+ ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadLock, addr, mid);
if (pre_lock && common_flags()->detect_deadlocks) {
Callback cb(thr, pc);
ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
@@ -322,39 +309,31 @@ void MutexPostReadLock(ThreadState *thr, uptr pc, uptr addr, u32 flagz) {
void MutexReadUnlock(ThreadState *thr, uptr pc, uptr addr) {
DPrintf("#%d: MutexReadUnlock %zx\n", thr->tid, addr);
- if (pc && IsAppMem(addr))
+ if (IsAppMem(addr))
MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic);
- RecordMutexUnlock(thr, addr);
- StackID creation_stack_id;
+ u64 mid = 0;
bool report_bad_unlock = false;
{
- SlotLocker locker(thr);
- auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
- bool released = false;
- {
- Lock lock(&s->mtx);
- creation_stack_id = s->creation_stack_id;
- if (s->owner_tid != kInvalidTid) {
- if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) {
- s->SetFlags(MutexFlagBroken);
- report_bad_unlock = true;
- }
- }
- if (!thr->ignore_sync) {
- thr->clock.Release(&s->read_clock);
- released = true;
- }
- if (common_flags()->detect_deadlocks && s->recursion == 0) {
- Callback cb(thr, pc);
- ctx->dd->MutexBeforeUnlock(&cb, &s->dd, false);
+ SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+ Lock l(&s->mtx);
+ thr->fast_state.IncrementEpoch();
+ TraceAddEvent(thr, thr->fast_state, EventTypeRUnlock, s->GetId());
+ if (s->owner_tid != kInvalidTid) {
+ if (flags()->report_mutex_bugs && !s->IsFlagSet(MutexFlagBroken)) {
+ s->SetFlags(MutexFlagBroken);
+ report_bad_unlock = true;
}
}
- if (released)
- IncrementEpoch(thr);
+ ReleaseImpl(thr, pc, &s->read_clock);
+ if (common_flags()->detect_deadlocks && s->recursion == 0) {
+ Callback cb(thr, pc);
+ ctx->dd->MutexBeforeUnlock(&cb, &s->dd, false);
+ }
+ mid = s->GetId();
}
+ thr->mset.Del(mid, false);
if (report_bad_unlock)
- ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadUnlock, addr,
- creation_stack_id);
+ ReportMutexMisuse(thr, pc, ReportTypeMutexBadReadUnlock, addr, mid);
if (common_flags()->detect_deadlocks) {
Callback cb(thr, pc);
ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
@@ -363,52 +342,44 @@ void MutexReadUnlock(ThreadState *thr, uptr pc, uptr addr) {
void MutexReadOrWriteUnlock(ThreadState *thr, uptr pc, uptr addr) {
DPrintf("#%d: MutexReadOrWriteUnlock %zx\n", thr->tid, addr);
- if (pc && IsAppMem(addr))
+ if (IsAppMem(addr))
MemoryAccess(thr, pc, addr, 1, kAccessRead | kAccessAtomic);
- RecordMutexUnlock(thr, addr);
- StackID creation_stack_id;
+ u64 mid = 0;
bool report_bad_unlock = false;
- bool write = true;
{
- SlotLocker locker(thr);
- auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
- bool released = false;
- {
- Lock lock(&s->mtx);
- creation_stack_id = s->creation_stack_id;
- if (s->owner_tid == kInvalidTid) {
- // Seems to be read unlock.
- write = false;
- if (!thr->ignore_sync) {
- thr->clock.Release(&s->read_clock);
- released = true;
- }
- } else if (s->owner_tid == thr->tid) {
- // Seems to be write unlock.
- CHECK_GT(s->recursion, 0);
- s->recursion--;
- if (s->recursion == 0) {
- s->owner_tid = kInvalidTid;
- if (!thr->ignore_sync) {
- thr->clock.ReleaseStore(&s->clock);
- released = true;
- }
- }
- } else if (!s->IsFlagSet(MutexFlagBroken)) {
- s->SetFlags(MutexFlagBroken);
- report_bad_unlock = true;
- }
- if (common_flags()->detect_deadlocks && s->recursion == 0) {
- Callback cb(thr, pc);
- ctx->dd->MutexBeforeUnlock(&cb, &s->dd, write);
+ SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+ Lock l(&s->mtx);
+ bool write = true;
+ if (s->owner_tid == kInvalidTid) {
+ // Seems to be read unlock.
+ write = false;
+ thr->fast_state.IncrementEpoch();
+ TraceAddEvent(thr, thr->fast_state, EventTypeRUnlock, s->GetId());
+ ReleaseImpl(thr, pc, &s->read_clock);
+ } else if (s->owner_tid == thr->tid) {
+ // Seems to be write unlock.
+ thr->fast_state.IncrementEpoch();
+ TraceAddEvent(thr, thr->fast_state, EventTypeUnlock, s->GetId());
+ CHECK_GT(s->recursion, 0);
+ s->recursion--;
+ if (s->recursion == 0) {
+ s->owner_tid = kInvalidTid;
+ ReleaseStoreImpl(thr, pc, &s->clock);
+ } else {
}
+ } else if (!s->IsFlagSet(MutexFlagBroken)) {
+ s->SetFlags(MutexFlagBroken);
+ report_bad_unlock = true;
+ }
+ thr->mset.Del(s->GetId(), write);
+ if (common_flags()->detect_deadlocks && s->recursion == 0) {
+ Callback cb(thr, pc);
+ ctx->dd->MutexBeforeUnlock(&cb, &s->dd, write);
}
- if (released)
- IncrementEpoch(thr);
+ mid = s->GetId();
}
if (report_bad_unlock)
- ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr,
- creation_stack_id);
+ ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr, mid);
if (common_flags()->detect_deadlocks) {
Callback cb(thr, pc);
ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
@@ -417,120 +388,151 @@ void MutexReadOrWriteUnlock(ThreadState *thr, uptr pc, uptr addr) {
void MutexRepair(ThreadState *thr, uptr pc, uptr addr) {
DPrintf("#%d: MutexRepair %zx\n", thr->tid, addr);
- SlotLocker locker(thr);
- auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
- Lock lock(&s->mtx);
+ SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+ Lock l(&s->mtx);
s->owner_tid = kInvalidTid;
s->recursion = 0;
}
void MutexInvalidAccess(ThreadState *thr, uptr pc, uptr addr) {
DPrintf("#%d: MutexInvalidAccess %zx\n", thr->tid, addr);
- StackID creation_stack_id = kInvalidStackID;
- {
- SlotLocker locker(thr);
- auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
- if (s)
- creation_stack_id = s->creation_stack_id;
- }
- ReportMutexMisuse(thr, pc, ReportTypeMutexInvalidAccess, addr,
- creation_stack_id);
+ SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, true);
+ ReportMutexMisuse(thr, pc, ReportTypeMutexInvalidAccess, addr, s->GetId());
}
void Acquire(ThreadState *thr, uptr pc, uptr addr) {
DPrintf("#%d: Acquire %zx\n", thr->tid, addr);
if (thr->ignore_sync)
return;
- auto s = ctx->metamap.GetSyncIfExists(addr);
+ SyncVar *s = ctx->metamap.GetSyncIfExists(addr);
if (!s)
return;
- SlotLocker locker(thr);
- if (!s->clock)
- return;
- ReadLock lock(&s->mtx);
- thr->clock.Acquire(s->clock);
+ ReadLock l(&s->mtx);
+ AcquireImpl(thr, pc, &s->clock);
+}
+
+static void UpdateClockCallback(ThreadContextBase *tctx_base, void *arg) {
+ ThreadState *thr = reinterpret_cast<ThreadState*>(arg);
+ ThreadContext *tctx = static_cast<ThreadContext*>(tctx_base);
+ u64 epoch = tctx->epoch1;
+ if (tctx->status == ThreadStatusRunning) {
+ epoch = tctx->thr->fast_state.epoch();
+ tctx->thr->clock.NoteGlobalAcquire(epoch);
+ }
+ thr->clock.set(&thr->proc()->clock_cache, tctx->tid, epoch);
}
void AcquireGlobal(ThreadState *thr) {
DPrintf("#%d: AcquireGlobal\n", thr->tid);
if (thr->ignore_sync)
return;
- SlotLocker locker(thr);
- for (auto &slot : ctx->slots) thr->clock.Set(slot.sid, slot.epoch());
+ ThreadRegistryLock l(&ctx->thread_registry);
+ ctx->thread_registry.RunCallbackForEachThreadLocked(UpdateClockCallback, thr);
}
-void Release(ThreadState *thr, uptr pc, uptr addr) {
- DPrintf("#%d: Release %zx\n", thr->tid, addr);
+void ReleaseStoreAcquire(ThreadState *thr, uptr pc, uptr addr) {
+ DPrintf("#%d: ReleaseStoreAcquire %zx\n", thr->tid, addr);
if (thr->ignore_sync)
return;
- SlotLocker locker(thr);
- {
- auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false);
- Lock lock(&s->mtx);
- thr->clock.Release(&s->clock);
- }
- IncrementEpoch(thr);
+ SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false);
+ Lock l(&s->mtx);
+ thr->fast_state.IncrementEpoch();
+ // Can't increment epoch w/o writing to the trace as well.
+ TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
+ ReleaseStoreAcquireImpl(thr, pc, &s->clock);
}
-void ReleaseStore(ThreadState *thr, uptr pc, uptr addr) {
- DPrintf("#%d: ReleaseStore %zx\n", thr->tid, addr);
+void Release(ThreadState *thr, uptr pc, uptr addr) {
+ DPrintf("#%d: Release %zx\n", thr->tid, addr);
if (thr->ignore_sync)
return;
- SlotLocker locker(thr);
- {
- auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false);
- Lock lock(&s->mtx);
- thr->clock.ReleaseStore(&s->clock);
- }
- IncrementEpoch(thr);
+ SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false);
+ Lock l(&s->mtx);
+ thr->fast_state.IncrementEpoch();
+ // Can't increment epoch w/o writing to the trace as well.
+ TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
+ ReleaseImpl(thr, pc, &s->clock);
}
-void ReleaseStoreAcquire(ThreadState *thr, uptr pc, uptr addr) {
- DPrintf("#%d: ReleaseStoreAcquire %zx\n", thr->tid, addr);
+void ReleaseStore(ThreadState *thr, uptr pc, uptr addr) {
+ DPrintf("#%d: ReleaseStore %zx\n", thr->tid, addr);
if (thr->ignore_sync)
return;
- SlotLocker locker(thr);
- {
- auto s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false);
- Lock lock(&s->mtx);
- thr->clock.ReleaseStoreAcquire(&s->clock);
- }
- IncrementEpoch(thr);
+ SyncVar *s = ctx->metamap.GetSyncOrCreate(thr, pc, addr, false);
+ Lock l(&s->mtx);
+ thr->fast_state.IncrementEpoch();
+ // Can't increment epoch w/o writing to the trace as well.
+ TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
+ ReleaseStoreImpl(thr, pc, &s->clock);
}
-void IncrementEpoch(ThreadState *thr) {
- DCHECK(!thr->ignore_sync);
- DCHECK(thr->slot_locked);
- Epoch epoch = EpochInc(thr->fast_state.epoch());
- if (!EpochOverflow(epoch)) {
- Sid sid = thr->fast_state.sid();
- thr->clock.Set(sid, epoch);
- thr->fast_state.SetEpoch(epoch);
- thr->slot->SetEpoch(epoch);
- TraceTime(thr);
- }
+#if !SANITIZER_GO
+static void UpdateSleepClockCallback(ThreadContextBase *tctx_base, void *arg) {
+ ThreadState *thr = reinterpret_cast<ThreadState*>(arg);
+ ThreadContext *tctx = static_cast<ThreadContext*>(tctx_base);
+ u64 epoch = tctx->epoch1;
+ if (tctx->status == ThreadStatusRunning)
+ epoch = tctx->thr->fast_state.epoch();
+ thr->last_sleep_clock.set(&thr->proc()->clock_cache, tctx->tid, epoch);
}
-#if !SANITIZER_GO
void AfterSleep(ThreadState *thr, uptr pc) {
DPrintf("#%d: AfterSleep\n", thr->tid);
if (thr->ignore_sync)
return;
thr->last_sleep_stack_id = CurrentStackId(thr, pc);
- thr->last_sleep_clock.Reset();
- SlotLocker locker(thr);
- for (auto &slot : ctx->slots)
- thr->last_sleep_clock.Set(slot.sid, slot.epoch());
+ ThreadRegistryLock l(&ctx->thread_registry);
+ ctx->thread_registry.RunCallbackForEachThreadLocked(UpdateSleepClockCallback,
+ thr);
}
#endif
+void AcquireImpl(ThreadState *thr, uptr pc, SyncClock *c) {
+ if (thr->ignore_sync)
+ return;
+ thr->clock.set(thr->fast_state.epoch());
+ thr->clock.acquire(&thr->proc()->clock_cache, c);
+}
+
+void ReleaseStoreAcquireImpl(ThreadState *thr, uptr pc, SyncClock *c) {
+ if (thr->ignore_sync)
+ return;
+ thr->clock.set(thr->fast_state.epoch());
+ thr->fast_synch_epoch = thr->fast_state.epoch();
+ thr->clock.releaseStoreAcquire(&thr->proc()->clock_cache, c);
+}
+
+void ReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c) {
+ if (thr->ignore_sync)
+ return;
+ thr->clock.set(thr->fast_state.epoch());
+ thr->fast_synch_epoch = thr->fast_state.epoch();
+ thr->clock.release(&thr->proc()->clock_cache, c);
+}
+
+void ReleaseStoreImpl(ThreadState *thr, uptr pc, SyncClock *c) {
+ if (thr->ignore_sync)
+ return;
+ thr->clock.set(thr->fast_state.epoch());
+ thr->fast_synch_epoch = thr->fast_state.epoch();
+ thr->clock.ReleaseStore(&thr->proc()->clock_cache, c);
+}
+
+void AcquireReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c) {
+ if (thr->ignore_sync)
+ return;
+ thr->clock.set(thr->fast_state.epoch());
+ thr->fast_synch_epoch = thr->fast_state.epoch();
+ thr->clock.acq_rel(&thr->proc()->clock_cache, c);
+}
+
void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r) {
if (r == 0 || !ShouldReport(thr, ReportTypeDeadlock))
return;
ThreadRegistryLock l(&ctx->thread_registry);
ScopedReport rep(ReportTypeDeadlock);
for (int i = 0; i < r->n; i++) {
- rep.AddMutex(r->loop[i].mtx_ctx0, r->loop[i].stk[0]);
+ rep.AddMutex(r->loop[i].mtx_ctx0);
rep.AddUniqueTid((int)r->loop[i].thr_ctx);
rep.AddThread((int)r->loop[i].thr_ctx);
}
@@ -538,7 +540,7 @@ void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r) {
for (int i = 0; i < r->n; i++) {
for (int j = 0; j < (flags()->second_deadlock_stack ? 2 : 1); j++) {
u32 stk = r->loop[i].stk[j];
- if (stk && stk != kInvalidStackID) {
+ if (stk && stk != 0xffffffff) {
rep.AddStack(StackDepotGet(stk), true);
} else {
// Sometimes we fail to extract the stack trace (FIXME: investigate),
@@ -550,26 +552,4 @@ void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r) {
OutputReport(thr, rep);
}
-void ReportDestroyLocked(ThreadState *thr, uptr pc, uptr addr,
- FastState last_lock, StackID creation_stack_id) {
- SlotPairLocker locker(thr, last_lock.sid());
- ThreadRegistryLock l0(&ctx->thread_registry);
- Lock slots_lock(&ctx->slot_mtx);
- ScopedReport rep(ReportTypeMutexDestroyLocked);
- rep.AddMutex(addr, creation_stack_id);
- VarSizeStackTrace trace;
- ObtainCurrentStack(thr, pc, &trace);
- rep.AddStack(trace, true);
-
- Tid tid;
- DynamicMutexSet mset;
- uptr tag;
- if (!RestoreStack(EventType::kLock, last_lock.sid(), last_lock.epoch(), addr,
- 0, kAccessWrite, &tid, &trace, mset, &tag))
- return;
- rep.AddStack(trace, true);
- rep.AddLocation(addr, 1);
- OutputReport(thr, rep);
-}
-
} // namespace __tsan
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_proc.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl_proc.cpp
index 5acc3967208e..def61cca14d5 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl_proc.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_proc.cpp
@@ -35,6 +35,7 @@ void ProcDestroy(Processor *proc) {
#if !SANITIZER_GO
AllocatorProcFinish(proc);
#endif
+ ctx->clock_alloc.FlushCache(&proc->clock_cache);
ctx->metamap.OnProcIdle(proc);
if (common_flags()->detect_deadlocks)
ctx->dd->DestroyPhysicalThread(proc->dd_pt);
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp
index efe3abea6375..f332a6a8d1d8 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp
@@ -175,26 +175,22 @@ void ScopedReportBase::AddStack(StackTrace stack, bool suppressable) {
}
void ScopedReportBase::AddMemoryAccess(uptr addr, uptr external_tag, Shadow s,
- Tid tid, StackTrace stack,
- const MutexSet *mset) {
- uptr addr0, size;
- AccessType typ;
- s.GetAccess(&addr0, &size, &typ);
+ StackTrace stack, const MutexSet *mset) {
auto *mop = New<ReportMop>();
rep_->mops.PushBack(mop);
- mop->tid = tid;
- mop->addr = addr + addr0;
- mop->size = size;
- mop->write = !(typ & kAccessRead);
- mop->atomic = typ & kAccessAtomic;
+ mop->tid = s.tid();
+ mop->addr = addr + s.addr0();
+ mop->size = s.size();
+ mop->write = s.IsWrite();
+ mop->atomic = s.IsAtomic();
mop->stack = SymbolizeStack(stack);
mop->external_tag = external_tag;
if (mop->stack)
mop->stack->suppressable = true;
for (uptr i = 0; i < mset->Size(); i++) {
MutexSet::Desc d = mset->Get(i);
- u64 id = this->AddMutex(d.addr, d.stack_id);
- ReportMopMutex mtx = {id, d.write};
+ u64 mid = this->AddMutex(d.id);
+ ReportMopMutex mtx = {mid, d.write};
mop->mset.PushBack(mtx);
}
}
@@ -223,6 +219,18 @@ void ScopedReportBase::AddThread(const ThreadContext *tctx, bool suppressable) {
}
#if !SANITIZER_GO
+static bool FindThreadByUidLockedCallback(ThreadContextBase *tctx, void *arg) {
+ int unique_id = *(int *)arg;
+ return tctx->unique_id == (u32)unique_id;
+}
+
+static ThreadContext *FindThreadByUidLocked(Tid unique_id) {
+ ctx->thread_registry.CheckLocked();
+ return static_cast<ThreadContext *>(
+ ctx->thread_registry.FindThreadContextLocked(
+ FindThreadByUidLockedCallback, &unique_id));
+}
+
static ThreadContext *FindThreadByTidLocked(Tid tid) {
ctx->thread_registry.CheckLocked();
return static_cast<ThreadContext *>(
@@ -254,25 +262,55 @@ ThreadContext *IsThreadStackOrTls(uptr addr, bool *is_stack) {
}
#endif
-void ScopedReportBase::AddThread(Tid tid, bool suppressable) {
+void ScopedReportBase::AddThread(Tid unique_tid, bool suppressable) {
#if !SANITIZER_GO
- if (const ThreadContext *tctx = FindThreadByTidLocked(tid))
+ if (const ThreadContext *tctx = FindThreadByUidLocked(unique_tid))
AddThread(tctx, suppressable);
#endif
}
-int ScopedReportBase::AddMutex(uptr addr, StackID creation_stack_id) {
+void ScopedReportBase::AddMutex(const SyncVar *s) {
for (uptr i = 0; i < rep_->mutexes.Size(); i++) {
- if (rep_->mutexes[i]->addr == addr)
- return rep_->mutexes[i]->id;
+ if (rep_->mutexes[i]->id == s->uid)
+ return;
}
auto *rm = New<ReportMutex>();
rep_->mutexes.PushBack(rm);
- rm->id = rep_->mutexes.Size() - 1;
- rm->addr = addr;
+ rm->id = s->uid;
+ rm->addr = s->addr;
rm->destroyed = false;
- rm->stack = SymbolizeStackId(creation_stack_id);
- return rm->id;
+ rm->stack = SymbolizeStackId(s->creation_stack_id);
+}
+
+u64 ScopedReportBase::AddMutex(u64 id) {
+ u64 uid = 0;
+ u64 mid = id;
+ uptr addr = SyncVar::SplitId(id, &uid);
+ SyncVar *s = ctx->metamap.GetSyncIfExists(addr);
+ // Check that the mutex is still alive.
+ // Another mutex can be created at the same address,
+ // so check uid as well.
+ if (s && s->CheckId(uid)) {
+ Lock l(&s->mtx);
+ mid = s->uid;
+ AddMutex(s);
+ } else {
+ AddDeadMutex(id);
+ }
+ return mid;
+}
+
+void ScopedReportBase::AddDeadMutex(u64 id) {
+ for (uptr i = 0; i < rep_->mutexes.Size(); i++) {
+ if (rep_->mutexes[i]->id == id)
+ return;
+ }
+ auto *rm = New<ReportMutex>();
+ rep_->mutexes.PushBack(rm);
+ rm->id = id;
+ rm->addr = 0;
+ rm->destroyed = true;
+ rm->stack = 0;
}
void ScopedReportBase::AddLocation(uptr addr, uptr size) {
@@ -289,7 +327,7 @@ void ScopedReportBase::AddLocation(uptr addr, uptr size) {
loc->tid = creat_tid;
loc->stack = SymbolizeStackId(creat_stack);
rep_->locs.PushBack(loc);
- ThreadContext *tctx = FindThreadByTidLocked(creat_tid);
+ ThreadContext *tctx = FindThreadByUidLocked(creat_tid);
if (tctx)
AddThread(tctx);
return;
@@ -305,15 +343,16 @@ void ScopedReportBase::AddLocation(uptr addr, uptr size) {
if (!b)
b = JavaHeapBlock(addr, &block_begin);
if (b != 0) {
+ ThreadContext *tctx = FindThreadByTidLocked(b->tid);
auto *loc = New<ReportLocation>();
loc->type = ReportLocationHeap;
loc->heap_chunk_start = block_begin;
loc->heap_chunk_size = b->siz;
loc->external_tag = b->tag;
- loc->tid = b->tid;
+ loc->tid = tctx ? tctx->tid : b->tid;
loc->stack = SymbolizeStackId(b->stk);
rep_->locs.PushBack(loc);
- if (ThreadContext *tctx = FindThreadByTidLocked(b->tid))
+ if (tctx)
AddThread(tctx);
return;
}
@@ -348,6 +387,71 @@ ScopedReport::ScopedReport(ReportType typ, uptr tag)
ScopedReport::~ScopedReport() {}
+void RestoreStack(Tid tid, const u64 epoch, VarSizeStackTrace *stk,
+ MutexSet *mset, uptr *tag) {
+ // This function restores stack trace and mutex set for the thread/epoch.
+ // It does so by getting stack trace and mutex set at the beginning of
+  // the trace part, and then replaying the trace until the given epoch.
+ Trace* trace = ThreadTrace(tid);
+ ReadLock l(&trace->mtx);
+ const int partidx = (epoch / kTracePartSize) % TraceParts();
+ TraceHeader* hdr = &trace->headers[partidx];
+ if (epoch < hdr->epoch0 || epoch >= hdr->epoch0 + kTracePartSize)
+ return;
+ CHECK_EQ(RoundDown(epoch, kTracePartSize), hdr->epoch0);
+ const u64 epoch0 = RoundDown(epoch, TraceSize());
+ const u64 eend = epoch % TraceSize();
+ const u64 ebegin = RoundDown(eend, kTracePartSize);
+ DPrintf("#%d: RestoreStack epoch=%zu ebegin=%zu eend=%zu partidx=%d\n",
+ tid, (uptr)epoch, (uptr)ebegin, (uptr)eend, partidx);
+ Vector<uptr> stack;
+ stack.Resize(hdr->stack0.size + 64);
+ for (uptr i = 0; i < hdr->stack0.size; i++) {
+ stack[i] = hdr->stack0.trace[i];
+ DPrintf2(" #%02zu: pc=%zx\n", i, stack[i]);
+ }
+ if (mset)
+ *mset = hdr->mset0;
+ uptr pos = hdr->stack0.size;
+ Event *events = (Event*)GetThreadTrace(tid);
+ for (uptr i = ebegin; i <= eend; i++) {
+ Event ev = events[i];
+ EventType typ = (EventType)(ev >> kEventPCBits);
+ uptr pc = (uptr)(ev & ((1ull << kEventPCBits) - 1));
+ DPrintf2(" %zu typ=%d pc=%zx\n", i, typ, pc);
+ if (typ == EventTypeMop) {
+ stack[pos] = pc;
+ } else if (typ == EventTypeFuncEnter) {
+ if (stack.Size() < pos + 2)
+ stack.Resize(pos + 2);
+ stack[pos++] = pc;
+ } else if (typ == EventTypeFuncExit) {
+ if (pos > 0)
+ pos--;
+ }
+ if (mset) {
+ if (typ == EventTypeLock) {
+ mset->Add(pc, true, epoch0 + i);
+ } else if (typ == EventTypeUnlock) {
+ mset->Del(pc, true);
+ } else if (typ == EventTypeRLock) {
+ mset->Add(pc, false, epoch0 + i);
+ } else if (typ == EventTypeRUnlock) {
+ mset->Del(pc, false);
+ }
+ }
+ for (uptr j = 0; j <= pos; j++)
+ DPrintf2(" #%zu: %zx\n", j, stack[j]);
+ }
+ if (pos == 0 && stack[0] == 0)
+ return;
+ pos++;
+ stk->Init(&stack[0], pos);
+ ExtractTagFromStack(stk, tag);
+}
+
+namespace v3 {
+
// Replays the trace up to last_pos position in the last part
// or up to the provided epoch/sid (whichever is earlier)
// and calls the provided function f for each event.
@@ -365,7 +469,6 @@ void TraceReplay(Trace *trace, TracePart *last, Event *last_pos, Sid sid,
Event *end = &part->events[TracePart::kSize - 1];
if (part == last)
end = last_pos;
- f(kFreeSid, kEpochOver, nullptr); // notify about part start
for (Event *evp = &part->events[0]; evp < end; evp++) {
Event *evp0 = evp;
if (!evp->is_access && !evp->is_func) {
@@ -425,36 +528,21 @@ static constexpr bool IsWithinAccess(uptr addr1, uptr size1, uptr addr2,
return addr1 >= addr2 && addr1 + size1 <= addr2 + size2;
}
-// Replays the trace of slot sid up to the target event identified
-// by epoch/addr/size/typ and restores and returns tid, stack, mutex set
+// Replays the trace of thread tid up to the target event identified
+// by sid/epoch/addr/size/typ and restores and returns stack, mutex set
// and tag for that event. If there are multiple such events, it returns
// the last one. Returns false if the event is not present in the trace.
-bool RestoreStack(EventType type, Sid sid, Epoch epoch, uptr addr, uptr size,
- AccessType typ, Tid *ptid, VarSizeStackTrace *pstk,
+bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr,
+ uptr size, AccessType typ, VarSizeStackTrace *pstk,
MutexSet *pmset, uptr *ptag) {
// This function restores stack trace and mutex set for the thread/epoch.
// It does so by getting stack trace and mutex set at the beginning of
// the trace part, and then replaying the trace until the given epoch.
- DPrintf2("RestoreStack: sid=%u@%u addr=0x%zx/%zu typ=%x\n",
+ DPrintf2("RestoreStack: tid=%u sid=%u@%u addr=0x%zx/%zu typ=%x\n", tid,
static_cast<int>(sid), static_cast<int>(epoch), addr, size,
static_cast<int>(typ));
ctx->slot_mtx.CheckLocked(); // needed to prevent trace part recycling
ctx->thread_registry.CheckLocked();
- TidSlot *slot = &ctx->slots[static_cast<uptr>(sid)];
- Tid tid = kInvalidTid;
- // Need to lock the slot mutex as it protects slot->journal.
- slot->mtx.CheckLocked();
- for (uptr i = 0; i < slot->journal.Size(); i++) {
- DPrintf2(" journal: epoch=%d tid=%d\n",
- static_cast<int>(slot->journal[i].epoch), slot->journal[i].tid);
- if (i == slot->journal.Size() - 1 || slot->journal[i + 1].epoch > epoch) {
- tid = slot->journal[i].tid;
- break;
- }
- }
- if (tid == kInvalidTid)
- return false;
- *ptid = tid;
ThreadContext *tctx =
static_cast<ThreadContext *>(ctx->thread_registry.GetThreadLocked(tid));
Trace *trace = &tctx->trace;
@@ -465,10 +553,8 @@ bool RestoreStack(EventType type, Sid sid, Epoch epoch, uptr addr, uptr size,
{
Lock lock(&trace->mtx);
first_part = trace->parts.Front();
- if (!first_part) {
- DPrintf2("RestoreStack: tid=%d trace=%p no trace parts\n", tid, trace);
+ if (!first_part)
return false;
- }
last_part = trace->parts.Back();
last_pos = trace->final_pos;
if (tctx->thr)
@@ -481,18 +567,9 @@ bool RestoreStack(EventType type, Sid sid, Epoch epoch, uptr addr, uptr size,
bool is_read = typ & kAccessRead;
bool is_atomic = typ & kAccessAtomic;
bool is_free = typ & kAccessFree;
- DPrintf2("RestoreStack: tid=%d parts=[%p-%p] last_pos=%p\n", tid,
- trace->parts.Front(), last_part, last_pos);
TraceReplay(
trace, last_part, last_pos, sid, epoch,
[&](Sid ev_sid, Epoch ev_epoch, Event *evp) {
- if (evp == nullptr) {
- // Each trace part is self-consistent, so we reset state.
- stack.Resize(0);
- mset->Reset();
- prev_pc = 0;
- return;
- }
bool match = ev_sid == sid && ev_epoch == epoch;
if (evp->is_access) {
if (evp->is_func == 0 && evp->type == EventType::kAccessExt &&
@@ -515,15 +592,12 @@ bool RestoreStack(EventType type, Sid sid, Epoch epoch, uptr addr, uptr size,
if (evp->is_func) {
auto *ev = reinterpret_cast<EventFunc *>(evp);
if (ev->pc) {
- DPrintf2(" FuncEnter: pc=0x%llx\n", ev->pc);
+ DPrintf2(" FuncEnter: pc=0x%llx\n", ev->pc);
stack.PushBack(ev->pc);
} else {
- DPrintf2(" FuncExit\n");
-          // We don't log pathologically large stacks in each part;
- // if the stack was truncated we can have more func exits than
- // entries.
- if (stack.Size())
- stack.PopBack();
+ DPrintf2(" FuncExit\n");
+ CHECK(stack.Size());
+ stack.PopBack();
}
return;
}
@@ -592,6 +666,8 @@ bool RestoreStack(EventType type, Sid sid, Epoch epoch, uptr addr, uptr size,
return found;
}
+} // namespace v3
+
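Note: the RestoreStack replay above can be pictured with a small standalone sketch. SimpleEvent and ReplayStackForAccess below are illustrative stand-ins (not the runtime's Event/TraceReplay): function entries push a pc, function exits pop, and the stack captured at the last matching access is the one that gets reported.

#include <cstdint>
#include <vector>

// Simplified stand-in for a trace event (assumption, not the real layout).
struct SimpleEvent {
  enum Kind { FuncEnter, FuncExit, Access } kind;
  uint64_t pc;    // pc for FuncEnter/Access, unused for FuncExit
  uint64_t addr;  // address for Access, unused otherwise
};

// Replays events and returns the call stack at the *last* access to `addr`,
// mirroring the "prefer the last of the matching accesses" rule in
// RestoreStack. Returns an empty vector if no access matched.
std::vector<uint64_t> ReplayStackForAccess(const std::vector<SimpleEvent> &trace,
                                           uint64_t addr) {
  std::vector<uint64_t> stack, found;
  for (const SimpleEvent &ev : trace) {
    switch (ev.kind) {
      case SimpleEvent::FuncEnter:
        stack.push_back(ev.pc);
        break;
      case SimpleEvent::FuncExit:
        if (!stack.empty())  // defensive: tolerate more exits than entries
          stack.pop_back();
        break;
      case SimpleEvent::Access:
        if (ev.addr == addr) {
          found = stack;
          found.push_back(ev.pc);  // the access pc tops the restored stack
        }
        break;
    }
  }
  return found;
}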
bool RacyStacks::operator==(const RacyStacks &other) const {
if (hash[0] == other.hash[0] && hash[1] == other.hash[1])
return true;
@@ -682,7 +758,10 @@ bool OutputReport(ThreadState *thr, const ScopedReport &srep) {
ctx->fired_suppressions.push_back(s);
}
{
+ bool old_is_freeing = thr->is_freeing;
+ thr->is_freeing = false;
bool suppressed = OnReport(rep, pc_or_addr != 0);
+ thr->is_freeing = old_is_freeing;
if (suppressed) {
thr->current_report = nullptr;
return false;
@@ -729,91 +808,97 @@ static bool IsFiredSuppression(Context *ctx, ReportType type, uptr addr) {
return false;
}
-// We need to lock the target slot during RestoreStack because it protects
-// the slot journal. However, the target slot can be the slot of the current
-// thread or a different slot.
-SlotPairLocker::SlotPairLocker(ThreadState *thr,
- Sid sid) NO_THREAD_SAFETY_ANALYSIS : thr_(thr),
- slot_() {
- CHECK_NE(sid, kFreeSid);
- Lock l(&ctx->multi_slot_mtx);
- SlotLock(thr);
- if (sid == thr->slot->sid)
- return;
- slot_ = &ctx->slots[static_cast<uptr>(sid)];
- slot_->mtx.Lock();
-}
-
-SlotPairLocker::~SlotPairLocker() NO_THREAD_SAFETY_ANALYSIS {
- SlotUnlock(thr_);
- if (slot_)
- slot_->mtx.Unlock();
+static bool RaceBetweenAtomicAndFree(ThreadState *thr) {
+ Shadow s0(thr->racy_state[0]);
+ Shadow s1(thr->racy_state[1]);
+ CHECK(!(s0.IsAtomic() && s1.IsAtomic()));
+ if (!s0.IsAtomic() && !s1.IsAtomic())
+ return true;
+ if (s0.IsAtomic() && s1.IsFreed())
+ return true;
+ if (s1.IsAtomic() && thr->is_freeing)
+ return true;
+ return false;
}
-void ReportRace(ThreadState *thr, RawShadow *shadow_mem, Shadow cur, Shadow old,
- AccessType typ0) {
+void ReportRace(ThreadState *thr) {
CheckedMutex::CheckNoLocks();
// Symbolizer makes lots of intercepted calls. If we try to process them,
// at best it will cause deadlocks on internal mutexes.
ScopedIgnoreInterceptors ignore;
- uptr addr = ShadowToMem(shadow_mem);
- DPrintf("#%d: ReportRace %p\n", thr->tid, (void *)addr);
if (!ShouldReport(thr, ReportTypeRace))
return;
- uptr addr_off0, size0;
- cur.GetAccess(&addr_off0, &size0, nullptr);
- uptr addr_off1, size1, typ1;
- old.GetAccess(&addr_off1, &size1, &typ1);
- if (!flags()->report_atomic_races &&
- ((typ0 & kAccessAtomic) || (typ1 & kAccessAtomic)) &&
- !(typ0 & kAccessFree) && !(typ1 & kAccessFree))
+ if (!flags()->report_atomic_races && !RaceBetweenAtomicAndFree(thr))
return;
- const uptr kMop = 2;
- Shadow s[kMop] = {cur, old};
- uptr addr0 = addr + addr_off0;
- uptr addr1 = addr + addr_off1;
- uptr end0 = addr0 + size0;
- uptr end1 = addr1 + size1;
- uptr addr_min = min(addr0, addr1);
- uptr addr_max = max(end0, end1);
- if (IsExpectedReport(addr_min, addr_max - addr_min))
- return;
+ bool freed = false;
+ {
+ Shadow s(thr->racy_state[1]);
+ freed = s.GetFreedAndReset();
+ thr->racy_state[1] = s.raw();
+ }
+
+ uptr addr = ShadowToMem(thr->racy_shadow_addr);
+ uptr addr_min = 0;
+ uptr addr_max = 0;
+ {
+ uptr a0 = addr + Shadow(thr->racy_state[0]).addr0();
+ uptr a1 = addr + Shadow(thr->racy_state[1]).addr0();
+ uptr e0 = a0 + Shadow(thr->racy_state[0]).size();
+ uptr e1 = a1 + Shadow(thr->racy_state[1]).size();
+ addr_min = min(a0, a1);
+ addr_max = max(e0, e1);
+ if (IsExpectedReport(addr_min, addr_max - addr_min))
+ return;
+ }
if (HandleRacyAddress(thr, addr_min, addr_max))
return;
- ReportType rep_typ = ReportTypeRace;
- if ((typ0 & kAccessVptr) && (typ1 & kAccessFree))
- rep_typ = ReportTypeVptrUseAfterFree;
- else if (typ0 & kAccessVptr)
- rep_typ = ReportTypeVptrRace;
- else if (typ1 & kAccessFree)
- rep_typ = ReportTypeUseAfterFree;
+ ReportType typ = ReportTypeRace;
+ if (thr->is_vptr_access && freed)
+ typ = ReportTypeVptrUseAfterFree;
+ else if (thr->is_vptr_access)
+ typ = ReportTypeVptrRace;
+ else if (freed)
+ typ = ReportTypeUseAfterFree;
- if (IsFiredSuppression(ctx, rep_typ, addr))
+ if (IsFiredSuppression(ctx, typ, addr))
return;
+ const uptr kMop = 2;
VarSizeStackTrace traces[kMop];
- Tid tids[kMop] = {thr->tid, kInvalidTid};
- uptr tags[kMop] = {kExternalTagNone, kExternalTagNone};
-
- ObtainCurrentStack(thr, thr->trace_prev_pc, &traces[0], &tags[0]);
- if (IsFiredSuppression(ctx, rep_typ, traces[0]))
- return;
-
- DynamicMutexSet mset1;
- MutexSet *mset[kMop] = {&thr->mset, mset1};
-
- SlotPairLocker locker(thr, s[1].sid());
- ThreadRegistryLock l0(&ctx->thread_registry);
- Lock slots_lock(&ctx->slot_mtx);
- if (!RestoreStack(EventType::kAccessExt, s[1].sid(), s[1].epoch(), addr1,
- size1, typ1, &tids[1], &traces[1], mset[1], &tags[1]))
+ uptr tags[kMop] = {kExternalTagNone};
+ uptr toppc = TraceTopPC(thr);
+ if (toppc >> kEventPCBits) {
+ // This is a work-around for a known issue.
+ // The scenario where this happens is rather elaborate and requires
+ // an instrumented __sanitizer_report_error_summary callback and
+ // a __tsan_symbolize_external callback and a race during a range memory
+ // access larger than 8 bytes. MemoryAccessRange adds the current PC to
+ // the trace and starts processing memory accesses. A first memory access
+ // triggers a race, we report it and call the instrumented
+ // __sanitizer_report_error_summary, which adds more stuff to the trace
+ // since it is instrumented. Then a second memory access in MemoryAccessRange
+ // also triggers a race and we get here and call TraceTopPC to get the
+ // current PC, however now it contains some unrelated events from the
+ // callback. Most likely, TraceTopPC will now return a EventTypeFuncExit
+ // event. Later we subtract -1 from it (in GetPreviousInstructionPc)
+ // and the resulting PC has kExternalPCBit set, so we pass it to
+ // __tsan_symbolize_external_ex. __tsan_symbolize_external_ex is within its
+ // rights to crash since the PC is completely bogus.
+ // test/tsan/double_race.cpp contains a test case for this.
+ toppc = 0;
+ }
+ ObtainCurrentStack(thr, toppc, &traces[0], &tags[0]);
+ if (IsFiredSuppression(ctx, typ, traces[0]))
return;
- if (IsFiredSuppression(ctx, rep_typ, traces[1]))
+ DynamicMutexSet mset2;
+ Shadow s2(thr->racy_state[1]);
+ RestoreStack(s2.tid(), s2.epoch(), &traces[1], mset2, &tags[1]);
+ if (IsFiredSuppression(ctx, typ, traces[1]))
return;
if (HandleRacyStacks(thr, traces))
@@ -823,29 +908,39 @@ void ReportRace(ThreadState *thr, RawShadow *shadow_mem, Shadow cur, Shadow old,
uptr tag = kExternalTagNone;
for (uptr i = 0; i < kMop; i++) {
if (tags[i] != kExternalTagNone) {
- rep_typ = ReportTypeExternalRace;
+ typ = ReportTypeExternalRace;
tag = tags[i];
break;
}
}
- ScopedReport rep(rep_typ, tag);
- for (uptr i = 0; i < kMop; i++)
- rep.AddMemoryAccess(addr, tags[i], s[i], tids[i], traces[i], mset[i]);
+ ThreadRegistryLock l0(&ctx->thread_registry);
+ ScopedReport rep(typ, tag);
+ for (uptr i = 0; i < kMop; i++) {
+ Shadow s(thr->racy_state[i]);
+ rep.AddMemoryAccess(addr, tags[i], s, traces[i],
+ i == 0 ? &thr->mset : mset2);
+ }
for (uptr i = 0; i < kMop; i++) {
+ FastState s(thr->racy_state[i]);
ThreadContext *tctx = static_cast<ThreadContext *>(
- ctx->thread_registry.GetThreadLocked(tids[i]));
+ ctx->thread_registry.GetThreadLocked(s.tid()));
+ if (s.epoch() < tctx->epoch0 || s.epoch() > tctx->epoch1)
+ continue;
rep.AddThread(tctx);
}
rep.AddLocation(addr_min, addr_max - addr_min);
#if !SANITIZER_GO
- if (!((typ0 | typ1) & kAccessFree) &&
- s[1].epoch() <= thr->last_sleep_clock.Get(s[1].sid()))
- rep.AddSleep(thr->last_sleep_stack_id);
+ {
+ Shadow s(thr->racy_state[1]);
+ if (s.epoch() <= thr->last_sleep_clock.get(s.tid()))
+ rep.AddSleep(thr->last_sleep_stack_id);
+ }
#endif
+
OutputReport(thr, rep);
}
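The reverted ReportRace derives the reported address range from the two racy shadow values: each shadow contributes an offset within the 8-byte shadow cell (addr0) and a size, and the report covers the union of the two byte ranges. A minimal sketch, with RacyAccess as an assumed simplified stand-in for Shadow:

#include <algorithm>
#include <cstdint>

// Hypothetical stand-in: one access described by its offset within the
// 8-byte cell and its size, as the old addr0/size_log encoding does.
struct RacyAccess {
  uint64_t addr0;  // offset of the access within the cell (0..7)
  uint64_t size;   // access size in bytes (1, 2, 4 or 8)
};

// Returns the [min, max) byte range covered by the union of both accesses,
// matching the addr_min/addr_max computation in ReportRace.
void RacyRange(uint64_t cell_addr, RacyAccess a0, RacyAccess a1,
               uint64_t *addr_min, uint64_t *addr_max) {
  uint64_t b0 = cell_addr + a0.addr0, e0 = b0 + a0.size;
  uint64_t b1 = cell_addr + a1.addr0, e1 = b1 + a1.size;
  *addr_min = std::min(b0, b1);
  *addr_max = std::max(e0, e1);
}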
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_thread.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl_thread.cpp
index fc5088c336cd..c8f7124c009d 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl_thread.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_thread.cpp
@@ -21,14 +21,20 @@ namespace __tsan {
// ThreadContext implementation.
-ThreadContext::ThreadContext(Tid tid) : ThreadContextBase(tid), thr(), sync() {}
+ThreadContext::ThreadContext(Tid tid)
+ : ThreadContextBase(tid), thr(), sync(), epoch0(), epoch1() {}
#if !SANITIZER_GO
ThreadContext::~ThreadContext() {
}
#endif
-void ThreadContext::OnReset() { CHECK(!sync); }
+void ThreadContext::OnReset() {
+ CHECK_EQ(sync.size(), 0);
+ uptr trace_p = GetThreadTrace(tid);
+ ReleaseMemoryPagesToOS(trace_p, trace_p + TraceSize() * sizeof(Event));
+ //!!! ReleaseMemoryToOS(GetThreadTraceHeader(tid), sizeof(Trace));
+}
#if !SANITIZER_GO
struct ThreadLeak {
@@ -106,35 +112,30 @@ int ThreadCount(ThreadState *thr) {
}
struct OnCreatedArgs {
- VectorClock *sync;
- uptr sync_epoch;
- StackID stack;
+ ThreadState *thr;
+ uptr pc;
};
Tid ThreadCreate(ThreadState *thr, uptr pc, uptr uid, bool detached) {
- // The main thread and GCD workers don't have a parent thread.
- Tid parent = kInvalidTid;
- OnCreatedArgs arg = {nullptr, 0, kInvalidStackID};
- if (thr) {
- parent = thr->tid;
- arg.stack = CurrentStackId(thr, pc);
- if (!thr->ignore_sync) {
- SlotLocker locker(thr);
- thr->clock.ReleaseStore(&arg.sync);
- arg.sync_epoch = ctx->global_epoch;
- IncrementEpoch(thr);
- }
- }
- Tid tid = ctx->thread_registry.CreateThread(uid, detached, parent, &arg);
- DPrintf("#%d: ThreadCreate tid=%d uid=%zu\n", parent, tid, uid);
+ OnCreatedArgs args = { thr, pc };
+ u32 parent_tid = thr ? thr->tid : kInvalidTid; // No parent for GCD workers.
+ Tid tid = ctx->thread_registry.CreateThread(uid, detached, parent_tid, &args);
+ DPrintf("#%d: ThreadCreate tid=%d uid=%zu\n", parent_tid, tid, uid);
return tid;
}
void ThreadContext::OnCreated(void *arg) {
+ thr = 0;
+ if (tid == kMainTid)
+ return;
OnCreatedArgs *args = static_cast<OnCreatedArgs *>(arg);
- sync = args->sync;
- sync_epoch = args->sync_epoch;
- creation_stack_id = args->stack;
+ if (!args->thr) // GCD workers don't have a parent thread.
+ return;
+ args->thr->fast_state.IncrementEpoch();
+ // Can't increment epoch w/o writing to the trace as well.
+ TraceAddEvent(args->thr, args->thr->fast_state, EventTypeMop, 0);
+ ReleaseImpl(args->thr, 0, &sync);
+ creation_stack_id = CurrentStackId(args->thr, args->pc);
}
extern "C" void __tsan_stack_initialization() {}
@@ -149,15 +150,6 @@ struct OnStartedArgs {
void ThreadStart(ThreadState *thr, Tid tid, tid_t os_id,
ThreadType thread_type) {
- ctx->thread_registry.StartThread(tid, os_id, thread_type, thr);
- if (!thr->ignore_sync) {
- SlotAttachAndLock(thr);
- if (thr->tctx->sync_epoch == ctx->global_epoch)
- thr->clock.Acquire(thr->tctx->sync);
- SlotUnlock(thr);
- }
- Free(thr->tctx->sync);
-
uptr stk_addr = 0;
uptr stk_size = 0;
uptr tls_addr = 0;
@@ -167,10 +159,12 @@ void ThreadStart(ThreadState *thr, Tid tid, tid_t os_id,
GetThreadStackAndTls(tid == kMainTid, &stk_addr, &stk_size, &tls_addr,
&tls_size);
#endif
- thr->stk_addr = stk_addr;
- thr->stk_size = stk_size;
- thr->tls_addr = tls_addr;
- thr->tls_size = tls_size;
+
+ ThreadRegistry *tr = &ctx->thread_registry;
+ OnStartedArgs args = { thr, stk_addr, stk_size, tls_addr, tls_size };
+ tr->StartThread(tid, os_id, thread_type, &args);
+
+ while (!thr->tctx->trace.parts.Empty()) thr->tctx->trace.parts.PopBack();
#if !SANITIZER_GO
if (ctx->after_multithreaded_fork) {
@@ -198,41 +192,57 @@ void ThreadStart(ThreadState *thr, Tid tid, tid_t os_id,
}
void ThreadContext::OnStarted(void *arg) {
- thr = static_cast<ThreadState *>(arg);
- DPrintf("#%d: ThreadStart\n", tid);
- new (thr) ThreadState(tid);
+ OnStartedArgs *args = static_cast<OnStartedArgs *>(arg);
+ thr = args->thr;
+ // RoundUp so that one trace part does not contain events
+ // from different threads.
+ epoch0 = RoundUp(epoch1 + 1, kTracePartSize);
+ epoch1 = (u64)-1;
+ new (thr)
+ ThreadState(ctx, tid, unique_id, epoch0, reuse_count, args->stk_addr,
+ args->stk_size, args->tls_addr, args->tls_size);
if (common_flags()->detect_deadlocks)
- thr->dd_lt = ctx->dd->CreateLogicalThread(tid);
+ thr->dd_lt = ctx->dd->CreateLogicalThread(unique_id);
+ thr->fast_state.SetHistorySize(flags()->history_size);
+ // Commit switch to the new part of the trace.
+ // TraceAddEvent will reset stack0/mset0 in the new part for us.
+ TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
+
+ thr->fast_synch_epoch = epoch0;
+ AcquireImpl(thr, 0, &sync);
+ sync.Reset(&thr->proc()->clock_cache);
thr->tctx = this;
-#if !SANITIZER_GO
thr->is_inited = true;
-#endif
+ DPrintf(
+ "#%d: ThreadStart epoch=%zu stk_addr=%zx stk_size=%zx "
+ "tls_addr=%zx tls_size=%zx\n",
+ tid, (uptr)epoch0, args->stk_addr, args->stk_size, args->tls_addr,
+ args->tls_size);
}
void ThreadFinish(ThreadState *thr) {
- DPrintf("#%d: ThreadFinish\n", thr->tid);
ThreadCheckIgnore(thr);
if (thr->stk_addr && thr->stk_size)
DontNeedShadowFor(thr->stk_addr, thr->stk_size);
if (thr->tls_addr && thr->tls_size)
DontNeedShadowFor(thr->tls_addr, thr->tls_size);
thr->is_dead = true;
-#if !SANITIZER_GO
thr->is_inited = false;
+#if !SANITIZER_GO
thr->ignore_interceptors++;
- PlatformCleanUpThreadState(thr);
#endif
- if (!thr->ignore_sync) {
- SlotLocker locker(thr);
- ThreadRegistryLock lock(&ctx->thread_registry);
- // Note: detached is protected by the thread registry mutex,
- // the thread may be detaching concurrently in another thread.
- if (!thr->tctx->detached) {
- thr->clock.ReleaseStore(&thr->tctx->sync);
- thr->tctx->sync_epoch = ctx->global_epoch;
- IncrementEpoch(thr);
- }
+ ctx->thread_registry.FinishThread(thr->tid);
+}
+
+void ThreadContext::OnFinished() {
+ if (!detached) {
+ thr->fast_state.IncrementEpoch();
+ // Can't increment epoch w/o writing to the trace as well.
+ TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
+ ReleaseImpl(thr, 0, &sync);
}
+ epoch1 = thr->fast_state.epoch();
+
#if !SANITIZER_GO
UnmapOrDie(thr->shadow_stack, kShadowStackSize * sizeof(uptr));
#else
@@ -241,37 +251,18 @@ void ThreadFinish(ThreadState *thr) {
thr->shadow_stack = nullptr;
thr->shadow_stack_pos = nullptr;
thr->shadow_stack_end = nullptr;
+
if (common_flags()->detect_deadlocks)
ctx->dd->DestroyLogicalThread(thr->dd_lt);
- SlotDetach(thr);
- ctx->thread_registry.FinishThread(thr->tid);
+ thr->clock.ResetCached(&thr->proc()->clock_cache);
+#if !SANITIZER_GO
+ thr->last_sleep_clock.ResetCached(&thr->proc()->clock_cache);
+#endif
+#if !SANITIZER_GO
+ PlatformCleanUpThreadState(thr);
+#endif
thr->~ThreadState();
-}
-
-void ThreadContext::OnFinished() {
- Lock lock(&ctx->slot_mtx);
- Lock lock1(&trace.mtx);
- // Queue all trace parts into the global recycle queue.
- auto parts = &trace.parts;
- while (trace.local_head) {
- CHECK(parts->Queued(trace.local_head));
- ctx->trace_part_recycle.PushBack(trace.local_head);
- trace.local_head = parts->Next(trace.local_head);
- }
- ctx->trace_part_recycle_finished += parts->Size();
- if (ctx->trace_part_recycle_finished > Trace::kFinishedThreadHi) {
- ctx->trace_part_finished_excess += parts->Size();
- trace.parts_allocated = 0;
- } else if (ctx->trace_part_recycle_finished > Trace::kFinishedThreadLo &&
- parts->Size() > 1) {
- ctx->trace_part_finished_excess += parts->Size() - 1;
- trace.parts_allocated = 1;
- }
- // From now on replay will use trace->final_pos.
- trace.final_pos = (Event *)atomic_load_relaxed(&thr->trace_pos);
- atomic_store_relaxed(&thr->trace_pos, 0);
- thr->tctx = nullptr;
- thr = nullptr;
+ thr = 0;
}
struct ConsumeThreadContext {
@@ -283,43 +274,35 @@ Tid ThreadConsumeTid(ThreadState *thr, uptr pc, uptr uid) {
return ctx->thread_registry.ConsumeThreadUserId(uid);
}
-struct JoinArg {
- VectorClock *sync;
- uptr sync_epoch;
-};
-
void ThreadJoin(ThreadState *thr, uptr pc, Tid tid) {
CHECK_GT(tid, 0);
+ CHECK_LT(tid, kMaxTid);
DPrintf("#%d: ThreadJoin tid=%d\n", thr->tid, tid);
- JoinArg arg = {};
- ctx->thread_registry.JoinThread(tid, &arg);
- if (!thr->ignore_sync) {
- SlotLocker locker(thr);
- if (arg.sync_epoch == ctx->global_epoch)
- thr->clock.Acquire(arg.sync);
- }
- Free(arg.sync);
+ ctx->thread_registry.JoinThread(tid, thr);
}
-void ThreadContext::OnJoined(void *ptr) {
- auto arg = static_cast<JoinArg *>(ptr);
- arg->sync = sync;
- arg->sync_epoch = sync_epoch;
- sync = nullptr;
- sync_epoch = 0;
+void ThreadContext::OnJoined(void *arg) {
+ ThreadState *caller_thr = static_cast<ThreadState *>(arg);
+ AcquireImpl(caller_thr, 0, &sync);
+ sync.Reset(&caller_thr->proc()->clock_cache);
}
-void ThreadContext::OnDead() { CHECK_EQ(sync, nullptr); }
+void ThreadContext::OnDead() { CHECK_EQ(sync.size(), 0); }
void ThreadDetach(ThreadState *thr, uptr pc, Tid tid) {
CHECK_GT(tid, 0);
+ CHECK_LT(tid, kMaxTid);
ctx->thread_registry.DetachThread(tid, thr);
}
-void ThreadContext::OnDetached(void *arg) { Free(sync); }
+void ThreadContext::OnDetached(void *arg) {
+ ThreadState *thr1 = static_cast<ThreadState *>(arg);
+ sync.Reset(&thr1->proc()->clock_cache);
+}
void ThreadNotJoined(ThreadState *thr, uptr pc, Tid tid, uptr uid) {
CHECK_GT(tid, 0);
+ CHECK_LT(tid, kMaxTid);
ctx->thread_registry.SetThreadUserId(tid, uid);
}
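ThreadContext::OnStarted above rounds the starting epoch up to a kTracePartSize boundary so that a reused thread context never shares a trace part with events of the previous thread. A small sketch of that rounding, using the kTracePartSize value defined in tsan_trace.h further down in this patch:

#include <cstdint>

constexpr uint64_t kTracePartSizeBits = 13;
constexpr uint64_t kTracePartSize = 1ull << kTracePartSizeBits;

// Round a starting epoch up to the next trace-part boundary, as OnStarted
// does for epoch0 (epoch0 = RoundUp(epoch1 + 1, kTracePartSize)).
constexpr uint64_t RoundUpToPart(uint64_t epoch) {
  return (epoch + kTracePartSize - 1) / kTracePartSize * kTracePartSize;
}

static_assert(RoundUpToPart(0) == 0, "already aligned");
static_assert(RoundUpToPart(1) == kTracePartSize, "rounds up");
static_assert(RoundUpToPart(kTracePartSize) == kTracePartSize, "aligned stays");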
diff --git a/compiler-rt/lib/tsan/rtl/tsan_shadow.h b/compiler-rt/lib/tsan/rtl/tsan_shadow.h
index 843573ecf5d3..8b7bc341713e 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_shadow.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_shadow.h
@@ -10,170 +10,223 @@
#define TSAN_SHADOW_H
#include "tsan_defs.h"
+#include "tsan_trace.h"
namespace __tsan {
+// FastState (from most significant bit):
+// ignore : 1
+// tid : kTidBits
+// unused : -
+// history_size : 3
+// epoch : kClkBits
class FastState {
public:
- FastState() { Reset(); }
+ FastState(u64 tid, u64 epoch) {
+ x_ = tid << kTidShift;
+ x_ |= epoch;
+ DCHECK_EQ(tid, this->tid());
+ DCHECK_EQ(epoch, this->epoch());
+ DCHECK_EQ(GetIgnoreBit(), false);
+ }
+
+ explicit FastState(u64 x) : x_(x) {}
+
+ u64 raw() const { return x_; }
+
+ u64 tid() const {
+ u64 res = (x_ & ~kIgnoreBit) >> kTidShift;
+ return res;
+ }
+
+ u64 TidWithIgnore() const {
+ u64 res = x_ >> kTidShift;
+ return res;
+ }
+
+ u64 epoch() const {
+ u64 res = x_ & ((1ull << kClkBits) - 1);
+ return res;
+ }
- void Reset() {
- part_.unused0_ = 0;
- part_.sid_ = static_cast<u8>(kFreeSid);
- part_.epoch_ = static_cast<u16>(kEpochLast);
- part_.unused1_ = 0;
- part_.ignore_accesses_ = false;
+ void IncrementEpoch() {
+ u64 old_epoch = epoch();
+ x_ += 1;
+ DCHECK_EQ(old_epoch + 1, epoch());
+ (void)old_epoch;
}
- void SetSid(Sid sid) { part_.sid_ = static_cast<u8>(sid); }
+ void SetIgnoreBit() { x_ |= kIgnoreBit; }
+ void ClearIgnoreBit() { x_ &= ~kIgnoreBit; }
+ bool GetIgnoreBit() const { return (s64)x_ < 0; }
- Sid sid() const { return static_cast<Sid>(part_.sid_); }
+ void SetHistorySize(int hs) {
+ CHECK_GE(hs, 0);
+ CHECK_LE(hs, 7);
+ x_ = (x_ & ~(kHistoryMask << kHistoryShift)) | (u64(hs) << kHistoryShift);
+ }
- Epoch epoch() const { return static_cast<Epoch>(part_.epoch_); }
+ ALWAYS_INLINE
+ int GetHistorySize() const {
+ return (int)((x_ >> kHistoryShift) & kHistoryMask);
+ }
- void SetEpoch(Epoch epoch) { part_.epoch_ = static_cast<u16>(epoch); }
+ void ClearHistorySize() { SetHistorySize(0); }
- void SetIgnoreBit() { part_.ignore_accesses_ = 1; }
- void ClearIgnoreBit() { part_.ignore_accesses_ = 0; }
- bool GetIgnoreBit() const { return part_.ignore_accesses_; }
+ ALWAYS_INLINE
+ u64 GetTracePos() const {
+ const int hs = GetHistorySize();
+ // When hs == 0, the trace consists of 2 parts.
+ const u64 mask = (1ull << (kTracePartSizeBits + hs + 1)) - 1;
+ return epoch() & mask;
+ }
private:
friend class Shadow;
- struct Parts {
- u32 unused0_ : 8;
- u32 sid_ : 8;
- u32 epoch_ : kEpochBits;
- u32 unused1_ : 1;
- u32 ignore_accesses_ : 1;
- };
- union {
- Parts part_;
- u32 raw_;
- };
+ static const int kTidShift = 64 - kTidBits - 1;
+ static const u64 kIgnoreBit = 1ull << 63;
+ static const u64 kFreedBit = 1ull << 63;
+ static const u64 kHistoryShift = kClkBits;
+ static const u64 kHistoryMask = 7;
+ u64 x_;
};
-static_assert(sizeof(FastState) == kShadowSize, "bad FastState size");
-
-class Shadow {
+// Shadow (from most significant bit):
+// freed : 1
+// tid : kTidBits
+// is_atomic : 1
+// is_read : 1
+// size_log : 2
+// addr0 : 3
+// epoch : kClkBits
+class Shadow : public FastState {
public:
- static constexpr RawShadow kEmpty = static_cast<RawShadow>(0);
-
- Shadow(FastState state, u32 addr, u32 size, AccessType typ) {
- raw_ = state.raw_;
- DCHECK_GT(size, 0);
- DCHECK_LE(size, 8);
- UNUSED Sid sid0 = part_.sid_;
- UNUSED u16 epoch0 = part_.epoch_;
- raw_ |= (!!(typ & kAccessAtomic) << kIsAtomicShift) |
- (!!(typ & kAccessRead) << kIsReadShift) |
- (((((1u << size) - 1) << (addr & 0x7)) & 0xff) << kAccessShift);
- // Note: we don't check kAccessAtomic because it overlaps with
- // FastState::ignore_accesses_ and it may be set spuriously.
- DCHECK_EQ(part_.is_read_, !!(typ & kAccessRead));
- DCHECK_EQ(sid(), sid0);
- DCHECK_EQ(epoch(), epoch0);
- }
-
- explicit Shadow(RawShadow x = Shadow::kEmpty) { raw_ = static_cast<u32>(x); }
-
- RawShadow raw() const { return static_cast<RawShadow>(raw_); }
- Sid sid() const { return part_.sid_; }
- Epoch epoch() const { return static_cast<Epoch>(part_.epoch_); }
- u8 access() const { return part_.access_; }
-
- void GetAccess(uptr *addr, uptr *size, AccessType *typ) const {
- DCHECK(part_.access_ != 0 || raw_ == static_cast<u32>(Shadow::kRodata));
- if (addr)
- *addr = part_.access_ ? __builtin_ffs(part_.access_) - 1 : 0;
- if (size)
- *size = part_.access_ == kFreeAccess ? kShadowCell
- : __builtin_popcount(part_.access_);
- if (typ)
- *typ = (part_.is_read_ ? kAccessRead : kAccessWrite) |
- (part_.is_atomic_ ? kAccessAtomic : 0) |
- (part_.access_ == kFreeAccess ? kAccessFree : 0);
+ explicit Shadow(u64 x) : FastState(x) {}
+
+ explicit Shadow(const FastState &s) : FastState(s.x_) { ClearHistorySize(); }
+
+ void SetAddr0AndSizeLog(u64 addr0, unsigned kAccessSizeLog) {
+ DCHECK_EQ((x_ >> kClkBits) & 31, 0);
+ DCHECK_LE(addr0, 7);
+ DCHECK_LE(kAccessSizeLog, 3);
+ x_ |= ((kAccessSizeLog << 3) | addr0) << kClkBits;
+ DCHECK_EQ(kAccessSizeLog, size_log());
+ DCHECK_EQ(addr0, this->addr0());
}
- ALWAYS_INLINE
- bool IsBothReadsOrAtomic(AccessType typ) const {
- u32 is_read = !!(typ & kAccessRead);
- u32 is_atomic = !!(typ & kAccessAtomic);
- bool res =
- raw_ & ((is_atomic << kIsAtomicShift) | (is_read << kIsReadShift));
- DCHECK_EQ(res,
- (part_.is_read_ && is_read) || (part_.is_atomic_ && is_atomic));
+ void SetWrite(unsigned kAccessIsWrite) {
+ DCHECK_EQ(x_ & kReadBit, 0);
+ if (!kAccessIsWrite)
+ x_ |= kReadBit;
+ DCHECK_EQ(kAccessIsWrite, IsWrite());
+ }
+
+ void SetAtomic(bool kIsAtomic) {
+ DCHECK(!IsAtomic());
+ if (kIsAtomic)
+ x_ |= kAtomicBit;
+ DCHECK_EQ(IsAtomic(), kIsAtomic);
+ }
+
+ bool IsAtomic() const { return x_ & kAtomicBit; }
+
+ bool IsZero() const { return x_ == 0; }
+
+ static inline bool TidsAreEqual(const Shadow s1, const Shadow s2) {
+ u64 shifted_xor = (s1.x_ ^ s2.x_) >> kTidShift;
+ DCHECK_EQ(shifted_xor == 0, s1.TidWithIgnore() == s2.TidWithIgnore());
+ return shifted_xor == 0;
+ }
+
+ static ALWAYS_INLINE bool Addr0AndSizeAreEqual(const Shadow s1,
+ const Shadow s2) {
+ u64 masked_xor = ((s1.x_ ^ s2.x_) >> kClkBits) & 31;
+ return masked_xor == 0;
+ }
+
+ static ALWAYS_INLINE bool TwoRangesIntersect(Shadow s1, Shadow s2,
+ unsigned kS2AccessSize) {
+ bool res = false;
+ u64 diff = s1.addr0() - s2.addr0();
+ if ((s64)diff < 0) { // s1.addr0 < s2.addr0
+ // if (s1.addr0() + size1) > s2.addr0()) return true;
+ if (s1.size() > -diff)
+ res = true;
+ } else {
+ // if (s2.addr0() + kS2AccessSize > s1.addr0()) return true;
+ if (kS2AccessSize > diff)
+ res = true;
+ }
+ DCHECK_EQ(res, TwoRangesIntersectSlow(s1, s2));
+ DCHECK_EQ(res, TwoRangesIntersectSlow(s2, s1));
return res;
}
- ALWAYS_INLINE
- bool IsRWWeakerOrEqual(AccessType typ) const {
- u32 is_read = !!(typ & kAccessRead);
- u32 is_atomic = !!(typ & kAccessAtomic);
- UNUSED u32 res0 =
- (part_.is_atomic_ > is_atomic) ||
- (part_.is_atomic_ == is_atomic && part_.is_read_ >= is_read);
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- const u32 kAtomicReadMask = (1 << kIsAtomicShift) | (1 << kIsReadShift);
- bool res = (raw_ & kAtomicReadMask) >=
- ((is_atomic << kIsAtomicShift) | (is_read << kIsReadShift));
-
- DCHECK_EQ(res, res0);
+ u64 ALWAYS_INLINE addr0() const { return (x_ >> kClkBits) & 7; }
+ u64 ALWAYS_INLINE size() const { return 1ull << size_log(); }
+ bool ALWAYS_INLINE IsWrite() const { return !IsRead(); }
+ bool ALWAYS_INLINE IsRead() const { return x_ & kReadBit; }
+
+ // The idea behind the freed bit is as follows.
+ // When the memory is freed (or otherwise becomes inaccessible) we write
+ // shadow values with the tid/epoch of the free and the freed bit set.
+ // During memory access processing the freed bit is considered as the msb
+ // of the tid, so any access races with a shadow value that has the freed
+ // bit set (as if it were a write from a thread we never synchronized with).
+ // This allows us to detect accesses to freed memory w/o additional
+ // overheads in memory access processing and at the same time restore
+ // the tid/epoch of the free.
+ void MarkAsFreed() { x_ |= kFreedBit; }
+
+ bool IsFreed() const { return x_ & kFreedBit; }
+
+ bool GetFreedAndReset() {
+ bool res = x_ & kFreedBit;
+ x_ &= ~kFreedBit;
return res;
-#else
- return res0;
-#endif
}
- // The FreedMarker must not pass "the same access check" so that we don't
- // return from the race detection algorithm early.
- static RawShadow FreedMarker() {
- FastState fs;
- fs.SetSid(kFreeSid);
- fs.SetEpoch(kEpochLast);
- Shadow s(fs, 0, 8, kAccessWrite);
- return s.raw();
+ bool ALWAYS_INLINE IsBothReadsOrAtomic(bool kIsWrite, bool kIsAtomic) const {
+ bool v = x_ & ((u64(kIsWrite ^ 1) << kReadShift) |
+ (u64(kIsAtomic) << kAtomicShift));
+ DCHECK_EQ(v, (!IsWrite() && !kIsWrite) || (IsAtomic() && kIsAtomic));
+ return v;
}
- static RawShadow FreedInfo(Sid sid, Epoch epoch) {
- Shadow s;
- s.part_.sid_ = sid;
- s.part_.epoch_ = static_cast<u16>(epoch);
- s.part_.access_ = kFreeAccess;
- return s.raw();
+ bool ALWAYS_INLINE IsRWNotWeaker(bool kIsWrite, bool kIsAtomic) const {
+ bool v = ((x_ >> kReadShift) & 3) <= u64((kIsWrite ^ 1) | (kIsAtomic << 1));
+ DCHECK_EQ(v, (IsAtomic() < kIsAtomic) ||
+ (IsAtomic() == kIsAtomic && !IsWrite() <= !kIsWrite));
+ return v;
}
- private:
- struct Parts {
- u8 access_;
- Sid sid_;
- u16 epoch_ : kEpochBits;
- u16 is_read_ : 1;
- u16 is_atomic_ : 1;
- };
- union {
- Parts part_;
- u32 raw_;
- };
-
- static constexpr u8 kFreeAccess = 0x81;
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- static constexpr uptr kAccessShift = 0;
- static constexpr uptr kIsReadShift = 30;
- static constexpr uptr kIsAtomicShift = 31;
-#else
- static constexpr uptr kAccessShift = 24;
- static constexpr uptr kIsReadShift = 1;
- static constexpr uptr kIsAtomicShift = 0;
-#endif
+ bool ALWAYS_INLINE IsRWWeakerOrEqual(bool kIsWrite, bool kIsAtomic) const {
+ bool v = ((x_ >> kReadShift) & 3) >= u64((kIsWrite ^ 1) | (kIsAtomic << 1));
+ DCHECK_EQ(v, (IsAtomic() > kIsAtomic) ||
+ (IsAtomic() == kIsAtomic && !IsWrite() >= !kIsWrite));
+ return v;
+ }
- public:
- // .rodata shadow marker, see MapRodata and ContainsSameAccessFast.
- static constexpr RawShadow kRodata =
- static_cast<RawShadow>(1 << kIsReadShift);
+ private:
+ static const u64 kReadShift = 5 + kClkBits;
+ static const u64 kReadBit = 1ull << kReadShift;
+ static const u64 kAtomicShift = 6 + kClkBits;
+ static const u64 kAtomicBit = 1ull << kAtomicShift;
+
+ u64 size_log() const { return (x_ >> (3 + kClkBits)) & 3; }
+
+ static bool TwoRangesIntersectSlow(const Shadow s1, const Shadow s2) {
+ if (s1.addr0() == s2.addr0())
+ return true;
+ if (s1.addr0() < s2.addr0() && s1.addr0() + s1.size() > s2.addr0())
+ return true;
+ if (s2.addr0() < s1.addr0() && s2.addr0() + s2.size() > s1.addr0())
+ return true;
+ return false;
+ }
};
-static_assert(sizeof(Shadow) == kShadowSize, "bad Shadow size");
+const RawShadow kShadowRodata = (RawShadow)-1; // .rodata shadow marker
} // namespace __tsan
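The bit layouts restored above pack everything about one access into a single u64: the epoch in the low kClkBits, addr0/size_log/is_read/is_atomic just above it, the tid below the top bit, and the top bit doubling as the ignore bit (FastState) or the freed bit (Shadow). A minimal sketch of the packing and unpacking; the concrete kTidBits/kClkBits values are assumptions taken from the old tsan_defs.h, not from this patch:

#include <cassert>
#include <cstdint>

constexpr unsigned kTidBits = 13;   // assumed, see tsan_defs.h
constexpr unsigned kClkBits = 42;   // assumed, see tsan_defs.h
constexpr unsigned kTidShift = 64 - kTidBits - 1;
constexpr uint64_t kIgnoreBit = 1ull << 63;  // aliased as kFreedBit in Shadow

// Pack tid/epoch the way FastState does, then add addr0/size_log the way
// Shadow::SetAddr0AndSizeLog does.
uint64_t PackShadow(uint64_t tid, uint64_t epoch, uint64_t addr0,
                    unsigned size_log) {
  uint64_t x = (tid << kTidShift) | epoch;
  x |= ((uint64_t(size_log) << 3) | addr0) << kClkBits;
  return x;
}

int main() {
  uint64_t s = PackShadow(/*tid=*/11, /*epoch=*/22, /*addr0=*/4, /*size_log=*/2);
  assert(((s & ~kIgnoreBit) >> kTidShift) == 11);      // FastState::tid()
  assert((s & ((1ull << kClkBits) - 1)) == 22);        // FastState::epoch()
  assert(((s >> kClkBits) & 7) == 4);                  // Shadow::addr0()
  assert((1ull << ((s >> (3 + kClkBits)) & 3)) == 4);  // Shadow::size()
  return 0;
}

Note that Shadow reuses the bits right above the epoch which FastState uses for history_size; that is why the Shadow(const FastState &) constructor in the restored code clears the history size first.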
diff --git a/compiler-rt/lib/tsan/rtl/tsan_sync.cpp b/compiler-rt/lib/tsan/rtl/tsan_sync.cpp
index 09d41780d188..f042abab74e5 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_sync.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_sync.cpp
@@ -18,31 +18,43 @@ namespace __tsan {
void DDMutexInit(ThreadState *thr, uptr pc, SyncVar *s);
-SyncVar::SyncVar() : mtx(MutexTypeSyncVar) { Reset(); }
+SyncVar::SyncVar() : mtx(MutexTypeSyncVar) { Reset(0); }
-void SyncVar::Init(ThreadState *thr, uptr pc, uptr addr, bool save_stack) {
- Reset();
+void SyncVar::Init(ThreadState *thr, uptr pc, uptr addr, u64 uid,
+ bool save_stack) {
this->addr = addr;
- next = 0;
+ this->uid = uid;
+ this->next = 0;
+
+ creation_stack_id = kInvalidStackID;
if (save_stack && !SANITIZER_GO) // Go does not use them
creation_stack_id = CurrentStackId(thr, pc);
if (common_flags()->detect_deadlocks)
DDMutexInit(thr, pc, this);
}
-void SyncVar::Reset() {
- CHECK(!ctx->resetting);
+void SyncVar::Reset(Processor *proc) {
+ uid = 0;
creation_stack_id = kInvalidStackID;
owner_tid = kInvalidTid;
- last_lock.Reset();
+ last_lock = 0;
recursion = 0;
atomic_store_relaxed(&flags, 0);
- Free(clock);
- Free(read_clock);
+
+ if (proc == 0) {
+ CHECK_EQ(clock.size(), 0);
+ CHECK_EQ(read_clock.size(), 0);
+ } else {
+ clock.Reset(&proc->clock_cache);
+ read_clock.Reset(&proc->clock_cache);
+ }
}
MetaMap::MetaMap()
- : block_alloc_("heap block allocator"), sync_alloc_("sync allocator") {}
+ : block_alloc_(LINKER_INITIALIZED, "heap block allocator"),
+ sync_alloc_(LINKER_INITIALIZED, "sync allocator") {
+ atomic_store(&uid_gen_, 0, memory_order_relaxed);
+}
void MetaMap::AllocBlock(ThreadState *thr, uptr pc, uptr p, uptr sz) {
u32 idx = block_alloc_.Alloc(&thr->proc()->block_cache);
@@ -56,16 +68,16 @@ void MetaMap::AllocBlock(ThreadState *thr, uptr pc, uptr p, uptr sz) {
*meta = idx | kFlagBlock;
}
-uptr MetaMap::FreeBlock(Processor *proc, uptr p, bool reset) {
+uptr MetaMap::FreeBlock(Processor *proc, uptr p) {
MBlock* b = GetBlock(p);
if (b == 0)
return 0;
uptr sz = RoundUpTo(b->siz, kMetaShadowCell);
- FreeRange(proc, p, sz, reset);
+ FreeRange(proc, p, sz);
return sz;
}
-bool MetaMap::FreeRange(Processor *proc, uptr p, uptr sz, bool reset) {
+bool MetaMap::FreeRange(Processor *proc, uptr p, uptr sz) {
bool has_something = false;
u32 *meta = MemToMeta(p);
u32 *end = MemToMeta(p + sz);
@@ -87,8 +99,7 @@ bool MetaMap::FreeRange(Processor *proc, uptr p, uptr sz, bool reset) {
DCHECK(idx & kFlagSync);
SyncVar *s = sync_alloc_.Map(idx & ~kFlagMask);
u32 next = s->next;
- if (reset)
- s->Reset();
+ s->Reset(proc);
sync_alloc_.Free(&proc->sync_cache, idx & ~kFlagMask);
idx = next;
} else {
@@ -105,30 +116,30 @@ bool MetaMap::FreeRange(Processor *proc, uptr p, uptr sz, bool reset) {
// which can be huge. The function probes pages one-by-one until it finds a page
// without meta objects; at this point it stops freeing meta objects. Because
// thread stacks grow top-down, we do the same starting from the end as well.
-void MetaMap::ResetRange(Processor *proc, uptr p, uptr sz, bool reset) {
+void MetaMap::ResetRange(Processor *proc, uptr p, uptr sz) {
if (SANITIZER_GO) {
// UnmapOrDie/MmapFixedNoReserve does not work on Windows,
// so we do the optimization only for C/C++.
- FreeRange(proc, p, sz, reset);
+ FreeRange(proc, p, sz);
return;
}
const uptr kMetaRatio = kMetaShadowCell / kMetaShadowSize;
const uptr kPageSize = GetPageSizeCached() * kMetaRatio;
if (sz <= 4 * kPageSize) {
// If the range is small, just do the normal free procedure.
- FreeRange(proc, p, sz, reset);
+ FreeRange(proc, p, sz);
return;
}
// First, round both ends of the range to page size.
uptr diff = RoundUp(p, kPageSize) - p;
if (diff != 0) {
- FreeRange(proc, p, diff, reset);
+ FreeRange(proc, p, diff);
p += diff;
sz -= diff;
}
diff = p + sz - RoundDown(p + sz, kPageSize);
if (diff != 0) {
- FreeRange(proc, p + sz - diff, diff, reset);
+ FreeRange(proc, p + sz - diff, diff);
sz -= diff;
}
// Now we must have a non-empty page-aligned range.
@@ -139,7 +150,7 @@ void MetaMap::ResetRange(Processor *proc, uptr p, uptr sz, bool reset) {
const uptr sz0 = sz;
// Probe start of the range.
for (uptr checked = 0; sz > 0; checked += kPageSize) {
- bool has_something = FreeRange(proc, p, kPageSize, reset);
+ bool has_something = FreeRange(proc, p, kPageSize);
p += kPageSize;
sz -= kPageSize;
if (!has_something && checked > (128 << 10))
@@ -147,7 +158,7 @@ void MetaMap::ResetRange(Processor *proc, uptr p, uptr sz, bool reset) {
}
// Probe end of the range.
for (uptr checked = 0; sz > 0; checked += kPageSize) {
- bool has_something = FreeRange(proc, p + sz - kPageSize, kPageSize, reset);
+ bool has_something = FreeRange(proc, p + sz - kPageSize, kPageSize);
sz -= kPageSize;
// Stacks grow down, so sync objects are most likely at the end of the region
// (if it is a stack). The very end of the stack is TLS and tsan increases
@@ -166,27 +177,6 @@ void MetaMap::ResetRange(Processor *proc, uptr p, uptr sz, bool reset) {
Die();
}
-void MetaMap::ResetClocks() {
- // This can be called from the background thread
- // which does not have proc/cache.
- // The cache is too large for stack.
- static InternalAllocatorCache cache;
- internal_memset(&cache, 0, sizeof(cache));
- internal_allocator()->InitCache(&cache);
- sync_alloc_.ForEach([&](SyncVar *s) {
- if (s->clock) {
- InternalFree(s->clock, &cache);
- s->clock = nullptr;
- }
- if (s->read_clock) {
- InternalFree(s->read_clock, &cache);
- s->read_clock = nullptr;
- }
- s->last_lock.Reset();
- });
- internal_allocator()->DestroyCache(&cache);
-}
-
MBlock* MetaMap::GetBlock(uptr p) {
u32 *meta = MemToMeta(p);
u32 idx = *meta;
@@ -203,7 +193,6 @@ MBlock* MetaMap::GetBlock(uptr p) {
SyncVar *MetaMap::GetSync(ThreadState *thr, uptr pc, uptr addr, bool create,
bool save_stack) {
- DCHECK(!create || thr->slot_locked);
u32 *meta = MemToMeta(addr);
u32 idx0 = *meta;
u32 myidx = 0;
@@ -214,7 +203,7 @@ SyncVar *MetaMap::GetSync(ThreadState *thr, uptr pc, uptr addr, bool create,
SyncVar * s = sync_alloc_.Map(idx & ~kFlagMask);
if (LIKELY(s->addr == addr)) {
if (UNLIKELY(myidx != 0)) {
- mys->Reset();
+ mys->Reset(thr->proc());
sync_alloc_.Free(&thr->proc()->sync_cache, myidx);
}
return s;
@@ -229,9 +218,10 @@ SyncVar *MetaMap::GetSync(ThreadState *thr, uptr pc, uptr addr, bool create,
}
if (LIKELY(myidx == 0)) {
+ const u64 uid = atomic_fetch_add(&uid_gen_, 1, memory_order_relaxed);
myidx = sync_alloc_.Alloc(&thr->proc()->sync_cache);
mys = sync_alloc_.Map(myidx);
- mys->Init(thr, pc, addr, save_stack);
+ mys->Init(thr, pc, addr, uid, save_stack);
}
mys->next = idx0;
if (atomic_compare_exchange_strong((atomic_uint32_t*)meta, &idx0,
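ResetRange above avoids touching every page of a potentially huge range by freeing the unaligned head and tail, then probing whole pages from both ends and stopping once the probed pages turn out to be empty. A sketch of that strategy, with free_page as an assumed stand-in for FreeRange (returning whether the page had any meta objects) and limit standing in for the probing slack constant:

#include <cstdint>

// The real code handles small ranges with a plain FreeRange before reaching
// this point; this sketch assumes sz is already several pages long.
template <typename FreePage>
void ProbeRange(uint64_t p, uint64_t sz, uint64_t page, uint64_t limit,
                FreePage free_page) {
  uint64_t head = (p + page - 1) / page * page - p;  // RoundUp(p, page) - p
  if (head) { free_page(p, head); p += head; sz -= head; }
  uint64_t tail = (p + sz) % page;                   // unaligned tail bytes
  if (tail) { free_page(p + sz - tail, tail); sz -= tail; }
  // Probe whole pages from the start of the range.
  for (uint64_t checked = 0; sz > 0; checked += page) {
    bool had = free_page(p, page);
    p += page; sz -= page;
    if (!had && checked > limit) break;
  }
  // Probe from the end (stacks grow down, sync objects cluster there).
  for (uint64_t checked = 0; sz > 0; checked += page) {
    bool had = free_page(p + sz - page, page);
    sz -= page;
    if (!had && checked > limit) break;
  }
}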
diff --git a/compiler-rt/lib/tsan/rtl/tsan_sync.h b/compiler-rt/lib/tsan/rtl/tsan_sync.h
index 1e5f828349c6..fc8fa288a841 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_sync.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_sync.h
@@ -15,11 +15,9 @@
#include "sanitizer_common/sanitizer_atomic.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_deadlock_detector_interface.h"
-#include "tsan_clock.h"
#include "tsan_defs.h"
+#include "tsan_clock.h"
#include "tsan_dense_alloc.h"
-#include "tsan_shadow.h"
-#include "tsan_vector_clock.h"
namespace __tsan {
@@ -55,18 +53,34 @@ struct SyncVar {
uptr addr; // overwritten by DenseSlabAlloc freelist
Mutex mtx;
+ u64 uid; // Globally unique id.
StackID creation_stack_id;
Tid owner_tid; // Set only by exclusive owners.
- FastState last_lock;
+ u64 last_lock;
int recursion;
atomic_uint32_t flags;
u32 next; // in MetaMap
DDMutex dd;
- VectorClock *read_clock; // Used for rw mutexes only.
- VectorClock *clock;
+ SyncClock read_clock; // Used for rw mutexes only.
+ // The clock is placed last, so that it is situated on a different cache
+ // line from the mtx. This reduces contention for hot sync objects.
+ SyncClock clock;
- void Init(ThreadState *thr, uptr pc, uptr addr, bool save_stack);
- void Reset();
+ void Init(ThreadState *thr, uptr pc, uptr addr, u64 uid, bool save_stack);
+ void Reset(Processor *proc);
+
+ u64 GetId() const {
+ // 48 lsb is addr, then 14 bits is low part of uid, then 2 zero bits.
+ return GetLsb((u64)addr | (uid << 48), 60);
+ }
+ bool CheckId(u64 uid) const {
+ CHECK_EQ(uid, GetLsb(uid, 14));
+ return GetLsb(this->uid, 14) == uid;
+ }
+ static uptr SplitId(u64 id, u64 *uid) {
+ *uid = id >> 48;
+ return (uptr)GetLsb(id, 48);
+ }
bool IsFlagSet(u32 f) const {
return atomic_load_relaxed(&flags) & f;
@@ -96,20 +110,9 @@ class MetaMap {
MetaMap();
void AllocBlock(ThreadState *thr, uptr pc, uptr p, uptr sz);
-
- // FreeBlock resets all sync objects in the range if reset=true and must not
- // run concurrently with ResetClocks which resets all sync objects
- // w/o any synchronization (as part of DoReset).
- // If we don't have a thread slot (very early/late in thread lifetime or
- // Go/Java callbacks) or the slot is not locked, then reset must be set to
- // false. In such case sync object clocks will be reset later (when it's
- // reused or during the next ResetClocks).
- uptr FreeBlock(Processor *proc, uptr p, bool reset);
- bool FreeRange(Processor *proc, uptr p, uptr sz, bool reset);
- void ResetRange(Processor *proc, uptr p, uptr sz, bool reset);
- // Reset vector clocks of all sync objects.
- // Must be called when no other threads access sync objects.
- void ResetClocks();
+ uptr FreeBlock(Processor *proc, uptr p);
+ bool FreeRange(Processor *proc, uptr p, uptr sz);
+ void ResetRange(Processor *proc, uptr p, uptr sz);
MBlock* GetBlock(uptr p);
SyncVar *GetSyncOrCreate(ThreadState *thr, uptr pc, uptr addr,
@@ -139,6 +142,7 @@ class MetaMap {
typedef DenseSlabAlloc<SyncVar, 1 << 20, 1 << 10, kFlagMask> SyncAlloc;
BlockAlloc block_alloc_;
SyncAlloc sync_alloc_;
+ atomic_uint64_t uid_gen_;
SyncVar *GetSync(ThreadState *thr, uptr pc, uptr addr, bool create,
bool save_stack);
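SyncVar::GetId/SplitId above fold the sync object's address and (the low bits of) its uid into one u64 so a mutex id can be carried through the deadlock detector and reports. A sketch of that round trip; GetLsb here is a local helper mirroring the runtime's, and the sample address is arbitrary:

#include <cassert>
#include <cstdint>

static uint64_t GetLsb(uint64_t v, int bits) {
  return v & ((1ull << bits) - 1);
}

uint64_t MakeId(uint64_t addr, uint64_t uid) {
  // 48 lsb is addr, then the low bits of uid; the result is truncated to
  // 60 bits as in SyncVar::GetId.
  return GetLsb(addr | (uid << 48), 60);
}

uint64_t SplitId(uint64_t id, uint64_t *uid) {
  *uid = id >> 48;
  return GetLsb(id, 48);
}

int main() {
  uint64_t uid = 0;
  uint64_t id = MakeId(/*addr=*/0x7f1234567890ull, /*uid=*/5);
  uint64_t addr = SplitId(id, &uid);
  assert(addr == 0x7f1234567890ull);
  assert(uid == 5);
  return 0;
}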
diff --git a/compiler-rt/lib/tsan/rtl/tsan_trace.h b/compiler-rt/lib/tsan/rtl/tsan_trace.h
index 01bb7b34f43a..ffc8c991ece0 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_trace.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_trace.h
@@ -19,6 +19,57 @@
namespace __tsan {
+const int kTracePartSizeBits = 13;
+const int kTracePartSize = 1 << kTracePartSizeBits;
+const int kTraceParts = 2 * 1024 * 1024 / kTracePartSize;
+const int kTraceSize = kTracePartSize * kTraceParts;
+
+// Must fit into 3 bits.
+enum EventType {
+ EventTypeMop,
+ EventTypeFuncEnter,
+ EventTypeFuncExit,
+ EventTypeLock,
+ EventTypeUnlock,
+ EventTypeRLock,
+ EventTypeRUnlock
+};
+
+// Represents a thread event (from most significant bit):
+// u64 typ : 3; // EventType.
+// u64 addr : 61; // Associated pc.
+typedef u64 Event;
+
+const uptr kEventPCBits = 61;
+
+struct TraceHeader {
+#if !SANITIZER_GO
+ BufferedStackTrace stack0; // Start stack for the trace.
+#else
+ VarSizeStackTrace stack0;
+#endif
+ u64 epoch0; // Start epoch for the trace.
+ MutexSet mset0;
+
+ TraceHeader() : stack0(), epoch0() {}
+};
+
+struct Trace {
+ Mutex mtx;
+#if !SANITIZER_GO
+ // Must be last to catch overflow as paging fault.
+ // Go shadow stack is dynamically allocated.
+ uptr shadow_stack[kShadowStackSize];
+#endif
+ // Must be the last field, because we unmap the unused part in
+ // CreateThreadContext.
+ TraceHeader headers[kTraceParts];
+
+ Trace() : mtx(MutexTypeTrace) {}
+};
+
+namespace v3 {
+
enum class EventType : u64 {
kAccessExt,
kAccessRange,
@@ -166,7 +217,6 @@ struct Trace;
struct TraceHeader {
Trace* trace = nullptr; // back-pointer to Trace containing this part
INode trace_parts; // in Trace::parts
- INode global; // in Contex::trace_part_recycle
};
struct TracePart : TraceHeader {
@@ -189,27 +239,14 @@ static_assert(sizeof(TracePart) == TracePart::kByteSize, "bad TracePart size");
struct Trace {
Mutex mtx;
IList<TraceHeader, &TraceHeader::trace_parts, TracePart> parts;
- // First node non-queued into ctx->trace_part_recycle.
- TracePart* local_head;
- // Final position in the last part for finished threads.
- Event* final_pos = nullptr;
- // Number of trace parts allocated on behalf of this trace specifically.
- // Total number of parts in this trace can be larger if we retake some
- // parts from other traces.
- uptr parts_allocated = 0;
+ Event* final_pos =
+ nullptr; // final position in the last part for finished threads
Trace() : mtx(MutexTypeTrace) {}
-
- // We need at least 3 parts per thread, because we want to keep at last
- // 2 parts per thread that are not queued into ctx->trace_part_recycle
- // (the current one being filled and one full part that ensures that
- // we always have at least one part worth of previous memory accesses).
- static constexpr uptr kMinParts = 3;
-
- static constexpr uptr kFinishedThreadLo = 16;
- static constexpr uptr kFinishedThreadHi = 64;
};
+} // namespace v3
+
} // namespace __tsan
#endif // TSAN_TRACE_H
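The restored v2 trace stores each event as a single u64: the top 3 bits hold the EventType and the low kEventPCBits (61) hold the pc or address, which is also why ReportRace above treats a pc with bits above kEventPCBits set as bogus. A minimal encode/decode sketch with illustrative helper names:

#include <cassert>
#include <cstdint>

constexpr uint64_t kEventPCBits = 61;

uint64_t EncodeEvent(uint64_t type, uint64_t pc) {
  // Top 3 bits: EventType; low 61 bits: pc/addr.
  return (type << kEventPCBits) | (pc & ((1ull << kEventPCBits) - 1));
}

uint64_t EventTypeOf(uint64_t ev) { return ev >> kEventPCBits; }
uint64_t EventPCOf(uint64_t ev) { return ev & ((1ull << kEventPCBits) - 1); }

int main() {
  uint64_t ev = EncodeEvent(/*EventTypeFuncEnter=*/1, 0x401234);
  assert(EventTypeOf(ev) == 1);
  assert(EventPCOf(ev) == 0x401234);
  return 0;
}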
diff --git a/compiler-rt/lib/tsan/rtl/tsan_update_shadow_word.inc b/compiler-rt/lib/tsan/rtl/tsan_update_shadow_word.inc
new file mode 100644
index 000000000000..a58ef0f17efa
--- /dev/null
+++ b/compiler-rt/lib/tsan/rtl/tsan_update_shadow_word.inc
@@ -0,0 +1,59 @@
+//===-- tsan_update_shadow_word.inc -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer (TSan), a race detector.
+//
+// Body of the hottest inner loop.
+// If we wrap this body into a function, compilers (both gcc and clang)
+// produce slightly less efficient code.
+//===----------------------------------------------------------------------===//
+do {
+ const unsigned kAccessSize = 1 << kAccessSizeLog;
+ u64 *sp = &shadow_mem[idx];
+ old = LoadShadow(sp);
+ if (LIKELY(old.IsZero())) {
+ if (!stored) {
+ StoreIfNotYetStored(sp, &store_word);
+ stored = true;
+ }
+ break;
+ }
+ // Is the memory access equal to the previous one?
+ if (LIKELY(Shadow::Addr0AndSizeAreEqual(cur, old))) {
+ // same thread?
+ if (LIKELY(Shadow::TidsAreEqual(old, cur))) {
+ if (LIKELY(old.IsRWWeakerOrEqual(kAccessIsWrite, kIsAtomic))) {
+ StoreIfNotYetStored(sp, &store_word);
+ stored = true;
+ }
+ break;
+ }
+ if (HappensBefore(old, thr)) {
+ if (old.IsRWWeakerOrEqual(kAccessIsWrite, kIsAtomic)) {
+ StoreIfNotYetStored(sp, &store_word);
+ stored = true;
+ }
+ break;
+ }
+ if (LIKELY(old.IsBothReadsOrAtomic(kAccessIsWrite, kIsAtomic)))
+ break;
+ goto RACE;
+ }
+ // Do the memory accesses intersect?
+ if (Shadow::TwoRangesIntersect(old, cur, kAccessSize)) {
+ if (Shadow::TidsAreEqual(old, cur))
+ break;
+ if (old.IsBothReadsOrAtomic(kAccessIsWrite, kIsAtomic))
+ break;
+ if (LIKELY(HappensBefore(old, thr)))
+ break;
+ goto RACE;
+ }
+ // The accesses do not intersect.
+ break;
+} while (0);
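The loop body above is included by the memory-access fast path once per shadow slot; its decision logic can be summarized in a standalone sketch. OldAccess below is an assumed simplification that collapses the Shadow comparisons and the happens-before query into booleans:

// Minimal, self-contained sketch of the decision logic in the .inc body.
struct OldAccess {
  bool same_addr_and_size;   // Shadow::Addr0AndSizeAreEqual(cur, old)
  bool ranges_intersect;     // Shadow::TwoRangesIntersect(old, cur, size)
  bool same_thread;          // Shadow::TidsAreEqual(old, cur)
  bool both_read_or_atomic;  // old.IsBothReadsOrAtomic(is_write, is_atomic)
  bool happens_before;       // HappensBefore(old, thr)
};

// Returns true if the pair (old shadow value, current access) is a race,
// following the same ordering of checks as the loop body above.
bool IsRace(const OldAccess &old) {
  if (old.same_addr_and_size) {
    if (old.same_thread)
      return false;  // same thread: never a race
    if (old.happens_before)
      return false;  // ordered by happens-before
    return !old.both_read_or_atomic;
  }
  if (old.ranges_intersect) {
    if (old.same_thread || old.both_read_or_atomic || old.happens_before)
      return false;
    return true;
  }
  return false;  // the accesses do not intersect
}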
diff --git a/compiler-rt/lib/tsan/tests/unit/tsan_flags_test.cpp b/compiler-rt/lib/tsan/tests/unit/tsan_flags_test.cpp
index 1e6dfe597ad7..cb8ce91e9743 100644
--- a/compiler-rt/lib/tsan/tests/unit/tsan_flags_test.cpp
+++ b/compiler-rt/lib/tsan/tests/unit/tsan_flags_test.cpp
@@ -98,7 +98,7 @@ void VerifyOptions1(Flags *f) {
EXPECT_EQ(f->memory_limit_mb, 666);
EXPECT_EQ(f->stop_on_start, 0);
EXPECT_EQ(f->running_on_valgrind, 0);
- EXPECT_EQ(f->history_size, (uptr)5);
+ EXPECT_EQ(f->history_size, 5);
EXPECT_EQ(f->io_sync, 1);
EXPECT_EQ(f->die_after_fork, true);
}
@@ -122,7 +122,7 @@ void VerifyOptions2(Flags *f) {
EXPECT_EQ(f->memory_limit_mb, 456);
EXPECT_EQ(f->stop_on_start, true);
EXPECT_EQ(f->running_on_valgrind, true);
- EXPECT_EQ(f->history_size, 6ul);
+ EXPECT_EQ(f->history_size, 6);
EXPECT_EQ(f->io_sync, 2);
EXPECT_EQ(f->die_after_fork, false);
}
diff --git a/compiler-rt/lib/tsan/tests/unit/tsan_shadow_test.cpp b/compiler-rt/lib/tsan/tests/unit/tsan_shadow_test.cpp
index ba49df7deda3..890a12213bf3 100644
--- a/compiler-rt/lib/tsan/tests/unit/tsan_shadow_test.cpp
+++ b/compiler-rt/lib/tsan/tests/unit/tsan_shadow_test.cpp
@@ -15,70 +15,34 @@
namespace __tsan {
-void CheckShadow(const Shadow *s, Sid sid, Epoch epoch, uptr addr, uptr size,
- AccessType typ) {
- uptr addr1 = 0;
- uptr size1 = 0;
- AccessType typ1 = 0;
- s->GetAccess(&addr1, &size1, &typ1);
- CHECK_EQ(s->sid(), sid);
- CHECK_EQ(s->epoch(), epoch);
- CHECK_EQ(addr1, addr);
- CHECK_EQ(size1, size);
- CHECK_EQ(typ1, typ);
-}
-
-TEST(Shadow, Shadow) {
- Sid sid = static_cast<Sid>(11);
- Epoch epoch = static_cast<Epoch>(22);
- FastState fs;
- fs.SetSid(sid);
- fs.SetEpoch(epoch);
- CHECK_EQ(fs.sid(), sid);
- CHECK_EQ(fs.epoch(), epoch);
- CHECK_EQ(fs.GetIgnoreBit(), false);
- fs.SetIgnoreBit();
- CHECK_EQ(fs.GetIgnoreBit(), true);
- fs.ClearIgnoreBit();
- CHECK_EQ(fs.GetIgnoreBit(), false);
-
- Shadow s0(fs, 1, 2, kAccessWrite);
- CheckShadow(&s0, sid, epoch, 1, 2, kAccessWrite);
- Shadow s1(fs, 2, 3, kAccessRead);
- CheckShadow(&s1, sid, epoch, 2, 3, kAccessRead);
- Shadow s2(fs, 0xfffff8 + 4, 1, kAccessWrite | kAccessAtomic);
- CheckShadow(&s2, sid, epoch, 4, 1, kAccessWrite | kAccessAtomic);
- Shadow s3(fs, 0xfffff8 + 0, 8, kAccessRead | kAccessAtomic);
- CheckShadow(&s3, sid, epoch, 0, 8, kAccessRead | kAccessAtomic);
-
- CHECK(!s0.IsBothReadsOrAtomic(kAccessRead | kAccessAtomic));
- CHECK(!s1.IsBothReadsOrAtomic(kAccessAtomic));
- CHECK(!s1.IsBothReadsOrAtomic(kAccessWrite));
- CHECK(s1.IsBothReadsOrAtomic(kAccessRead));
- CHECK(s2.IsBothReadsOrAtomic(kAccessAtomic));
- CHECK(!s2.IsBothReadsOrAtomic(kAccessWrite));
- CHECK(!s2.IsBothReadsOrAtomic(kAccessRead));
- CHECK(s3.IsBothReadsOrAtomic(kAccessAtomic));
- CHECK(!s3.IsBothReadsOrAtomic(kAccessWrite));
- CHECK(s3.IsBothReadsOrAtomic(kAccessRead));
-
- CHECK(!s0.IsRWWeakerOrEqual(kAccessRead | kAccessAtomic));
- CHECK(s1.IsRWWeakerOrEqual(kAccessWrite));
- CHECK(s1.IsRWWeakerOrEqual(kAccessRead));
- CHECK(!s1.IsRWWeakerOrEqual(kAccessWrite | kAccessAtomic));
-
- CHECK(!s2.IsRWWeakerOrEqual(kAccessRead | kAccessAtomic));
- CHECK(s2.IsRWWeakerOrEqual(kAccessWrite | kAccessAtomic));
- CHECK(s2.IsRWWeakerOrEqual(kAccessRead));
- CHECK(s2.IsRWWeakerOrEqual(kAccessWrite));
-
- CHECK(s3.IsRWWeakerOrEqual(kAccessRead | kAccessAtomic));
- CHECK(s3.IsRWWeakerOrEqual(kAccessWrite | kAccessAtomic));
- CHECK(s3.IsRWWeakerOrEqual(kAccessRead));
- CHECK(s3.IsRWWeakerOrEqual(kAccessWrite));
-
- Shadow sro(Shadow::kRodata);
- CheckShadow(&sro, static_cast<Sid>(0), kEpochZero, 0, 0, kAccessRead);
+TEST(Shadow, FastState) {
+ Shadow s(FastState(11, 22));
+ EXPECT_EQ(s.tid(), (u64)11);
+ EXPECT_EQ(s.epoch(), (u64)22);
+ EXPECT_EQ(s.GetIgnoreBit(), false);
+ EXPECT_EQ(s.GetFreedAndReset(), false);
+ EXPECT_EQ(s.GetHistorySize(), 0);
+ EXPECT_EQ(s.addr0(), (u64)0);
+ EXPECT_EQ(s.size(), (u64)1);
+ EXPECT_EQ(s.IsWrite(), true);
+
+ s.IncrementEpoch();
+ EXPECT_EQ(s.epoch(), (u64)23);
+ s.IncrementEpoch();
+ EXPECT_EQ(s.epoch(), (u64)24);
+
+ s.SetIgnoreBit();
+ EXPECT_EQ(s.GetIgnoreBit(), true);
+ s.ClearIgnoreBit();
+ EXPECT_EQ(s.GetIgnoreBit(), false);
+
+ for (int i = 0; i < 8; i++) {
+ s.SetHistorySize(i);
+ EXPECT_EQ(s.GetHistorySize(), i);
+ }
+ s.SetHistorySize(2);
+ s.ClearHistorySize();
+ EXPECT_EQ(s.GetHistorySize(), 0);
}
TEST(Shadow, Mapping) {
diff --git a/compiler-rt/lib/tsan/tests/unit/tsan_stack_test.cpp b/compiler-rt/lib/tsan/tests/unit/tsan_stack_test.cpp
index ba3fbb35999f..23590caa3019 100644
--- a/compiler-rt/lib/tsan/tests/unit/tsan_stack_test.cpp
+++ b/compiler-rt/lib/tsan/tests/unit/tsan_stack_test.cpp
@@ -18,7 +18,7 @@ namespace __tsan {
template <typename StackTraceTy>
static void TestStackTrace(StackTraceTy *trace) {
- ThreadState thr(kMainTid);
+ ThreadState thr(0, 0, 0, 0, 0, 0, 0, 0, 0);
ObtainCurrentStack(&thr, 0, trace);
EXPECT_EQ(0U, trace->size);
@@ -43,7 +43,7 @@ static void TestStackTrace(StackTraceTy *trace) {
template<typename StackTraceTy>
static void TestTrim(StackTraceTy *trace) {
- ThreadState thr(kMainTid);
+ ThreadState thr(0, 0, 0, 0, 0, 0, 0, 0, 0);
for (uptr i = 0; i < 2 * kStackTraceMax; ++i)
*thr.shadow_stack_pos++ = 100 + i;
diff --git a/compiler-rt/lib/tsan/tests/unit/tsan_sync_test.cpp b/compiler-rt/lib/tsan/tests/unit/tsan_sync_test.cpp
index 87a28f2bc2b1..8e6c98590530 100644
--- a/compiler-rt/lib/tsan/tests/unit/tsan_sync_test.cpp
+++ b/compiler-rt/lib/tsan/tests/unit/tsan_sync_test.cpp
@@ -17,7 +17,6 @@ namespace __tsan {
TEST(MetaMap, Basic) {
ThreadState *thr = cur_thread();
- SlotLocker locker(thr);
MetaMap *m = &ctx->metamap;
u64 block[1] = {}; // fake malloc block
m->AllocBlock(thr, 0, (uptr)&block[0], 1 * sizeof(u64));
@@ -25,7 +24,7 @@ TEST(MetaMap, Basic) {
CHECK_NE(mb, (MBlock *)0);
CHECK_EQ(mb->siz, 1 * sizeof(u64));
CHECK_EQ(mb->tid, thr->tid);
- uptr sz = m->FreeBlock(thr->proc(), (uptr)&block[0], true);
+ uptr sz = m->FreeBlock(thr->proc(), (uptr)&block[0]);
CHECK_EQ(sz, 1 * sizeof(u64));
mb = m->GetBlock((uptr)&block[0]);
CHECK_EQ(mb, (MBlock *)0);
@@ -33,7 +32,6 @@ TEST(MetaMap, Basic) {
TEST(MetaMap, FreeRange) {
ThreadState *thr = cur_thread();
- SlotLocker locker(thr);
MetaMap *m = &ctx->metamap;
u64 block[4] = {}; // fake malloc block
m->AllocBlock(thr, 0, (uptr)&block[0], 1 * sizeof(u64));
@@ -42,7 +40,7 @@ TEST(MetaMap, FreeRange) {
CHECK_EQ(mb1->siz, 1 * sizeof(u64));
MBlock *mb2 = m->GetBlock((uptr)&block[1]);
CHECK_EQ(mb2->siz, 3 * sizeof(u64));
- m->FreeRange(thr->proc(), (uptr)&block[0], 4 * sizeof(u64), true);
+ m->FreeRange(thr->proc(), (uptr)&block[0], 4 * sizeof(u64));
mb1 = m->GetBlock((uptr)&block[0]);
CHECK_EQ(mb1, (MBlock *)0);
mb2 = m->GetBlock((uptr)&block[1]);
@@ -54,7 +52,6 @@ TEST(MetaMap, Sync) {
// them from detecting that we exit runtime with mutexes held.
ScopedIgnoreInterceptors ignore;
ThreadState *thr = cur_thread();
- SlotLocker locker(thr);
MetaMap *m = &ctx->metamap;
u64 block[4] = {}; // fake malloc block
m->AllocBlock(thr, 0, (uptr)&block[0], 4 * sizeof(u64));
@@ -66,7 +63,7 @@ TEST(MetaMap, Sync) {
SyncVar *s2 = m->GetSyncOrCreate(thr, 0, (uptr)&block[1], false);
CHECK_NE(s2, (SyncVar *)0);
CHECK_EQ(s2->addr, (uptr)&block[1]);
- m->FreeBlock(thr->proc(), (uptr)&block[0], true);
+ m->FreeBlock(thr->proc(), (uptr)&block[0]);
s1 = m->GetSyncIfExists((uptr)&block[0]);
CHECK_EQ(s1, (SyncVar *)0);
s2 = m->GetSyncIfExists((uptr)&block[1]);
@@ -77,7 +74,6 @@ TEST(MetaMap, Sync) {
TEST(MetaMap, MoveMemory) {
ScopedIgnoreInterceptors ignore;
ThreadState *thr = cur_thread();
- SlotLocker locker(thr);
MetaMap *m = &ctx->metamap;
u64 block1[4] = {}; // fake malloc block
u64 block2[4] = {}; // fake malloc block
@@ -106,19 +102,18 @@ TEST(MetaMap, MoveMemory) {
s2 = m->GetSyncIfExists((uptr)&block2[1]);
CHECK_NE(s2, (SyncVar *)0);
CHECK_EQ(s2->addr, (uptr)&block2[1]);
- m->FreeRange(thr->proc(), (uptr)&block2[0], 4 * sizeof(u64), true);
+ m->FreeRange(thr->proc(), (uptr)&block2[0], 4 * sizeof(u64));
}
TEST(MetaMap, ResetSync) {
ScopedIgnoreInterceptors ignore;
ThreadState *thr = cur_thread();
- SlotLocker locker(thr);
MetaMap *m = &ctx->metamap;
u64 block[1] = {}; // fake malloc block
m->AllocBlock(thr, 0, (uptr)&block[0], 1 * sizeof(u64));
SyncVar *s = m->GetSyncOrCreate(thr, 0, (uptr)&block[0], false);
- s->Reset();
- uptr sz = m->FreeBlock(thr->proc(), (uptr)&block[0], true);
+ s->Reset(thr->proc());
+ uptr sz = m->FreeBlock(thr->proc(), (uptr)&block[0]);
CHECK_EQ(sz, 1 * sizeof(u64));
}
diff --git a/compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp b/compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp
index 13c03353e70e..c2e852d941c0 100644
--- a/compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp
+++ b/compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp
@@ -31,6 +31,8 @@
namespace __tsan {
+using namespace v3;
+
// We need to run all trace tests in a new thread,
// so that the thread trace is empty initially.
template <uptr N>
@@ -76,30 +78,27 @@ TRACE_TEST(Trace, RestoreAccess) {
ThreadArray<1> thr;
TraceFunc(thr, 0x1000);
TraceFunc(thr, 0x1001);
- TraceMutexLock(thr, EventType::kLock, 0x4000, 0x5000, 0x6000);
- TraceMutexLock(thr, EventType::kLock, 0x4001, 0x5001, 0x6001);
+ TraceMutexLock(thr, v3::EventType::kLock, 0x4000, 0x5000, 0x6000);
+ TraceMutexLock(thr, v3::EventType::kLock, 0x4001, 0x5001, 0x6001);
TraceMutexUnlock(thr, 0x5000);
TraceFunc(thr);
CHECK(TryTraceMemoryAccess(thr, 0x2001, 0x3001, 8, kAccessRead));
- TraceMutexLock(thr, EventType::kRLock, 0x4002, 0x5002, 0x6002);
+ TraceMutexLock(thr, v3::EventType::kRLock, 0x4002, 0x5002, 0x6002);
TraceFunc(thr, 0x1002);
CHECK(TryTraceMemoryAccess(thr, 0x2000, 0x3000, 8, kAccessRead));
// This is the access we want to find.
// The previous one is equivalent, but RestoreStack must prefer
// the last of the matching accesses.
CHECK(TryTraceMemoryAccess(thr, 0x2002, 0x3000, 8, kAccessRead));
- SlotPairLocker locker(thr, thr->fast_state.sid());
- ThreadRegistryLock lock1(&ctx->thread_registry);
- Lock lock2(&ctx->slot_mtx);
- Tid tid = kInvalidTid;
+ Lock lock1(&ctx->slot_mtx);
+ ThreadRegistryLock lock2(&ctx->thread_registry);
VarSizeStackTrace stk;
MutexSet mset;
uptr tag = kExternalTagNone;
- bool res = RestoreStack(EventType::kAccessExt, thr->fast_state.sid(),
- thr->fast_state.epoch(), 0x3000, 8, kAccessRead, &tid,
- &stk, &mset, &tag);
+ bool res =
+ RestoreStack(thr->tid, v3::EventType::kAccessExt, thr->sid, thr->epoch,
+ 0x3000, 8, kAccessRead, &stk, &mset, &tag);
CHECK(res);
- CHECK_EQ(tid, thr->tid);
CHECK_EQ(stk.size, 3);
CHECK_EQ(stk.trace[0], 0x1000);
CHECK_EQ(stk.trace[1], 0x1002);
@@ -148,17 +147,14 @@ TRACE_TEST(Trace, MemoryAccessSize) {
kAccessRead);
break;
}
- SlotPairLocker locker(thr, thr->fast_state.sid());
- ThreadRegistryLock lock1(&ctx->thread_registry);
- Lock lock2(&ctx->slot_mtx);
- Tid tid = kInvalidTid;
+ Lock lock1(&ctx->slot_mtx);
+ ThreadRegistryLock lock2(&ctx->thread_registry);
VarSizeStackTrace stk;
MutexSet mset;
uptr tag = kExternalTagNone;
- bool res =
- RestoreStack(EventType::kAccessExt, thr->fast_state.sid(),
- thr->fast_state.epoch(), 0x3000 + params.offset,
- params.size, kAccessRead, &tid, &stk, &mset, &tag);
+ bool res = RestoreStack(thr->tid, v3::EventType::kAccessExt, thr->sid,
+ thr->epoch, 0x3000 + params.offset, params.size,
+ kAccessRead, &stk, &mset, &tag);
CHECK_EQ(res, params.res);
if (params.res) {
CHECK_EQ(stk.size, 2);
@@ -173,19 +169,16 @@ TRACE_TEST(Trace, RestoreMutexLock) {
// Check of restoration of a mutex lock event.
ThreadArray<1> thr;
TraceFunc(thr, 0x1000);
- TraceMutexLock(thr, EventType::kLock, 0x4000, 0x5000, 0x6000);
- TraceMutexLock(thr, EventType::kRLock, 0x4001, 0x5001, 0x6001);
- TraceMutexLock(thr, EventType::kRLock, 0x4002, 0x5001, 0x6002);
- SlotPairLocker locker(thr, thr->fast_state.sid());
- ThreadRegistryLock lock1(&ctx->thread_registry);
- Lock lock2(&ctx->slot_mtx);
- Tid tid = kInvalidTid;
+ TraceMutexLock(thr, v3::EventType::kLock, 0x4000, 0x5000, 0x6000);
+ TraceMutexLock(thr, v3::EventType::kRLock, 0x4001, 0x5001, 0x6001);
+ TraceMutexLock(thr, v3::EventType::kRLock, 0x4002, 0x5001, 0x6002);
+ Lock lock1(&ctx->slot_mtx);
+ ThreadRegistryLock lock2(&ctx->thread_registry);
VarSizeStackTrace stk;
MutexSet mset;
uptr tag = kExternalTagNone;
- bool res = RestoreStack(EventType::kLock, thr->fast_state.sid(),
- thr->fast_state.epoch(), 0x5001, 0, 0, &tid, &stk,
- &mset, &tag);
+ bool res = RestoreStack(thr->tid, v3::EventType::kLock, thr->sid, thr->epoch,
+ 0x5001, 0, 0, &stk, &mset, &tag);
CHECK(res);
CHECK_EQ(stk.size, 2);
CHECK_EQ(stk.trace[0], 0x1000);
@@ -202,35 +195,28 @@ TRACE_TEST(Trace, RestoreMutexLock) {
TRACE_TEST(Trace, MultiPart) {
// Check replay of a trace with multiple parts.
ThreadArray<1> thr;
- FuncEntry(thr, 0x1000);
- FuncEntry(thr, 0x2000);
- MutexPreLock(thr, 0x4000, 0x5000, 0);
- MutexPostLock(thr, 0x4000, 0x5000, 0);
- MutexPreLock(thr, 0x4000, 0x5000, 0);
- MutexPostLock(thr, 0x4000, 0x5000, 0);
- const uptr kEvents = 3 * sizeof(TracePart) / sizeof(Event);
+ TraceFunc(thr, 0x1000);
+ TraceFunc(thr, 0x2000);
+ TraceMutexLock(thr, v3::EventType::kLock, 0x4000, 0x5000, 0x6000);
+ const uptr kEvents = 3 * sizeof(TracePart) / sizeof(v3::Event);
for (uptr i = 0; i < kEvents; i++) {
- FuncEntry(thr, 0x3000);
- MutexPreLock(thr, 0x4002, 0x5002, 0);
- MutexPostLock(thr, 0x4002, 0x5002, 0);
- MutexUnlock(thr, 0x4003, 0x5002, 0);
- FuncExit(thr);
+ TraceFunc(thr, 0x3000);
+ TraceMutexLock(thr, v3::EventType::kLock, 0x4002, 0x5002, 0x6002);
+ TraceMutexUnlock(thr, 0x5002);
+ TraceFunc(thr);
}
- FuncEntry(thr, 0x4000);
- TraceMutexLock(thr, EventType::kRLock, 0x4001, 0x5001, 0x6001);
+ TraceFunc(thr, 0x4000);
+ TraceMutexLock(thr, v3::EventType::kRLock, 0x4001, 0x5001, 0x6001);
CHECK(TryTraceMemoryAccess(thr, 0x2002, 0x3000, 8, kAccessRead));
- SlotPairLocker locker(thr, thr->fast_state.sid());
- ThreadRegistryLock lock1(&ctx->thread_registry);
- Lock lock2(&ctx->slot_mtx);
- Tid tid = kInvalidTid;
+ Lock lock1(&ctx->slot_mtx);
+ ThreadRegistryLock lock2(&ctx->thread_registry);
VarSizeStackTrace stk;
MutexSet mset;
uptr tag = kExternalTagNone;
- bool res = RestoreStack(EventType::kAccessExt, thr->fast_state.sid(),
- thr->fast_state.epoch(), 0x3000, 8, kAccessRead, &tid,
- &stk, &mset, &tag);
+ bool res =
+ RestoreStack(thr->tid, v3::EventType::kAccessExt, thr->sid, thr->epoch,
+ 0x3000, 8, kAccessRead, &stk, &mset, &tag);
CHECK(res);
- CHECK_EQ(tid, thr->tid);
CHECK_EQ(stk.size, 4);
CHECK_EQ(stk.trace[0], 0x1000);
CHECK_EQ(stk.trace[1], 0x2000);
@@ -238,94 +224,11 @@ TRACE_TEST(Trace, MultiPart) {
CHECK_EQ(stk.trace[3], 0x2002);
CHECK_EQ(mset.Size(), 2);
CHECK_EQ(mset.Get(0).addr, 0x5000);
+ CHECK_EQ(mset.Get(0).stack_id, 0x6000);
CHECK_EQ(mset.Get(0).write, true);
- CHECK_EQ(mset.Get(0).count, 2);
CHECK_EQ(mset.Get(1).addr, 0x5001);
+ CHECK_EQ(mset.Get(1).stack_id, 0x6001);
CHECK_EQ(mset.Get(1).write, false);
- CHECK_EQ(mset.Get(1).count, 1);
-}
-
-void CheckTraceState(uptr count, uptr finished, uptr excess, uptr recycle) {
- Lock l(&ctx->slot_mtx);
- Printf("CheckTraceState(%zu/%zu, %zu/%zu, %zu/%zu, %zu/%zu)\n",
- ctx->trace_part_total_allocated, count,
- ctx->trace_part_recycle_finished, finished,
- ctx->trace_part_finished_excess, excess,
- ctx->trace_part_recycle.Size(), recycle);
- CHECK_EQ(ctx->trace_part_total_allocated, count);
- CHECK_EQ(ctx->trace_part_recycle_finished, finished);
- CHECK_EQ(ctx->trace_part_finished_excess, excess);
- CHECK_EQ(ctx->trace_part_recycle.Size(), recycle);
-}
-
-TRACE_TEST(TraceAlloc, SingleThread) {
- TraceResetForTesting();
- auto check_thread = [&](ThreadState *thr, uptr size, uptr count,
- uptr finished, uptr excess, uptr recycle) {
- CHECK_EQ(thr->tctx->trace.parts.Size(), size);
- CheckTraceState(count, finished, excess, recycle);
- };
- ThreadArray<2> threads;
- check_thread(threads[0], 0, 0, 0, 0, 0);
- TraceSwitchPartImpl(threads[0]);
- check_thread(threads[0], 1, 1, 0, 0, 0);
- TraceSwitchPartImpl(threads[0]);
- check_thread(threads[0], 2, 2, 0, 0, 0);
- TraceSwitchPartImpl(threads[0]);
- check_thread(threads[0], 3, 3, 0, 0, 1);
- TraceSwitchPartImpl(threads[0]);
- check_thread(threads[0], 3, 3, 0, 0, 1);
- threads.Finish(0);
- CheckTraceState(3, 3, 0, 3);
- threads.Finish(1);
- CheckTraceState(3, 3, 0, 3);
-}
-
-TRACE_TEST(TraceAlloc, FinishedThreadReuse) {
- TraceResetForTesting();
- constexpr uptr Hi = Trace::kFinishedThreadHi;
- constexpr uptr kThreads = 4 * Hi;
- ThreadArray<kThreads> threads;
- for (uptr i = 0; i < kThreads; i++) {
- Printf("thread %zu\n", i);
- TraceSwitchPartImpl(threads[i]);
- if (i <= Hi)
- CheckTraceState(i + 1, i, 0, i);
- else if (i <= 2 * Hi)
- CheckTraceState(Hi + 1, Hi, i - Hi, Hi);
- else
- CheckTraceState(Hi + 1, Hi, Hi, Hi);
- threads.Finish(i);
- if (i < Hi)
- CheckTraceState(i + 1, i + 1, 0, i + 1);
- else if (i < 2 * Hi)
- CheckTraceState(Hi + 1, Hi + 1, i - Hi + 1, Hi + 1);
- else
- CheckTraceState(Hi + 1, Hi + 1, Hi + 1, Hi + 1);
- }
-}
-
-TRACE_TEST(TraceAlloc, FinishedThreadReuse2) {
- TraceResetForTesting();
- // constexpr uptr Lo = Trace::kFinishedThreadLo;
- // constexpr uptr Hi = Trace::kFinishedThreadHi;
- constexpr uptr Min = Trace::kMinParts;
- constexpr uptr kThreads = 10;
- constexpr uptr kParts = 2 * Min;
- ThreadArray<kThreads> threads;
- for (uptr i = 0; i < kThreads; i++) {
- Printf("thread %zu\n", i);
- for (uptr j = 0; j < kParts; j++) TraceSwitchPartImpl(threads[i]);
- if (i == 0)
- CheckTraceState(Min, 0, 0, 1);
- else
- CheckTraceState(2 * Min, 0, Min, Min + 1);
- threads.Finish(i);
- if (i == 0)
- CheckTraceState(Min, Min, 0, Min);
- else
- CheckTraceState(2 * Min, 2 * Min, Min, 2 * Min);
- }
}
} // namespace __tsan
diff --git a/compiler-rt/test/tsan/bench_threads.cpp b/compiler-rt/test/tsan/bench_threads.cpp
index 1d0be21eb5c6..d0ba42e29d8a 100644
--- a/compiler-rt/test/tsan/bench_threads.cpp
+++ b/compiler-rt/test/tsan/bench_threads.cpp
@@ -4,6 +4,11 @@
// bench.h needs pthread barriers which are not available on OS X
// UNSUPPORTED: darwin
+// aarch64 fails with:
+// CHECK failed: tsan_rtl.cpp:327 "((addr + size)) <= ((TraceMemEnd()))"
+// TODO: try to re-enable when D112603 is landed.
+// XFAIL: aarch64
+
#include "bench.h"
void *nop_thread(void *arg) {
diff --git a/compiler-rt/test/tsan/free_race2.c b/compiler-rt/test/tsan/free_race2.c
index ddba22c63701..a2137a7cdc70 100644
--- a/compiler-rt/test/tsan/free_race2.c
+++ b/compiler-rt/test/tsan/free_race2.c
@@ -28,7 +28,7 @@ int main() {
}
// CHECK: WARNING: ThreadSanitizer: heap-use-after-free
-// CHECK: Write of size {{.*}} at {{.*}} by main thread:
+// CHECK: Write of size 8 at {{.*}} by main thread:
// CHECK: #0 bar
// CHECK: #1 main
// CHECK: Previous write of size 8 at {{.*}} by main thread:
diff --git a/compiler-rt/test/tsan/memcmp_race.cpp b/compiler-rt/test/tsan/memcmp_race.cpp
index 911c33524dd6..40b11a77b364 100644
--- a/compiler-rt/test/tsan/memcmp_race.cpp
+++ b/compiler-rt/test/tsan/memcmp_race.cpp
@@ -34,7 +34,7 @@ int main() {
// CHECK: addr=[[ADDR:0x[0-9,a-f]+]]
// CHECK: WARNING: ThreadSanitizer: data race
-// CHECK: Write of size 3 at [[ADDR]] by thread T2:
+// CHECK: Write of size 1 at [[ADDR]] by thread T2:
// CHECK: #0 {{(memcpy|memmove)}}
// CHECK: #{{[12]}} Thread2
// CHECK: Previous read of size 1 at [[ADDR]] by thread T1:
diff --git a/compiler-rt/test/tsan/memcpy_race.cpp b/compiler-rt/test/tsan/memcpy_race.cpp
index cfdec7cd642f..09b2a319e205 100644
--- a/compiler-rt/test/tsan/memcpy_race.cpp
+++ b/compiler-rt/test/tsan/memcpy_race.cpp
@@ -22,8 +22,7 @@ void *Thread2(void *x) {
int main() {
barrier_init(&barrier, 2);
- print_address("addr1=", 1, &data[3]);
- print_address("addr2=", 1, &data[5]);
+ print_address("addr=", 1, &data[5]);
pthread_t t[2];
pthread_create(&t[0], NULL, Thread1, NULL);
pthread_create(&t[1], NULL, Thread2, NULL);
@@ -32,12 +31,11 @@ int main() {
return 0;
}
-// CHECK: addr1=[[ADDR1:0x[0-9,a-f]+]]
-// CHECK: addr2=[[ADDR2:0x[0-9,a-f]+]]
+// CHECK: addr=[[ADDR:0x[0-9,a-f]+]]
// CHECK: WARNING: ThreadSanitizer: data race
-// CHECK: Write of size 4 at [[ADDR1]] by thread T2:
+// CHECK: Write of size 1 at [[ADDR]] by thread T2:
// CHECK: #0 {{(memcpy|memmove)}}
// CHECK: #{{[12]}} Thread2
-// CHECK: Previous write of size 1 at [[ADDR2]] by thread T1:
+// CHECK: Previous write of size 1 at [[ADDR]] by thread T1:
// CHECK: #0 {{(memcpy|memmove)}}
// CHECK: #{{[12]}} Thread1
diff --git a/compiler-rt/test/tsan/mutexset7.cpp b/compiler-rt/test/tsan/mutexset7.cpp
index 5b4c7b9bb38c..d3729659717e 100644
--- a/compiler-rt/test/tsan/mutexset7.cpp
+++ b/compiler-rt/test/tsan/mutexset7.cpp
@@ -36,6 +36,6 @@ int main() {
// CHECK: Write of size 4 at {{.*}} by thread T1:
// CHECK: Previous write of size 4 at {{.*}} by thread T2
// CHECK: (mutexes: write [[M1:M[0-9]+]]):
-// CHECK: Mutex [[M1]] (0x{{.*}}) created at:
-// CHECK: #0 pthread_mutex_init
-// CHECK: #1 Thread2
+// CHECK: Mutex [[M1]] is already destroyed
+// CHECK-NOT: Mutex {{.*}} created at
+
--
2.35.0.rc0.227.g00780c9af4-goog