| //===-- lib/runtime/trampoline.cpp -------------------------------*- C++-*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // W^X-compliant trampoline pool implementation. |
| // |
| // This file implements a runtime trampoline pool that maintains separate |
| // memory regions for executable code (RX) and writable data (RW). |
| // |
| // On Linux the code region transitions RW → RX (never simultaneously W+X). |
| // On macOS Apple Silicon the code region uses MAP_JIT with per-thread W^X |
| // toggling via pthread_jit_write_protect_np, so the mapping permissions |
| // include both W and X but hardware enforces that only one is active at |
| // a time on any given thread. |
| // |
| // Architecture: |
| // - Code region (RX): Contains pre-assembled trampoline stubs that load |
| // callee address and static chain from a paired TDATA entry, then jump |
| // to the callee with the static chain in the appropriate register. |
| // - Data region (RW): Contains TrampolineData entries with {callee_address, |
| // static_chain_address} pairs, one per trampoline slot. |
| // - Free list: Tracks available trampoline slots for O(1) alloc/free. |
| // |
| // Thread safety: Uses Fortran::runtime::Lock (pthreads on POSIX, |
| // CRITICAL_SECTION on Windows) — not std::mutex — to avoid C++ runtime |
| // library dependence. A single global lock serializes pool operations. |
| // This is a deliberate V1 design choice to keep the initial W^X |
| // architectural change minimal. Per-thread lock-free pools are deferred |
| // to a future optimization patch. |
| // |
| // AddressSanitizer note: The trampoline code region is allocated via |
| // mmap (not malloc/new), so ASan does not track it. The data region |
| // and handles are allocated via malloc (through AllocateMemoryOrCrash), |
| // which ASan intercepts normally. No special annotations are needed. |
| // |
| // See flang/docs/InternalProcedureTrampolines.md for design details. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "flang/Runtime/trampoline.h" |
| #include "flang-rt/runtime/lock.h" |
| #include "flang-rt/runtime/memory.h" |
| #include "flang-rt/runtime/terminator.h" |
| #include "flang-rt/runtime/trampoline.h" |
| #include "flang/Runtime/freestanding-tools.h" |
| |
| #include <atomic> |
| #include <cassert> |
| #include <cstdint> |
| #include <cstdlib> |
| #include <cstring> |
| |
| // Platform-specific headers for memory mapping. |
| #if defined(_WIN32) |
| #include <windows.h> |
| #else |
| // On macOS/Darwin, the flang-rt CMake configuration sets |
| // -D_POSIX_C_SOURCE=200809, which hides BSD/Apple-specific mmap flags |
| // (MAP_ANON, MAP_JIT) from <sys/mman.h>. Define _DARWIN_C_SOURCE to |
| // re-expose them for MAP_JIT on Apple Silicon and MAP_ANON elsewhere. |
| #if defined(__APPLE__) && !defined(_DARWIN_C_SOURCE) |
| #define _DARWIN_C_SOURCE |
| #endif |
| #include <fcntl.h> |
| #include <sys/mman.h> |
| #include <unistd.h> |
| // Some platforms (e.g. AIX) define MAP_ANON instead of MAP_ANONYMOUS. |
| #if !defined(MAP_ANONYMOUS) && defined(MAP_ANON) |
| #define MAP_ANONYMOUS MAP_ANON |
| #endif |
| #endif |
| |
| // macOS Apple Silicon requires MAP_JIT and pthread_jit_write_protect_np |
| // to create executable memory under the hardened runtime. |
| #if defined(__APPLE__) && defined(__aarch64__) |
| #include <libkern/OSCacheControl.h> |
| #include <pthread.h> |
| #endif |
| |
| // Architecture support check. Stub generators exist only for x86-64 and |
| // AArch64. On other architectures the file compiles but the runtime API |
| // functions crash with a diagnostic if actually called, so that building |
| // flang-rt on e.g. RISC-V or PPC64 never fails. |
| #if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || \ |
| defined(_M_ARM64) |
| #define TRAMPOLINE_ARCH_SUPPORTED 1 |
| #else |
| #define TRAMPOLINE_ARCH_SUPPORTED 0 |
| #endif |
| |
| namespace Fortran::runtime::trampoline { |
| |
/// Opaque token handed back to callers for one live trampoline.
/// It records where the slot's executable stub lives and which pool slot
/// it occupies, which is all the pool needs to hand out the callable
/// address and to recycle the slot later.
struct TrampolineHandle {
  /// Address of this slot's stub inside the pool's code region.
  void *codePtr{nullptr};
  /// Slot number within the pool; used to return the slot to the free list.
  std::size_t slotIndex{0};
};
| |
| // Namespace-scope globals following Flang runtime conventions: |
| // - Lock is trivially constructible (pthread_mutex_t / CRITICAL_SECTION) |
| // - Pool pointer uses std::atomic for safe double-checked locking |
| class TrampolinePool; // Forward declaration for pointer below. |
| static Lock poolLock; |
| static std::atomic<TrampolinePool *> poolInstance{nullptr}; |
| |
| /// The global trampoline pool. |
| class TrampolinePool { |
| public: |
| TrampolinePool() = default; |
| |
| static TrampolinePool &instance() { |
| TrampolinePool *p{poolInstance.load(std::memory_order_acquire)}; |
| if (p) { |
| return *p; |
| } |
| CriticalSection critical{poolLock}; |
| p = poolInstance.load(std::memory_order_relaxed); |
| if (p) { |
| return *p; |
| } |
| // Allocate pool using SizedNew (malloc + placement new). |
| Terminator terminator{__FILE__, __LINE__}; |
| auto owning{SizedNew<TrampolinePool>{terminator}(sizeof(TrampolinePool))}; |
| p = owning.release(); |
| poolInstance.store(p, std::memory_order_release); |
| return *p; |
| } |
| |
| /// Allocate a trampoline slot and initialize it. |
| TrampolineHandle *allocate( |
| const void *calleeAddress, const void *staticChainAddress) { |
| CriticalSection critical{lock_}; |
| ensureInitialized(); |
| |
| if (freeHead_ == kInvalidIndex) { |
| // Pool exhausted — fixed size by design for V1. |
| // The pool capacity is controlled by FLANG_TRAMPOLINE_POOL_SIZE |
| // (default 1024). Dynamic slab growth can be added in a follow-up |
| // patch if real workloads demonstrate a need for it. |
| Terminator terminator{__FILE__, __LINE__}; |
| terminator.Crash("Trampoline pool exhausted (max %zu slots). " |
| "Set FLANG_TRAMPOLINE_POOL_SIZE to increase.", |
| poolSize_); |
| } |
| |
| std::size_t index{freeHead_}; |
| freeHead_ = freeList_[index]; |
| |
| // Initialize the data entry. |
| dataRegion_[index].calleeAddress = calleeAddress; |
| dataRegion_[index].staticChainAddress = staticChainAddress; |
| |
| // Create handle using SizedNew (malloc + placement new). |
| Terminator terminator{__FILE__, __LINE__}; |
| auto owning{New<TrampolineHandle>{terminator}()}; |
| TrampolineHandle *handle{owning.release()}; |
| handle->codePtr = |
| static_cast<char *>(codeRegion_) + index * kTrampolineStubSize; |
| handle->slotIndex = index; |
| |
| return handle; |
| } |
| |
| /// Get the callable address of a trampoline. |
| void *getCallableAddress(TrampolineHandle *handle) { return handle->codePtr; } |
| |
| /// Free a trampoline slot. |
| void free(TrampolineHandle *handle) { |
| CriticalSection critical{lock_}; |
| |
| std::size_t index{handle->slotIndex}; |
| |
| // Poison the data entry so that any dangling call through a freed |
| // trampoline traps immediately. Setting to NULL means the stub will |
| // jump to address 0, which is unmapped on all supported platforms |
| // and produces SIGSEGV/SIGBUS immediately. |
| dataRegion_[index].calleeAddress = nullptr; |
| dataRegion_[index].staticChainAddress = nullptr; |
| |
| // Return slot to free list. |
| freeList_[index] = freeHead_; |
| freeHead_ = index; |
| |
| FreeMemory(handle); |
| } |
| |
| private: |
| static constexpr std::size_t kInvalidIndex{~std::size_t{0}}; |
| |
| void ensureInitialized() { |
| if (initialized_) { |
| return; |
| } |
| initialized_ = true; |
| |
| // Check environment variable for pool size override. |
| // Fixed-size pool by design (V1): avoids complexity of dynamic growth |
| // and re-protection of code pages. The default (1024 slots) is |
| // sufficient for typical Fortran programs. Users can override via: |
| // export FLANG_TRAMPOLINE_POOL_SIZE=4096 |
| if (const char *envSize = std::getenv("FLANG_TRAMPOLINE_POOL_SIZE")) { |
| long val{std::strtol(envSize, nullptr, 10)}; |
| if (val > 0) { |
| poolSize_ = {static_cast<std::size_t>(val)}; |
| } |
| } |
| |
| // Allocate the data region (RW). |
| Terminator terminator{__FILE__, __LINE__}; |
| dataRegion_ = static_cast<TrampolineData *>( |
| AllocateMemoryOrCrash(terminator, poolSize_ * sizeof(TrampolineData))); |
| runtime::memset(dataRegion_, 0, poolSize_ * sizeof(TrampolineData)); |
| |
| // Allocate the code region (initially RW for writing stubs, then RX). |
| std::size_t codeSize{poolSize_ * kTrampolineStubSize}; |
| #if defined(_WIN32) |
| codeRegion_ = VirtualAlloc( |
| nullptr, codeSize, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); |
| #elif defined(__APPLE__) && defined(__aarch64__) |
| // macOS Apple Silicon: MAP_JIT is required for pages that will become |
| // executable. Use pthread_jit_write_protect_np to toggle W↔X. |
| codeRegion_ = mmap(nullptr, codeSize, PROT_READ | PROT_WRITE | PROT_EXEC, |
| MAP_PRIVATE | MAP_ANONYMOUS | MAP_JIT, -1, 0); |
| if (codeRegion_ == MAP_FAILED) { |
| codeRegion_ = nullptr; |
| } |
| if (codeRegion_) { |
| // Enable writing on this thread (MAP_JIT defaults to execute). |
| // Guard for deployment targets older than macOS 11.0 (Apple Silicon |
| // always runs >= 11.0, so this is effectively unconditional at runtime). |
| if (__builtin_available(macOS 11.0, *)) { |
| pthread_jit_write_protect_np(0); // 0 = writable |
| } |
| } |
| #elif defined(MAP_ANONYMOUS) |
| // Linux and other POSIX platforms with MAP_ANONYMOUS. |
| codeRegion_ = mmap(nullptr, codeSize, PROT_READ | PROT_WRITE, |
| MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); |
| if (codeRegion_ == MAP_FAILED) { |
| codeRegion_ = nullptr; |
| } |
| #else |
| // Platforms without MAP_ANONYMOUS or MAP_ANON (e.g. AIX): map /dev/zero |
| // as a portable anonymous-mapping equivalent (per POSIX). |
| { |
| int devZero{open("/dev/zero", O_RDONLY)}; |
| if (devZero >= 0) { |
| codeRegion_ = mmap( |
| nullptr, codeSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, devZero, 0); |
| if (codeRegion_ == MAP_FAILED) { |
| codeRegion_ = nullptr; |
| } |
| close(devZero); |
| } |
| } |
| #endif |
| if (!codeRegion_) { |
| terminator.Crash("Failed to allocate trampoline code region"); |
| } |
| |
| // Generate trampoline stubs. |
| generateStubs(); |
| |
| // Flush instruction cache. Required on architectures with non-coherent |
| // I-cache/D-cache (AArch64, PPC, etc.). On x86-64 this is a no-op |
| // but harmless. Without this, AArch64 may execute stale instructions. |
| #if defined(__APPLE__) && defined(__aarch64__) |
| // On macOS, use sys_icache_invalidate (from libkern/OSCacheControl.h). |
| sys_icache_invalidate(codeRegion_, codeSize); |
| #elif defined(_WIN32) |
| FlushInstructionCache(GetCurrentProcess(), codeRegion_, codeSize); |
| #else |
| __builtin___clear_cache(static_cast<char *>(codeRegion_), |
| static_cast<char *>(codeRegion_) + codeSize); |
| #endif |
| |
| // Make code region executable and non-writable (W^X). |
| #if defined(_WIN32) |
| DWORD oldProtect; |
| VirtualProtect(codeRegion_, codeSize, PAGE_EXECUTE_READ, &oldProtect); |
| #elif defined(__APPLE__) && defined(__aarch64__) |
| // Switch back to execute-only (MAP_JIT manages per-thread W^X). |
| if (__builtin_available(macOS 11.0, *)) { |
| pthread_jit_write_protect_np(1); // 1 = executable |
| } |
| #else |
| mprotect(codeRegion_, codeSize, PROT_READ | PROT_EXEC); |
| #endif |
| |
| // Initialize free list. |
| freeList_ = static_cast<std::size_t *>( |
| AllocateMemoryOrCrash(terminator, poolSize_ * sizeof(std::size_t))); |
| |
| for (std::size_t i{0}; i < poolSize_ - 1; ++i) { |
| freeList_[i] = i + 1; |
| } |
| freeList_[poolSize_ - 1] = kInvalidIndex; |
| freeHead_ = 0; |
| } |
| |
| /// Generate platform-specific trampoline stubs in the code region. |
| /// Each stub loads callee address and static chain from its paired |
| /// TDATA entry and jumps to the callee. |
| void generateStubs() { |
| #if defined(__x86_64__) || defined(_M_X64) |
| generateStubsX86_64(); |
| #elif defined(__aarch64__) || defined(_M_ARM64) |
| generateStubsAArch64(); |
| #else |
| // Unsupported architecture — should never be reached because the |
| // extern "C" API functions guard with TRAMPOLINE_ARCH_SUPPORTED. |
| // Fill with trap bytes as a safety net. |
| runtime::memset(codeRegion_, 0, poolSize_ * kTrampolineStubSize); |
| #endif |
| } |
| |
| #if defined(__x86_64__) || defined(_M_X64) |
| /// Generate x86-64 trampoline stubs. |
| /// |
| /// Each stub does: |
| /// movabsq $dataEntry, %r11 ; load TDATA entry address |
| /// movq 8(%r11), %r10 ; load static chain -> nest register |
| /// jmpq *(%r11) ; jump to callee address |
| /// |
| /// Total: 10 + 4 + 3 = 17 bytes, padded to kTrampolineStubSize. |
| void generateStubsX86_64() { |
| auto *code{static_cast<uint8_t *>(codeRegion_)}; |
| |
| for (std::size_t i{0}; i < poolSize_; ++i) { |
| uint8_t *stub{code + i * kTrampolineStubSize}; |
| |
| // Address of the corresponding TDATA entry. |
| auto dataAddr{reinterpret_cast<uint64_t>(&dataRegion_[i])}; |
| |
| std::size_t off{0}; |
| |
| // movabsq $dataAddr, %r11 (REX.W + B, opcode 0xBB for r11) |
| stub[off++] = 0x49; // REX.WB |
| stub[off++] = 0xBB; // MOV r11, imm64 |
| runtime::memcpy(&stub[off], &dataAddr, 8); |
| off += 8; |
| |
| // movq 8(%r11), %r10 (load staticChainAddress into r10) |
| stub[off++] = 0x4D; // REX.WRB |
| stub[off++] = 0x8B; // MOV r/m64 -> r64 |
| stub[off++] = 0x53; // ModRM: [r11 + disp8], r10 |
| stub[off++] = 0x08; // disp8 = 8 |
| |
| // jmpq *(%r11) (jump to calleeAddress) |
| stub[off++] = 0x41; // REX.B |
| stub[off++] = 0xFF; // JMP r/m64 |
| stub[off++] = 0x23; // ModRM: [r11], opcode extension 4 |
| |
| // Pad the rest with INT3 (0xCC) for safety. |
| while (off < kTrampolineStubSize) { |
| stub[off++] = 0xCC; |
| } |
| } |
| } |
| #endif |
| |
| #if defined(__aarch64__) || defined(_M_ARM64) |
| /// Generate AArch64 trampoline stubs. |
| /// |
| /// Each stub does: |
| /// ldr x17, .Ldata_addr ; load TDATA entry address |
| /// ldr x15, [x17, #8] ; load static chain -> x15 (nest reg) |
| /// ldr x17, [x17] ; load callee address |
| /// br x17 ; jump to callee |
| /// .Ldata_addr: |
| /// .quad <address of dataRegion_[i]> |
| /// |
| /// Total: 4*4 + 8 = 24 bytes, padded to kTrampolineStubSize. |
| void generateStubsAArch64() { |
| auto *code{static_cast<uint8_t *>(codeRegion_)}; |
| |
| for (std::size_t i{0}; i < poolSize_; ++i) { |
| auto *stub{reinterpret_cast<uint32_t *>(code + i * kTrampolineStubSize)}; |
| |
| // Address of the corresponding TDATA entry. |
| auto dataAddr{reinterpret_cast<uint64_t>(&dataRegion_[i])}; |
| |
| // ldr x17, .Ldata_addr (PC-relative load, offset = 4 instructions = 16 |
| // bytes) LDR (literal): opc=01, V=0, imm19=(16/4)=4, Rt=17 |
| stub[0] = 0x58000091; // ldr x17, #16 (imm19=4, shifted left 2 = 16) |
| // Encoding: 0101 1000 0000 0000 0000 0000 1001 0001 |
| |
| // ldr x15, [x17, #8] (load static chain into x15, the nest register) |
| // LDR (unsigned offset): size=11, V=0, opc=01, imm12=1(×8), Rn=17, Rt=15 |
| stub[1] = 0xF940062F; // ldr x15, [x17, #8] |
| |
| // ldr x17, [x17] (load callee address) |
| // LDR (unsigned offset): size=11, V=0, opc=01, imm12=0, Rn=17, Rt=17 |
| stub[2] = 0xF9400231; // ldr x17, [x17, #0] |
| |
| // br x17 |
| stub[3] = 0xD61F0220; // br x17 |
| |
| // .Ldata_addr: .quad dataRegion_[i] |
| runtime::memcpy(&stub[4], &dataAddr, 8); |
| |
| // Pad remaining with BRK #0 (trap) for safety. |
| std::size_t usedWords{4 + 2}; // 4 instructions + 1 quad (2 words) |
| for (std::size_t w{usedWords}; w < kTrampolineStubSize / sizeof(uint32_t); |
| ++w) { |
| stub[w] = 0xD4200000; // brk #0 |
| } |
| } |
| } |
| #endif |
| |
| Lock lock_; |
| bool initialized_{false}; |
| std::size_t poolSize_{kDefaultPoolSize}; |
| |
| void *codeRegion_{nullptr}; // RX after initialization |
| TrampolineData *dataRegion_{nullptr}; // RW always |
| std::size_t *freeList_{nullptr}; // Intrusive free list |
| std::size_t freeHead_{kInvalidIndex}; |
| }; |
| |
| } // namespace Fortran::runtime::trampoline |
| |
| namespace Fortran::runtime { |
| extern "C" { |
| |
| // Helper: crash with a clear message on unsupported architectures. |
| // This is only reached if -fsafe-trampoline was used on a target |
| // that lacks stub generators. The driver should emit a warning and |
| // ignore the flag on unsupported architectures, but the runtime |
| // provides a safety net. |
| static inline void crashIfUnsupported() { |
| #if !TRAMPOLINE_ARCH_SUPPORTED |
| Terminator terminator{__FILE__, __LINE__}; |
| terminator.Crash("Runtime trampolines are not supported on this " |
| "architecture. Recompile without -fsafe-trampoline " |
| "to use the legacy stack-trampoline path."); |
| #endif |
| } |
| |
| void *RTDEF(TrampolineInit)( |
| void *scratch, const void *calleeAddress, const void *staticChainAddress) { |
| crashIfUnsupported(); |
| auto &pool{trampoline::TrampolinePool::instance()}; |
| return pool.allocate(calleeAddress, staticChainAddress); |
| } |
| |
| void *RTDEF(TrampolineAdjust)(void *handle) { |
| crashIfUnsupported(); |
| auto &pool{trampoline::TrampolinePool::instance()}; |
| return pool.getCallableAddress( |
| static_cast<trampoline::TrampolineHandle *>(handle)); |
| } |
| |
| void RTDEF(TrampolineFree)(void *handle) { |
| crashIfUnsupported(); |
| auto &pool{trampoline::TrampolinePool::instance()}; |
| pool.free(static_cast<trampoline::TrampolineHandle *>(handle)); |
| } |
| |
| } // extern "C" |
| } // namespace Fortran::runtime |