| // Copyright 2022 The ChromiumOS Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| //! Provides the ability to monitor and control a memcg's memory usage. |
| |
| use std::cell::RefCell; |
| use std::ffi::CString; |
| use std::fmt::Debug; |
| use std::fmt::Formatter; |
| use std::fs::File; |
| use std::fs::OpenOptions; |
| use std::io::Read; |
| use std::io::Seek; |
| use std::io::SeekFrom; |
| use std::io::Write; |
| use std::mem; |
| use std::os::unix::io::AsRawFd; |
| use std::os::unix::io::FromRawFd; |
| use std::os::unix::io::RawFd; |
| use std::path::Path; |
| use std::path::PathBuf; |
| use std::ptr::null_mut; |
| use std::rc::Rc; |
| use std::result::Result as StdResult; |
| use std::string::String; |
| |
| use anyhow::anyhow; |
| use anyhow::bail; |
| use anyhow::Context; |
| use anyhow::Result; |
| use crosvm_base::unix::round_up_to_page_size; |
| use libc::c_void; |
| use libc::inotify_add_watch; |
| use libc::inotify_event; |
| use libc::inotify_init1; |
| use libc::itimerspec; |
| use libc::time_t; |
| use libc::timerfd_create; |
| use libc::timerfd_settime; |
| use libc::CLOCK_MONOTONIC; |
| use libc::IN_MODIFY; |
| use libc::O_NONBLOCK; |
| use libc::TFD_CLOEXEC; |
| use libsirenia::linux::events::EventMultiplexer; |
| use libsirenia::linux::events::EventSource; |
| use libsirenia::linux::events::Mutator; |
| use libsirenia::linux::events::RemoveFdMutator; |
| use libsirenia::sys; |
| use log::error; |
| use log::info; |
| use log::warn; |
| |
| /// Trait which should respond to memory limit changes. |
| pub trait MemcgController { |
| /// Called when the memcg's memory allocation should be changed. |delta| is the |
| /// requested change in bytes, and this function should return the actual amount |
| /// to change. |
| /// |
| /// When |delta| is negative, this is called after the memcg's limits have been |
| /// adjusted. When |delta| is positive, this is called before the memcg's limits |
| /// get adjusted. |
| fn on_allocation_change(&self, delta: i64) -> i64; |
| } |
| |
| struct MemcgGroup { |
| // Path to the memcg directory |
| path: PathBuf, |
| |
| // Various files from the memcg directory |
| high_limit: File, |
| max_limit: File, |
| cur_bytes: File, |
| events: File, |
| |
| // The current max limit of the memcg |
| max_limit_bytes: u64, |
| |
| // High watermark event count used to detect and filter out other memcg events. |
| high_watermark_event_count: u64, |
| |
| // Counts the number of times we've checked to see if the limit can |
| // decrease without seeing the high limit being breached. Used for |
| // backoff in the timer and to progressively tighten the limits. |
| no_increase_count: u32, |
| |
| // Timerfd used for polling to decrease memory limits |
| timer: File, |
| } |
| |
| // The relationship between the high and max limits is: |
| // |
| // max_limit = max(high_limit + 8MiB, high_limit * 1.08) |
| // |
| // This ensures that it takes >10 seconds between breaching the high |
| // limit and breaching the max limit, which should give enough time |
| // to inflate the balloon and reclaim memory before a memcg OOM. |
| fn compute_max_limit_from_high_limit(val: u64) -> u64 { |
| if val > 100 * 1024 * 1024 { |
| (val as f64 * 1.08) as u64 |
| } else { |
| val + 8 * 1024 * 1024 |
| } |
| } |
| |
| fn compute_high_limit_from_max_limit(val: u64) -> u64 { |
| if val > 108 * 1024 * 1024 { |
| (val as f64 / 1.08) as u64 |
| } else { |
| val - 8 * 1024 * 1024 |
| } |
| } |
| |
| // Computes the next limit from the given limit. The relation is: |
| // |
| // next_limit = max(cur_limit * 1.1, cur_limit + 10MiB) |
| // |
| // The gap between the current and next limit provides slack before the |
| // limit needs to be increased again, at the cost of potentially reserving |
| // more memory than the memcg will actually use. The current formula was |
| // picked somewhat arbitrarily to provide a reasonable trade-off here. |
| fn compute_next_max_limit(val: u64) -> u64 { |
| ((val as f64 * 1.1) as u64).max(val + (10 * 1024 * 1024)) |
| } |
| |
| impl MemcgGroup { |
| fn new(name: &str, initial_max_limit_bytes: u64) -> Result<Self> { |
| let cgroup = Path::new("/sys/kernel/cgroup").join(name); |
| let mut events = File::open(cgroup.join("memory.events"))?; |
| let high_limit = OpenOptions::new() |
| .read(true) |
| .write(true) |
| .open(cgroup.join("memory.high"))?; |
| let max_limit = OpenOptions::new() |
| .read(true) |
| .write(true) |
| .open(cgroup.join("memory.max"))?; |
| |
| // Safe because we check the return value. |
| let timer_fd = unsafe { timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC) }; |
| if timer_fd < 0 { |
| bail!("Failed to create timer fd: {}", sys::errno()); |
| } |
| // Safe since we own the fd |
| let timer = unsafe { File::from_raw_fd(timer_fd) }; |
| |
| let mut memcg = MemcgGroup { |
| path: cgroup.clone(), |
| |
| high_watermark_event_count: read_watermark_event_count(&mut events, "high")?, |
| max_limit_bytes: 0, |
| no_increase_count: 0, |
| timer, |
| |
| high_limit, |
| max_limit, |
| cur_bytes: File::open(cgroup.join("memory.current"))?, |
| events, |
| }; |
| |
| memcg.set_max_limit_bytes(initial_max_limit_bytes)?; |
| Ok(memcg) |
| } |
| |
| // Sets |memory.max| to the given value, and sets |memory.high| to the |
| // corresponding value to ensure the monitor has sufficient time to respond to |
| // allocations. |
| fn set_max_limit_bytes(&mut self, max_limit: u64) -> Result<()> { |
| let high_limit = compute_high_limit_from_max_limit(max_limit); |
| |
| write_memcg_limit_file(&mut self.max_limit, max_limit) |
| .context("failed to update max limit")?; |
| write_memcg_limit_file(&mut self.high_limit, high_limit) |
| .context("failed to update high limit")?; |
| |
| self.max_limit_bytes = max_limit; |
| Ok(()) |
| } |
| |
| fn round_delta(delta: u64) -> u64 { |
| round_up_to_page_size(delta as usize) as u64 |
| } |
| |
| fn get_cur_bytes(&mut self) -> Result<u64> { |
| read_memcg_single_value_file(&mut self.cur_bytes) |
| } |
| |
| // Gets the target |memory.max| to use when increasing memory limits. |
| fn target_max_limit_increase(&self) -> u64 { |
| Self::round_delta(compute_next_max_limit(self.max_limit_bytes) - self.max_limit_bytes) |
| } |
| |
| // Gets the target |memory.max| to use when decreasing memory limits. |
| fn target_max_limit_decrease(&mut self) -> Result<u64> { |
| // Calculate the target max limit by assuming the current bytes |
| // will become the new high limit. To prevent oscillation of the |
| // limit, add an extra margin based on how long it's been since |
| // the memcg has breached its high limit (i.e. since it's allocated |
| // a significant amount of memory). |
| let cur_bytes = self.get_cur_bytes()?; |
| let test_max_limit = compute_max_limit_from_high_limit(cur_bytes); |
| let target_max_limit = { |
| let extra_margin_fraction = if self.no_increase_count == 0 { |
| 1.0 |
| } else { |
| (1.0 / (self.no_increase_count as f64)).max(0.1) |
| }; |
| let extra_max_limit = compute_next_max_limit(test_max_limit); |
| let extra_margin = (extra_max_limit - test_max_limit) as f64 * extra_margin_fraction; |
| test_max_limit + (extra_margin as u64) |
| }; |
| Ok(Self::round_delta( |
| self.max_limit_bytes - target_max_limit.min(self.max_limit_bytes), |
| )) |
| } |
| |
| fn update_limits(&mut self, delta: i64) -> Result<()> { |
| let new_limit = if delta > 0 { |
| self.max_limit_bytes + delta as u64 |
| } else { |
| self.max_limit_bytes - delta.unsigned_abs() |
| }; |
| self.set_max_limit_bytes(new_limit) |
| } |
| |
| fn set_next_timer(&mut self) -> Result<()> { |
| let timeout_sec = 2_u64 |
| .checked_pow(self.no_increase_count) |
| .unwrap_or(60) |
| .min(60); |
| |
| // Safe since spec is a c struct where all-zeros is valid |
| let mut spec: itimerspec = unsafe { mem::zeroed() }; |
| spec.it_value.tv_sec = timeout_sec as time_t; |
| |
| // Safe because it doesn't modify memory and we check the return value. |
| let ret = unsafe { timerfd_settime(self.timer.as_raw_fd(), 0, &spec, null_mut()) }; |
| if ret < 0 { |
| Err(anyhow!("Failed to arm timer fd: {}", sys::errno())) |
| } else { |
| Ok(()) |
| } |
| } |
| } |
| |
| fn read_watermark_event_count(events: &mut File, name: &str) -> Result<u64> { |
| events |
| .seek(SeekFrom::Start(0)) |
| .context("failed to reset events position")?; |
| |
| let mut buf = Vec::new(); |
| events |
| .read_to_end(&mut buf) |
| .context("failed to read events")?; |
| for line in String::from_utf8_lossy(&buf).split('\n') { |
| if line.contains(name) { |
| return Ok(line.split_at(name.len()).1.trim().parse::<u64>().unwrap()); |
| } |
| } |
| |
| Err(anyhow!("failed to find high event count")) |
| } |
| |
| fn read_memcg_single_value_file(file: &mut File) -> Result<u64> { |
| file.seek(SeekFrom::Start(0)) |
| .context("failed to reset position")?; |
| |
| let mut buf = Vec::new(); |
| file.read_to_end(&mut buf).context("failed to read")?; |
| |
| Ok(String::from_utf8_lossy(&buf) |
| .trim() |
| .parse::<u64>() |
| .unwrap_or(u64::MAX)) |
| } |
| |
| fn write_memcg_limit_file(file: &mut File, limit: u64) -> Result<()> { |
| file.seek(SeekFrom::Start(0)) |
| .context("failed to reset position")?; |
| file.write(limit.to_string().as_bytes()) |
| .context(format!("failed to write to {}", file.as_raw_fd()))?; |
| Ok(()) |
| } |
| |
| // Monitors for modifications to a memcg's event file. |
| struct InotifyMonitor { |
| state: Rc<RefCell<MemcgGroup>>, |
| inotify: File, |
| controller: Rc<dyn MemcgController>, |
| } |
| |
| impl InotifyMonitor { |
| fn new( |
| state: Rc<RefCell<MemcgGroup>>, |
| controller: Rc<dyn MemcgController>, |
| ) -> Result<InotifyMonitor> { |
| // Safe because it doesn't modify memory and we check the return value. |
| let inotify_fd = unsafe { inotify_init1(O_NONBLOCK) }; |
| if inotify_fd < 0 { |
| bail!("Failed to create fd: {}", sys::errno()); |
| } |
| // Safe because we own the fd. |
| let inotify = unsafe { File::from_raw_fd(inotify_fd) }; |
| |
| let path = CString::new( |
| state |
| .borrow() |
| .path |
| .join("memory.events") |
| .to_string_lossy() |
| .as_bytes(), |
| ) |
| .context("failed to construct event path string")?; |
| // Safe because it doesn't modify memory and we check the return value. |
| let ret = unsafe { inotify_add_watch(inotify.as_raw_fd(), path.as_ptr(), IN_MODIFY) }; |
| if ret < 0 { |
| bail!("Failed to add inotify watch: {}", sys::errno()); |
| } |
| |
| Ok(InotifyMonitor { |
| state, |
| inotify, |
| controller, |
| }) |
| } |
| } |
| |
| impl AsRawFd for InotifyMonitor { |
| fn as_raw_fd(&self) -> RawFd { |
| self.inotify.as_raw_fd() |
| } |
| } |
| |
| impl Debug for InotifyMonitor { |
| fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { |
| f.debug_struct("InotifyMonitor") |
| .field("path", &self.state.borrow().path) |
| .finish() |
| } |
| } |
| |
| // Reads from the inotify file, returning whether or not there was a pending event. |
| fn read_inotify(inotify: &File) -> Result<bool> { |
| //Safe because event is a c-struct where all zeros is valid. |
| let mut event: inotify_event = unsafe { mem::zeroed() }; |
| // Safe because we check the return value and the kernel will only write to |event|. |
| let len = unsafe { |
| libc::read( |
| inotify.as_raw_fd(), |
| &mut event as *mut inotify_event as *mut c_void, |
| mem::size_of::<inotify_event>(), |
| ) |
| }; |
| if len > 0 || sys::errno() == libc::EAGAIN { |
| Ok(len > 0) |
| } else { |
| Err(anyhow!("Failed to read from inotify fd: {}", sys::errno())) |
| } |
| } |
| |
| impl EventSource for InotifyMonitor { |
| fn on_event(&mut self) -> StdResult<Option<Box<dyn Mutator>>, String> { |
| // Drain all pending inotify events |
| while read_inotify(&self.inotify).map_err(|e| format!("{:?}", e))? {} |
| |
| let mut state = self.state.borrow_mut(); |
| |
| let new_high_count = read_watermark_event_count(&mut state.events, "high") |
| .map_err(|e| format!("{:?}", e))?; |
| if new_high_count == state.high_watermark_event_count { |
| for evt in ["max", "oom", "oom_kill"] { |
| if let Ok(val) = read_watermark_event_count(&mut state.events, evt) { |
| if val != 0 { |
| info!("Memcg {} event for {:?}, count {}", evt, state.path, val); |
| } |
| } |
| } |
| return Ok(None); |
| } |
| state.high_watermark_event_count = new_high_count; |
| |
| let target_increase = state.target_max_limit_increase(); |
| if target_increase > 0 { |
| let actual_delta = self.controller.on_allocation_change(target_increase as i64); |
| state |
| .update_limits(actual_delta) |
| .map_err(|e| format!("{:?}", e))?; |
| } |
| state.no_increase_count = 0; |
| state.set_next_timer().map_err(|e| format!("{:?}", e))?; |
| |
| Ok(None) |
| } |
| } |
| |
| // Polls a memcg's memory consumption to release unused memory. |
| struct MemcgSlackMonitor { |
| state: Rc<RefCell<MemcgGroup>>, |
| controller: Rc<dyn MemcgController>, |
| } |
| |
| impl MemcgSlackMonitor { |
| fn new( |
| state: Rc<RefCell<MemcgGroup>>, |
| controller: Rc<dyn MemcgController>, |
| ) -> Result<MemcgSlackMonitor> { |
| let monitor = MemcgSlackMonitor { state, controller }; |
| monitor.state.borrow_mut().set_next_timer()?; |
| Ok(monitor) |
| } |
| } |
| |
| impl AsRawFd for MemcgSlackMonitor { |
| fn as_raw_fd(&self) -> RawFd { |
| self.state.borrow().timer.as_raw_fd() |
| } |
| } |
| |
| impl Debug for MemcgSlackMonitor { |
| fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { |
| f.debug_struct("MemcgSlackMonitor") |
| .field("path", &self.state.borrow().path) |
| .finish() |
| } |
| } |
| |
| impl EventSource for MemcgSlackMonitor { |
| fn on_event(&mut self) -> StdResult<Option<Box<dyn Mutator>>, String> { |
| let mut state = self.state.borrow_mut(); |
| let mut buf: [u8; 8] = [0; 8]; |
| if let Err(e) = state.timer.read(&mut buf) { |
| error!("Failed to reset timer {}", e); |
| drop(state); |
| return Ok(Some(Box::new(RemoveFdMutator(self.as_raw_fd())))); |
| } |
| |
| let target_decrease = state |
| .target_max_limit_decrease() |
| .map_err(|e| format!("{:?}", e))?; |
| if target_decrease > 0 { |
| // Update the limits before changing the allocation, to ensure the app |
| // doesn't OOM while we're releasing memory. |
| let delta = -(target_decrease as i64); |
| state.update_limits(delta).map_err(|e| format!("{:?}", e))?; |
| |
| let actual_delta = self.controller.on_allocation_change(delta); |
| |
| if actual_delta != delta { |
| warn!( |
| "unexpected allocation change: expected={} actual={}", |
| delta, actual_delta |
| ); |
| state |
| .update_limits(actual_delta - delta) |
| .map_err(|e| format!("{:?}", e))?; |
| } |
| } |
| state.no_increase_count += 1; |
| state.set_next_timer().map_err(|e| format!("{:?}", e))?; |
| |
| Ok(None) |
| } |
| } |
| |
| /// Start monitor the memcg |name|. Set it's initial memory limit to |initial_max_limit_bytes|. |
| pub fn monitor_memcg( |
| name: &str, |
| ctx: &mut EventMultiplexer, |
| controller: Rc<dyn MemcgController>, |
| initial_max_limit_bytes: u64, |
| ) -> Result<()> { |
| let state = Rc::new(RefCell::new(MemcgGroup::new( |
| name, |
| initial_max_limit_bytes, |
| )?)); |
| |
| ctx.add_event(Box::new(InotifyMonitor::new( |
| state.clone(), |
| controller.clone(), |
| )?)) |
| .context("failed to add inotify monitor")?; |
| |
| ctx.add_event(Box::new(MemcgSlackMonitor::new(state, controller.clone())?)) |
| .context("failed to add slack monitor")?; |
| |
| Ok(()) |
| } |