| // SPDX-License-Identifier: GPL-2.0 |
| /* |
| * Timer events oriented CPU idle governor |
| * |
| * Copyright (C) 2018 Intel Corporation |
| * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> |
| * |
| * The idea of this governor is based on the observation that on many systems |
| * timer events are two or more orders of magnitude more frequent than any |
| * other interrupts, so they are likely to be the most significant source of CPU |
| * wakeups from idle states. Moreover, information about what happened in the |
| * (relatively recent) past can be used to estimate whether or not the deepest |
| * idle state with target residency within the time to the closest timer is |
| * likely to be suitable for the upcoming idle time of the CPU and, if not, then |
| * which of the shallower idle states to choose. |
| * |
| * Of course, non-timer wakeup sources are more important in some use cases and |
| * they can be covered by taking a few most recent idle time intervals of the |
| * CPU into account. However, even in that case it is not necessary to consider |
| * idle duration values greater than the time till the closest timer, as the |
| * patterns that they may belong to produce average values close enough to |
| * the time till the closest timer (sleep length) anyway. |
| * |
| * Thus this governor estimates whether or not the upcoming idle time of the CPU |
| * is likely to be significantly shorter than the sleep length and selects an |
| * idle state for it in accordance with that, as follows: |
| * |
| * - Find an idle state on the basis of the sleep length and state statistics |
| * collected over time: |
| * |
| * o Find the deepest idle state whose target residency is less than or equal |
| * to the sleep length. |
| * |
| * o Select it if it matched both the sleep length and the observed idle |
| * duration in the past more often than it matched the sleep length alone |
| * (i.e. the observed idle duration was significantly shorter than the sleep |
| * length matched by it). |
| * |
| * o Otherwise, select the shallower state with the greatest matched "early" |
| * wakeups metric. |
| * |
| * - If the majority of the most recent idle duration values are below the |
| * target residency of the idle state selected so far, use those values to |
| * compute the new expected idle duration and find an idle state matching it |
| * (which has to be shallower than the one selected so far). |
| */ |
| |
| #include <linux/cpuidle.h> |
| #include <linux/jiffies.h> |
| #include <linux/kernel.h> |
| #include <linux/sched/clock.h> |
| #include <linux/tick.h> |
| |
| /* |
| * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value |
| * is used for decreasing metrics on a regular basis. |
| */ |
| #define PULSE 1024 |
| #define DECAY_SHIFT 3 |
| |
| /* |
| * Number of the most recent idle duration values to take into consideration for |
| * the detection of wakeup patterns. |
| */ |
| #define INTERVALS 8 |
| |
| /** |
| * struct teo_idle_state - Idle state data used by the TEO cpuidle governor. |
| * @early_hits: "Early" CPU wakeups "matching" this state. |
| * @hits: "On time" CPU wakeups "matching" this state. |
| * @misses: CPU wakeups "missing" this state. |
| * |
| * A CPU wakeup is "matched" by a given idle state if the idle duration measured |
| * after the wakeup is between the target residency of that state and the target |
| * residency of the next one (or if this is the deepest available idle state, it |
| * "matches" a CPU wakeup when the measured idle duration is at least equal to |
| * its target residency). |
| * |
| * Also, from the TEO governor perspective, a CPU wakeup from idle is "early" if |
| * it occurs significantly earlier than the closest expected timer event (that |
| * is, early enough to match an idle state shallower than the one matching the |
| * time till the closest timer event). Otherwise, the wakeup is "on time", or |
| * it is a "hit". |
| * |
| * A "miss" occurs when the given state doesn't match the wakeup, but it matches |
| * the time till the closest timer event used for idle state selection. |
| */ |
| struct teo_idle_state { |
| unsigned int early_hits; |
| unsigned int hits; |
| unsigned int misses; |
| }; |
| |
| /** |
| * struct teo_cpu - CPU data used by the TEO cpuidle governor. |
| * @time_span_ns: Time between idle state selection and post-wakeup update. |
| * @sleep_length_ns: Time till the closest timer event (at the selection time). |
| * @states: Idle states data corresponding to this CPU. |
| * @interval_idx: Index of the most recent saved idle interval. |
| * @intervals: Saved idle duration values. |
| */ |
| struct teo_cpu { |
| u64 time_span_ns; |
| u64 sleep_length_ns; |
| struct teo_idle_state states[CPUIDLE_STATE_MAX]; |
| int interval_idx; |
| unsigned int intervals[INTERVALS]; |
| }; |
| |
| static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); |
| |
| /** |
| * teo_update - Update CPU data after wakeup. |
| * @drv: cpuidle driver containing state data. |
| * @dev: Target CPU. |
| */ |
| static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) |
| { |
| struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); |
| unsigned int sleep_length_us = ktime_to_us(cpu_data->sleep_length_ns); |
| int i, idx_hit = -1, idx_timer = -1; |
| unsigned int measured_us; |
| |
| if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { |
| /* |
| * One of the safety nets has triggered or the wakeup was close |
| * enough to the closest timer event expected at the idle state |
| * selection time to be discarded. |
| */ |
| measured_us = UINT_MAX; |
| } else { |
| unsigned int lat; |
| |
| lat = drv->states[dev->last_state_idx].exit_latency; |
| |
| measured_us = ktime_to_us(cpu_data->time_span_ns); |
| /* |
| * The delay between the wakeup and the first instruction |
| * executed by the CPU is not likely to be worst-case every |
| * time, so take 1/2 of the exit latency as a very rough |
| * approximation of the average of it. |
| */ |
| if (measured_us >= lat) |
| measured_us -= lat / 2; |
| else |
| measured_us /= 2; |
| } |
| |
| /* |
| * Decay the "early hits" metric for all of the states and find the |
| * states matching the sleep length and the measured idle duration. |
| */ |
| for (i = 0; i < drv->state_count; i++) { |
| unsigned int early_hits = cpu_data->states[i].early_hits; |
| |
| cpu_data->states[i].early_hits -= early_hits >> DECAY_SHIFT; |
| |
| if (drv->states[i].target_residency <= sleep_length_us) { |
| idx_timer = i; |
| if (drv->states[i].target_residency <= measured_us) |
| idx_hit = i; |
| } |
| } |
| |
| /* |
| * Update the "hits" and "misses" data for the state matching the sleep |
| * length. If it matches the measured idle duration too, this is a hit, |
| * so increase the "hits" metric for it then. Otherwise, this is a |
| * miss, so increase the "misses" metric for it. In the latter case |
| * also increase the "early hits" metric for the state that actually |
| * matches the measured idle duration. |
| */ |
| if (idx_timer >= 0) { |
| unsigned int hits = cpu_data->states[idx_timer].hits; |
| unsigned int misses = cpu_data->states[idx_timer].misses; |
| |
| hits -= hits >> DECAY_SHIFT; |
| misses -= misses >> DECAY_SHIFT; |
| |
| if (idx_timer > idx_hit) { |
| misses += PULSE; |
| if (idx_hit >= 0) |
| cpu_data->states[idx_hit].early_hits += PULSE; |
| } else { |
| hits += PULSE; |
| } |
| |
| cpu_data->states[idx_timer].misses = misses; |
| cpu_data->states[idx_timer].hits = hits; |
| } |
| |
| /* |
| * Save idle duration values corresponding to non-timer wakeups for |
| * pattern detection. |
| */ |
| cpu_data->intervals[cpu_data->interval_idx++] = measured_us; |
| if (cpu_data->interval_idx >= INTERVALS) |
| cpu_data->interval_idx = 0; |
| } |
| |
| /** |
| * teo_find_shallower_state - Find shallower idle state matching given duration. |
| * @drv: cpuidle driver containing state data. |
| * @dev: Target CPU. |
| * @state_idx: Index of the capping idle state. |
| * @duration_us: Idle duration value to match. |
| */ |
| static int teo_find_shallower_state(struct cpuidle_driver *drv, |
| struct cpuidle_device *dev, int state_idx, |
| unsigned int duration_us) |
| { |
| int i; |
| |
| for (i = state_idx - 1; i >= 0; i--) { |
| if (drv->states[i].disabled || dev->states_usage[i].disable) |
| continue; |
| |
| state_idx = i; |
| if (drv->states[i].target_residency <= duration_us) |
| break; |
| } |
| return state_idx; |
| } |
| |
| /** |
| * teo_select - Selects the next idle state to enter. |
| * @drv: cpuidle driver containing state data. |
| * @dev: Target CPU. |
| * @stop_tick: Indication on whether or not to stop the scheduler tick. |
| */ |
| static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, |
| bool *stop_tick) |
| { |
| struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); |
| int latency_req = cpuidle_governor_latency_req(dev->cpu); |
| unsigned int duration_us, hits, misses, early_hits; |
| int max_early_idx, prev_max_early_idx, constraint_idx, idx, i; |
| ktime_t delta_tick; |
| |
| if (dev->last_state_idx >= 0) { |
| teo_update(drv, dev); |
| dev->last_state_idx = -1; |
| } |
| |
| cpu_data->time_span_ns = local_clock(); |
| |
| cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick); |
| duration_us = ktime_to_us(cpu_data->sleep_length_ns); |
| |
| hits = 0; |
| misses = 0; |
| early_hits = 0; |
| max_early_idx = -1; |
| prev_max_early_idx = -1; |
| constraint_idx = drv->state_count; |
| idx = -1; |
| |
| for (i = 0; i < drv->state_count; i++) { |
| struct cpuidle_state *s = &drv->states[i]; |
| struct cpuidle_state_usage *su = &dev->states_usage[i]; |
| |
| if (s->disabled || su->disable) { |
| /* |
| * Ignore disabled states with target residencies beyond |
| * the anticipated idle duration. |
| */ |
| if (s->target_residency > duration_us) |
| continue; |
| |
| /* |
| * This state is disabled, so the range of idle duration |
| * values corresponding to it is covered by the current |
| * candidate state, but still the "hits" and "misses" |
| * metrics of the disabled state need to be used to |
| * decide whether or not the state covering the range in |
| * question is good enough. |
| */ |
| hits = cpu_data->states[i].hits; |
| misses = cpu_data->states[i].misses; |
| |
| if (early_hits >= cpu_data->states[i].early_hits || |
| idx < 0) |
| continue; |
| |
| /* |
| * If the current candidate state has been the one with |
| * the maximum "early hits" metric so far, the "early |
| * hits" metric of the disabled state replaces the |
| * current "early hits" count to avoid selecting a |
| * deeper state with lower "early hits" metric. |
| */ |
| if (max_early_idx == idx) { |
| early_hits = cpu_data->states[i].early_hits; |
| continue; |
| } |
| |
| /* |
| * The current candidate state is closer to the disabled |
| * one than the current maximum "early hits" state, so |
| * replace the latter with it, but in case the maximum |
| * "early hits" state index has not been set so far, |
| * check if the current candidate state is not too |
| * shallow for that role. |
| */ |
| if (!(tick_nohz_tick_stopped() && |
| drv->states[idx].target_residency < TICK_USEC)) { |
| prev_max_early_idx = max_early_idx; |
| early_hits = cpu_data->states[i].early_hits; |
| max_early_idx = idx; |
| } |
| |
| continue; |
| } |
| |
| if (idx < 0) { |
| idx = i; /* first enabled state */ |
| hits = cpu_data->states[i].hits; |
| misses = cpu_data->states[i].misses; |
| } |
| |
| if (s->target_residency > duration_us) |
| break; |
| |
| if (s->exit_latency > latency_req && constraint_idx > i) |
| constraint_idx = i; |
| |
| idx = i; |
| hits = cpu_data->states[i].hits; |
| misses = cpu_data->states[i].misses; |
| |
| if (early_hits < cpu_data->states[i].early_hits && |
| !(tick_nohz_tick_stopped() && |
| drv->states[i].target_residency < TICK_USEC)) { |
| prev_max_early_idx = max_early_idx; |
| early_hits = cpu_data->states[i].early_hits; |
| max_early_idx = i; |
| } |
| } |
| |
| /* |
| * If the "hits" metric of the idle state matching the sleep length is |
| * greater than its "misses" metric, that is the one to use. Otherwise, |
| * it is more likely that one of the shallower states will match the |
| * idle duration observed after wakeup, so take the one with the maximum |
| * "early hits" metric, but if that cannot be determined, just use the |
| * state selected so far. |
| */ |
| if (hits <= misses) { |
| /* |
| * The current candidate state is not suitable, so take the one |
| * whose "early hits" metric is the maximum for the range of |
| * shallower states. |
| */ |
| if (idx == max_early_idx) |
| max_early_idx = prev_max_early_idx; |
| |
| if (max_early_idx >= 0) { |
| idx = max_early_idx; |
| duration_us = drv->states[idx].target_residency; |
| } |
| } |
| |
| /* |
| * If there is a latency constraint, it may be necessary to use a |
| * shallower idle state than the one selected so far. |
| */ |
| if (constraint_idx < idx) |
| idx = constraint_idx; |
| |
| if (idx < 0) { |
| idx = 0; /* No states enabled. Must use 0. */ |
| } else if (idx > 0) { |
| unsigned int count = 0; |
| u64 sum = 0; |
| |
| /* |
| * Count and sum the most recent idle duration values less than |
| * the current expected idle duration value. |
| */ |
| for (i = 0; i < INTERVALS; i++) { |
| unsigned int val = cpu_data->intervals[i]; |
| |
| if (val >= duration_us) |
| continue; |
| |
| count++; |
| sum += val; |
| } |
| |
| /* |
| * Give up unless the majority of the most recent idle duration |
| * values are in the interesting range. |
| */ |
| if (count > INTERVALS / 2) { |
| unsigned int avg_us = div64_u64(sum, count); |
| |
| /* |
| * Avoid spending too much time in an idle state that |
| * would be too shallow. |
| */ |
| if (!(tick_nohz_tick_stopped() && avg_us < TICK_USEC)) { |
| duration_us = avg_us; |
| if (drv->states[idx].target_residency > avg_us) |
| idx = teo_find_shallower_state(drv, dev, |
| idx, avg_us); |
| } |
| } |
| } |
| |
| /* |
| * Don't stop the tick if the selected state is a polling one or if the |
| * expected idle duration is shorter than the tick period length. |
| */ |
| if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || |
| duration_us < TICK_USEC) && !tick_nohz_tick_stopped()) { |
| unsigned int delta_tick_us = ktime_to_us(delta_tick); |
| |
| *stop_tick = false; |
| |
| /* |
| * The tick is not going to be stopped, so if the target |
| * residency of the state to be returned is not within the time |
| * till the closest timer including the tick, try to correct |
| * that. |
| */ |
| if (idx > 0 && drv->states[idx].target_residency > delta_tick_us) |
| idx = teo_find_shallower_state(drv, dev, idx, delta_tick_us); |
| } |
| |
| return idx; |
| } |
| |
| /** |
| * teo_reflect - Note that governor data for the CPU need to be updated. |
| * @dev: Target CPU. |
| * @state: Entered state. |
| */ |
| static void teo_reflect(struct cpuidle_device *dev, int state) |
| { |
| struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); |
| |
| dev->last_state_idx = state; |
| /* |
| * If the wakeup was not "natural", but triggered by one of the safety |
| * nets, assume that the CPU might have been idle for the entire sleep |
| * length time. |
| */ |
| if (dev->poll_time_limit || |
| (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { |
| dev->poll_time_limit = false; |
| cpu_data->time_span_ns = cpu_data->sleep_length_ns; |
| } else { |
| cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns; |
| } |
| } |
| |
| /** |
| * teo_enable_device - Initialize the governor's data for the target CPU. |
| * @drv: cpuidle driver (not used). |
| * @dev: Target CPU. |
| */ |
| static int teo_enable_device(struct cpuidle_driver *drv, |
| struct cpuidle_device *dev) |
| { |
| struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); |
| int i; |
| |
| memset(cpu_data, 0, sizeof(*cpu_data)); |
| |
| for (i = 0; i < INTERVALS; i++) |
| cpu_data->intervals[i] = UINT_MAX; |
| |
| return 0; |
| } |
| |
| static struct cpuidle_governor teo_governor = { |
| .name = "teo", |
| .rating = 19, |
| .enable = teo_enable_device, |
| .select = teo_select, |
| .reflect = teo_reflect, |
| }; |
| |
| static int __init teo_governor_init(void) |
| { |
| return cpuidle_register_governor(&teo_governor); |
| } |
| |
| postcore_initcall(teo_governor_init); |