// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012-2023, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * TODO:
 *     1. Better handle wakeups from external interrupts. Currently a fixed
 *        compensation is added to the clamping duration when an excessive
 *        number of wakeups is observed during idle time. The reason is that
 *        in the case of external interrupts that need no ack, clamping down
 *        the cpu in non-irq context does not reduce irqs. In the majority of
 *        cases clamping down the cpu does help reduce irqs as well, so we
 *        should be able to differentiate the two cases and give a
 *        quantitative solution for the irqs that we can control, perhaps
 *        based on get_cpu_iowait_time_us().
 *
 *     2. Synchronization with other hw blocks.
 */
/* Upper bound (exclusive) used when iterating over per-ratio calibration data. */
#define MAX_TARGET_RATIO (100U)

/*
 * For each undisturbed clamping period (no extra wake ups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level where runtime calibration results are
 * valid.
 */
#define CONFIDENCE_OK (3)

/*
 * Default idle injection duration; the driver adjusts sleep time to meet the
 * target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

/* This duration is in microseconds */
static unsigned int duration;

/* Latest package C-state residency vs. TSC ratio, scaled by 100. */
static unsigned int pkg_cstate_ratio_cur;

/* Sliding window length, in number of clamping cycles. */
static unsigned int window_size;
staticint duration_set(constchar *arg, conststruct kernel_param *kp)
{ int ret = 0; unsignedlong new_duration;
ret = kstrtoul(arg, 10, &new_duration); if (ret) gotoexit; if (new_duration > 25 || new_duration < 6) {
pr_err("Out of recommended range %lu, between 6-25ms\n",
new_duration);
ret = -EINVAL; gotoexit;
}
/* Can't set mask when cooling device is in use */ if (powerclamp_data.clamping) {
ret = -EAGAIN; goto skip_cpumask_set;
}
ret = alloc_cpumask_var(&new_mask, GFP_KERNEL); if (!ret) goto skip_cpumask_set;
ret = bitmap_parse(arg, strlen(arg), cpumask_bits(new_mask),
nr_cpumask_bits); if (ret) goto free_cpumask_set;
if (cpumask_empty(new_mask) || check_invalid(new_mask, max_idle)) {
ret = -EINVAL; goto free_cpumask_set;
}
/* * When module parameters are passed from kernel command line * during insmod, the module parameter callback is called * before powerclamp_init(), so we can't assume that some * cpumask can be allocated and copied before here. Also * in this case this cpumask is used as the default mask.
*/
ret = allocate_copy_idle_injection_mask(new_mask);
/*
 * Register "max_idle" as a module parameter with mode 0644; reads and
 * writes go through the callbacks in max_idle_ops.
 */
module_param_cb(max_idle, &max_idle_ops, &max_idle, 0644);
MODULE_PARM_DESC(max_idle, "maximum injected idle time to the total CPU time ratio in percent range:1-100");
/*
 * Calibration record holding a confidence counter and the two compensation
 * values (steady-state and dynamic) that go with it.
 */
struct powerclamp_calibration_data {
	/*
	 * Used for calibration, basically a counter that gets incremented
	 * each time a clamping period is completed without extra wakeups.
	 * Once that counter has reached a given level, compensation is
	 * deemed usable.
	 */
	unsigned long confidence;

	/* Steady state compensation used when no extra wakeups occurred. */
	unsigned long steady_comp;

	/*
	 * Compensation for excessive wakeups from idle, mostly from
	 * external interrupts.
	 */
	unsigned long dynamic_comp;
};
/*
 * Register "window_size" as a module parameter with mode 0644; reads and
 * writes go through the callbacks in window_size_ops.
 */
module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n" "\tpowerclamp controls idle ratio within this window. larger\n" "\twindow size results in slower response time but more smooth\n" "\tclamping results. default to 2.");
/*
 * Descriptor for one package C-state entry.
 * NOTE(review): member meanings below are inferred from the names only;
 * confirm against the table that instantiates this struct.
 */
struct pkg_cstate_info {
	bool	skip;		/* presumably: ignore this entry when set */
	int	msr_index;	/* presumably: MSR address of the residency counter */
	int	cstate_id;	/* presumably: package C-state number */
};
/* check result for the last window */
msr_now = pkg_state_counter();
tsc_now = rdtsc();
/* calculate pkg cstate vs tsc ratio */ if (!msr_last || !tsc_last)
current_ratio = 1; elseif (tsc_now-tsc_last) {
val64 = 100*(msr_now-msr_last);
do_div(val64, (tsc_now-tsc_last));
current_ratio = val64;
}
/* update record */
msr_last = msr_now;
tsc_last = tsc_now;
adjust_compensation(target_ratio, win);
/* if we are above target+guard, skip */ return powerclamp_data.target_ratio + guard <= current_ratio;
}
/* * This function calculates runtime from the current target ratio. * This function gets called under powerclamp_lock.
*/ staticunsignedint get_run_time(void)
{ unsignedint compensated_ratio; unsignedint runtime;
/* * make sure user selected ratio does not take effect until * the next round. adjust target_ratio if user has changed * target such that we can converge quickly.
*/
powerclamp_data.guard = 1 + powerclamp_data.target_ratio / 20;
powerclamp_data.window_size_now = window_size;
/* * systems may have different ability to enter package level * c-states, thus we need to compensate the injected idle ratio * to achieve the actual target reported by the HW.
*/
compensated_ratio = powerclamp_data.target_ratio +
get_compensation(powerclamp_data.target_ratio); if (compensated_ratio <= 0)
compensated_ratio = 1;
/* * 1 HZ polling while clamping is active, useful for userspace * to monitor actual idle ratio.
*/ staticvoid poll_pkg_cstate(struct work_struct *dummy); static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate); staticvoid poll_pkg_cstate(struct work_struct *dummy)
{ static u64 msr_last; static u64 tsc_last;
u64 msr_now;
u64 tsc_now;
u64 val64;
msr_now = pkg_state_counter();
tsc_now = rdtsc();
/* calculate pkg cstate vs tsc ratio */ if (!msr_last || !tsc_last)
pkg_cstate_ratio_cur = 1; else { if (tsc_now - tsc_last) {
val64 = 100 * (msr_now - msr_last);
do_div(val64, (tsc_now - tsc_last));
pkg_cstate_ratio_cur = val64;
}
}
/* update record */
msr_last = msr_now;
tsc_last = tsc_now;
mutex_lock(&powerclamp_lock); if (powerclamp_data.clamping)
schedule_delayed_work(&poll_pkg_cstate_work, HZ);
mutex_unlock(&powerclamp_lock);
}
/*
 * Handle returned by idle_inject_register()/idle_inject_register_full();
 * handed back to idle_inject_unregister() when clamping ends.
 */
static struct idle_inject_device *ii_dev;
/* * This function is called from idle injection core on timer expiry * for the run duration. This allows powerclamp to readjust or skip * injecting idle for this cycle.
*/ staticbool idle_inject_update(void)
{ bool update = false;
/* We can't sleep in this callback */ if (!mutex_trylock(&powerclamp_lock)) returntrue;
if (!(powerclamp_data.count % powerclamp_data.window_size_now)) {
/* * This function is called from start_power_clamp() to register * CPUS with powercap idle injection register and set default * idle duration and latency.
*/ staticint powerclamp_idle_injection_register(void)
{
poll_pkg_cstate_enable = false; if (cpumask_equal(cpu_present_mask, idle_injection_cpu_mask)) {
ii_dev = idle_inject_register_full(idle_injection_cpu_mask, idle_inject_update); if (topology_max_packages() == 1 && topology_max_dies_per_package() == 1)
poll_pkg_cstate_enable = true;
} else {
ii_dev = idle_inject_register(idle_injection_cpu_mask);
}
if (!ii_dev) {
pr_err("powerclamp: idle_inject_register failed\n"); return -EAGAIN;
}
/* * This function is called from end_power_clamp() to stop idle injection * and unregister CPUS from powercap idle injection core.
*/ staticvoid remove_idle_injection(void)
{ if (!powerclamp_data.clamping) return;
/*
 * This function is called when the user changes the cooling device
 * state from zero to some other value.
 *
 * Registers with the idle injection core, triggers idle injection and,
 * when poll_pkg_cstate_enable is set, schedules the package C-state
 * polling work immediately.
 *
 * Return: 0 on success, otherwise the error code returned by
 * powerclamp_idle_injection_register().
 */
static int start_power_clamp(void)
{
	int ret = powerclamp_idle_injection_register();

	/* Nothing was started if registration failed; pass the error up. */
	if (ret)
		return ret;

	trigger_idle_injection();
	if (poll_pkg_cstate_enable)
		schedule_delayed_work(&poll_pkg_cstate_work, 0);

	return 0;
}
/*
 * This function is called when the user changes the cooling device
 * state from a non-zero value to zero.
 *
 * Does nothing unless powerclamp_data.clamping is set; otherwise removes
 * idle injection and unregisters ii_dev from the idle injection core.
 */
static void end_power_clamp(void)
{
	if (!powerclamp_data.clamping)
		return;

	remove_idle_injection();
	idle_inject_unregister(ii_dev);
}
if (!x86_match_cpu(intel_powerclamp_ids)) {
pr_err("CPU does not support MWAIT\n"); return -ENODEV;
}
/* The goal for idle time alignment is to achieve package cstate. */ if (!has_pkg_state_counter()) {
pr_info("No package C-state available\n"); return -ENODEV;
}
return 0;
}
staticint powerclamp_debug_show(struct seq_file *m, void *unused)
{ int i = 0;
seq_printf(m, "pct confidence steady dynamic (compensation)\n"); for (i = 0; i < MAX_TARGET_RATIO; i++) {
seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
i,
cal_data[i].confidence,
cal_data[i].steady_comp,
cal_data[i].dynamic_comp);
}
if (cpumask_available(idle_injection_cpu_mask))
free_cpumask_var(idle_injection_cpu_mask);
}
/* Module exit hook, symbol namespace import, and module metadata. */
module_exit(powerclamp_exit);
MODULE_IMPORT_NS("IDLE_INJECT");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");
/*
 * Footer text from the web page this listing was taken from (not part of
 * the driver):
 *
 * Measurement V0.5 in percent
 * Processing time: 0.12 seconds (preprocessed on 2026-04-29)
 *
 * The information on this website has been carefully compiled to the best
 * of our knowledge. However, no guarantee is given as to the completeness,
 * correctness, or quality of the information provided.
 *
 * Note: The syntax highlighting and the measurement are still experimental.
 */