// SPDX-License-Identifier: GPL-2.0-only /* * intel_idle.c - native hardware idle loop for modern Intel processors * * Copyright (c) 2013 - 2020, Intel Corporation. * Len Brown <len.brown@intel.com> * Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
/* * intel_idle is a cpuidle driver that loads on all Intel CPUs with MWAIT * in lieu of the legacy ACPI processor_idle driver. The intent is to * make Linux more efficient on these processors, as intel_idle knows * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs.
*/
/* * Design Assumptions * * All CPUs have same idle states as boot CPU * * Chipset BM_STS (bus master status) bit is a NOP * for preventing entry into deep C-states * * CPU will flush caches as needed when entering a C-state via MWAIT * (in contrast to entering ACPI C3, in which case the WBINVD * instruction needs to be executed to flush the caches)
*/
/* * Known limitations * * ACPI has a .suspend hack to turn off deep C-states during suspend * to avoid complications with the lapic timer workaround. * Have not seen issues with suspend, but may need same workaround here. *
*/
/* * Enable interrupts before entering the C-state. On some platforms and for * some C-states, this may measurably decrease interrupt latency.
*/ #define CPUIDLE_FLAG_IRQ_ENABLE BIT(14)
/* * Enable this state by default even if the ACPI _CST does not list it.
*/ #define CPUIDLE_FLAG_ALWAYS_ENABLE BIT(15)
/* * Disable IBRS across idle (when KERNEL_IBRS), is exclusive vs IRQ_ENABLE * above.
*/ #define CPUIDLE_FLAG_IBRS BIT(16)
/* * Initialize large xstate for the C6-state entrance.
*/ #define CPUIDLE_FLAG_INIT_XSTATE BIT(17)
/* * Ignore the sub-state when matching mwait hints between the ACPI _CST and * custom tables.
*/ #define CPUIDLE_FLAG_PARTIAL_HINT_MATCH BIT(18)
/* * MWAIT takes an 8-bit "hint" in EAX "suggesting" * the C-state (top nibble) and sub-state (bottom nibble) * 0x00 means "MWAIT(C1)", 0x10 means "MWAIT(C2)" etc. * * We store the hint at the top of our "flags" for each state.
*/ #define flg2MWAIT(flags) (((flags) >> 24) & 0xFF) #define MWAIT2flg(eax) ((eax & 0xFF) << 24)
/*
 * __intel_idle - Common MWAIT-based idle entry helper.
 * @dev:    cpuidle device of the target CPU (unused here, kept for the
 *          cpuidle enter-callback signature).
 * @drv:    cpuidle driver holding the states table.
 * @index:  index of the target state in drv->states[].
 * @irqoff: when true, set ECX bit 0 so MWAIT breaks on interrupt even
 *          though interrupts are disabled.
 *
 * Decode the MWAIT hint stored in the top byte of the state flags (see
 * flg2MWAIT()) and enter the corresponding C-state.
 *
 * Return: @index, as required by the cpuidle enter-callback contract.
 */
static __always_inline int __intel_idle(struct cpuidle_device *dev,
					struct cpuidle_driver *drv,
					int index, bool irqoff)
{
	struct cpuidle_state *state = &drv->states[index];
	unsigned int eax = flg2MWAIT(state->flags);
	unsigned int ecx = 1 * irqoff; /* break on interrupt flag */

	mwait_idle_with_hints(eax, ecx);

	return index;
}
/**
 * intel_idle - Ask the processor to enter the given idle state.
 * @dev: cpuidle device of the target CPU.
 * @drv: cpuidle driver (assumed to point to intel_idle_driver).
 * @index: Target idle state index.
 *
 * Use the MWAIT instruction to notify the processor that the CPU represented by
 * @dev is idle and it can try to enter the idle state corresponding to @index.
 *
 * If the local APIC timer is not known to be reliable in the target idle state,
 * enable one-shot tick broadcasting for the target CPU before executing MWAIT.
 *
 * Must be called under local_irq_disable().
 */
static __cpuidle int intel_idle(struct cpuidle_device *dev,
				struct cpuidle_driver *drv, int index)
{
	return __intel_idle(dev, drv, index, true);
}
/*
 * intel_idle_irq - Idle entry used with CPUIDLE_FLAG_IRQ_ENABLE.
 *
 * Same as intel_idle(), but passes irqoff=false so the MWAIT
 * "break on interrupt" bit in ECX is left clear.
 */
static __cpuidle int intel_idle_irq(struct cpuidle_device *dev,
				    struct cpuidle_driver *drv, int index)
{
	return __intel_idle(dev, drv, index, false);
}
/*
 * intel_idle_ibrs - Idle entry for states flagged CPUIDLE_FLAG_IBRS.
 *
 * When SMT is active, write 0 to SPEC_CTRL for the duration of the idle
 * period (clearing IBRS) and restore the previous SPEC_CTRL value on wake.
 * The state is entered with interrupts disabled (irqoff=true); see
 * state_update_enter_method() for the flag exclusivity checks.
 */
static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev,
				     struct cpuidle_driver *drv, int index)
{
	bool smt_active = sched_smt_active();
	u64 spec_ctrl = spec_ctrl_current();
	int ret;

	if (smt_active)
		__update_spec_ctrl(0);

	ret = __intel_idle(dev, drv, index, true);

	if (smt_active)
		__update_spec_ctrl(spec_ctrl);

	return ret;
}
/*
 * intel_idle_xstate - Idle entry for states flagged CPUIDLE_FLAG_INIT_XSTATE.
 *
 * Put the FPU registers into a known state before entering the C-state.
 */
static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev,
				       struct cpuidle_driver *drv, int index)
{
	fpu_idle_fpregs();
	return __intel_idle(dev, drv, index, true);
}
/** * intel_idle_s2idle - Ask the processor to enter the given idle state. * @dev: cpuidle device of the target CPU. * @drv: cpuidle driver (assumed to point to intel_idle_driver). * @index: Target idle state index. * * Use the MWAIT instruction to notify the processor that the CPU represented by * @dev is idle and it can try to enter the idle state corresponding to @index. * * Invoked as a suspend-to-idle callback routine with frozen user space, frozen * scheduler tick and suspended scheduler clock on the target CPU.
*/ static __cpuidle int intel_idle_s2idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index)
/*
 * NOTE(review): extraction damage — this body never calls
 * mwait_idle_with_hints() despite the kernel-doc above, and instead ends with
 * a "is the C-state deeper than C1" predicate that belongs to a different
 * helper upstream (the timer-stop check). The braces happen to balance, but
 * the body appears fused from two functions; restore from upstream
 * intel_idle.c rather than editing in place.
 */
{ struct cpuidle_state *state = &drv->states[index]; unsignedint eax = flg2MWAIT(state->flags); unsignedint ecx = 1; /* break on interrupt flag */
if (state->flags & CPUIDLE_FLAG_INIT_XSTATE)
fpu_idle_fpregs();
/* * Switch over to one-shot tick broadcast if the target C-state * is deeper than C1.
*/ return !!((eax >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK);
}
staticbool no_acpi __read_mostly;
module_param(no_acpi, bool, 0444);
MODULE_PARM_DESC(no_acpi, "Do not use ACPI _CST for building the idle states list");
staticbool force_use_acpi __read_mostly; /* No effect if no_acpi is set. */
module_param_named(use_acpi, force_use_acpi, bool, 0444);
MODULE_PARM_DESC(use_acpi, "Use ACPI _CST for building the idle states list");
staticbool no_native __read_mostly; /* No effect if no_acpi is set. */
module_param_named(no_native, no_native, bool, 0444);
MODULE_PARM_DESC(no_native, "Ignore cpu specific (native) idle states in lieu of ACPI idle states");
/** * intel_idle_cst_usable - Check if the _CST information can be used. * * Check if all of the C-states listed by _CST in the max_cstate range are * ACPI_CSTATE_FFH, which means that they should be entered via MWAIT.
*/ staticbool __init intel_idle_cst_usable(void)
/*
 * NOTE(review): extraction damage — the body below is fused from at least
 * two upstream functions (intel_idle_cst_usable() and the state-table
 * building loop of intel_idle_init_cstates_acpi()): 'limit' is read
 * uninitialized, 'drv' is not declared in this scope, and the braces do not
 * balance before the next definition begins at intel_idle_off_by_default().
 * Restore from upstream intel_idle.c rather than editing in place.
 */
{ int cstate, limit;
/* * If limit > 0, intel_idle_cst_usable() has returned 'true', so all of * the interesting states are ACPI_CSTATE_FFH.
*/ for (cstate = 1; cstate < limit; cstate++) { struct acpi_processor_cx *cx; struct cpuidle_state *state;
if (intel_idle_max_cstate_reached(cstate - 1)) break;
cx = &acpi_state_table.states[cstate];
/* Append a new driver state built from the _CST entry. */
state = &drv->states[drv->state_count++];
snprintf(state->name, CPUIDLE_NAME_LEN, "C%d_ACPI", cstate);
strscpy(state->desc, cx->desc, CPUIDLE_DESC_LEN);
state->exit_latency = cx->latency; /* * For C1-type C-states use the same number for both the exit * latency and target residency, because that is the case for * C1 in the majority of the static C-states tables above. * For the other types of C-states, however, set the target * residency to 3 times the exit latency which should lead to * a reasonable balance between energy-efficiency and * performance in the majority of interesting cases.
*/
state->target_residency = cx->latency; if (cx->type > ACPI_STATE_C1)
state->target_residency *= 3;
/* The _CST address field carries the MWAIT hint for FFH states. */
state->flags = MWAIT2flg(cx->address); if (cx->type > ACPI_STATE_C2)
state->flags |= CPUIDLE_FLAG_TLB_FLUSHED;
if (disabled_states_mask & BIT(cstate))
state->flags |= CPUIDLE_FLAG_OFF;
if (intel_idle_state_needs_timer_stop(state))
state->flags |= CPUIDLE_FLAG_TIMER_STOP;
if (cx->type > ACPI_STATE_C1 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
mark_tsc_unstable("TSC halts in idle");
/*
 * NOTE(review): extraction damage — this function is truncated: the body of
 * the for loop below (the hint comparison using 'acpi_hint'/'table_hint' and
 * the 'flags' parameter) and the closing braces are missing; the text jumps
 * straight into bxt_idle_state_table_update()'s kernel-doc. Restore from
 * upstream intel_idle.c.
 */
staticbool __init intel_idle_off_by_default(unsignedint flags, u32 mwait_hint)
{ int cstate, limit;
/* * If there are no _CST C-states, do not disable any C-states by * default.
*/ if (!acpi_state_table.count) returnfalse;
limit = min_t(int, CPUIDLE_STATE_MAX, acpi_state_table.count); /* * If limit > 0, intel_idle_cst_usable() has returned 'true', so all of * the interesting states are ACPI_CSTATE_FFH.
*/ for (cstate = 1; cstate < limit; cstate++) {
u32 acpi_hint = acpi_state_table.states[cstate].address;
u32 table_hint = mwait_hint;
/** * bxt_idle_state_table_update - Fix up the Broxton idle states table. * * On BXT, trust the IRTL (Interrupt Response Time Limit) MSR to show the * definitive maximum latency and use the same value for target_residency.
*/ staticvoid __init bxt_idle_state_table_update(void)
/*
 * NOTE(review): extraction damage — only the local declarations survive;
 * the IRTL MSR reads and bxt_cstates[] updates described by the kernel-doc
 * above, and the closing brace, are missing. Restore from upstream.
 */
{ unsignedlonglong msr; unsignedint usec;
/** * sklh_idle_state_table_update - Fix up the Sky Lake idle states table. * * On SKL-H (model 0x5e) skip C8 and C9 if C10 is enabled and SGX disabled.
*/ staticvoid __init sklh_idle_state_table_update(void)
/*
 * NOTE(review): extraction damage — the early-return guards below are intact,
 * but the tail of the function (the part that actually marks C8/C9 unusable
 * per the kernel-doc above) and the function's closing brace are missing;
 * the text jumps to skx_idle_state_table_update(). Restore from upstream.
 */
{ unsignedlonglong msr; unsignedint eax, ebx, ecx, edx;
/* if PC10 disabled via cmdline intel_idle.max_cstate=7 or shallower */ if (max_cstate <= 7) return;
/* if PC10 not present in CPUID.MWAIT.EDX */ if ((mwait_substates & (0xF << 28)) == 0) return;
rdmsrq(MSR_PKG_CST_CONFIG_CONTROL, msr);
/* PC10 is not enabled in PKG C-state limit */ if ((msr & 0xF) != 8) return;
ecx = 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
/* if SGX is present */ if (ebx & (1 << 2)) {
rdmsrq(MSR_IA32_FEAT_CTL, msr);
/* if SGX is enabled */ if (msr & (1 << 18)) return;
}
/**
 * skx_idle_state_table_update - Adjust the Sky Lake/Cascade Lake
 * idle states table.
 */
static void __init skx_idle_state_table_update(void)
{
	unsigned long long msr;

	rdmsrq(MSR_PKG_CST_CONFIG_CONTROL, msr);

	/*
	 * 000b: C0/C1 (no package C-state support)
	 * 001b: C2
	 * 010b: C6 (non-retention)
	 * 011b: C6 (retention)
	 * 111b: No Package C state limits.
	 */
	if ((msr & 0x7) < 2) {
		/*
		 * Uses the CC6 + PC0 latency and 3 times of
		 * latency for target_residency if the PC6
		 * is disabled in BIOS. This is consistent
		 * with how intel_idle driver uses _CST
		 * to set the target_residency.
		 */
		skx_cstates[2].exit_latency = 92;
		skx_cstates[2].target_residency = 276;
	}
}
/**
 * adl_idle_state_table_update - Adjust AlderLake idle states table.
 */
static void __init adl_idle_state_table_update(void)
{
	/* Check if user prefers C1 over C1E. */
	if (preferred_states_mask & BIT(1) && !(preferred_states_mask & BIT(2))) {
		cpuidle_state_table[0].flags &= ~CPUIDLE_FLAG_UNUSABLE;
		cpuidle_state_table[1].flags |= CPUIDLE_FLAG_UNUSABLE;

		/* Disable C1E by clearing the "C1E promotion" bit. */
		c1e_promotion = C1E_PROMOTION_DISABLE;
		return;
	}

	/* Make sure C1E is enabled by default */
	c1e_promotion = C1E_PROMOTION_ENABLE;
}
/*
 * NOTE(review): extraction damage — this is the tail of what upstream calls
 * spr_idle_state_table_update() (it updates spr_cstates[], consistent with
 * the INTEL_SAPPHIRERAPIDS_X switch case below); the function header and
 * opening brace are missing from this chunk. Restore from upstream.
 */
/* * By default, the C6 state assumes the worst-case scenario of package * C6. However, if PC6 is disabled, we update the numbers to match * core C6.
*/
rdmsrq(MSR_PKG_CST_CONFIG_CONTROL, msr);
/* Limit value 2 and above allow for PC6. */ if ((msr & 0x7) < 2) {
spr_cstates[2].exit_latency = 190;
spr_cstates[2].target_residency = 600;
}
}
/*
 * NOTE(review): extraction damage — this is the tail of a boolean C-state
 * verification helper (it reads 'num_substates' and 'mwait_cstate', neither
 * declared here); the function header is missing from this chunk. Also note
 * the fused tokens 'returnfalse'/'returntrue'. Restore from upstream.
 */
/* Ignore the C-state if there are NO sub-states in CPUID for it. */ if (num_substates == 0) returnfalse;
if (mwait_cstate > 2 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
mark_tsc_unstable("TSC halts in idle states deeper than C2");
returntrue;
}
/*
 * state_update_enter_method - Select the ->enter callback for a state.
 * @state:  cpuidle state whose enter method is being chosen.
 * @cstate: index of the state, used only for the forced-IRQ log message.
 *
 * Pick among intel_idle_xstate(), intel_idle_ibrs(), intel_idle_irq() and
 * (implicitly, the pre-set default) intel_idle() based on the state flags
 * and the kernel's IBRS configuration.
 */
static void state_update_enter_method(struct cpuidle_state *state, int cstate)
{
	if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) {
		/*
		 * Combining XSTATE with the IBRS or IRQ_ENABLE flags
		 * is not currently supported by this driver.
		 */
		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IBRS);
		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
		state->enter = intel_idle_xstate;
		return;
	}

	if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
	    ((state->flags & CPUIDLE_FLAG_IBRS) || ibrs_off)) {
		/*
		 * IBRS mitigation requires that C-states are entered
		 * with interrupts disabled.
		 */
		if (ibrs_off && (state->flags & CPUIDLE_FLAG_IRQ_ENABLE))
			state->flags &= ~CPUIDLE_FLAG_IRQ_ENABLE;
		WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
		state->enter = intel_idle_ibrs;
		return;
	}

	if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) {
		state->enter = intel_idle_irq;
		return;
	}

	if (force_irq_on) {
		pr_info("forced intel_idle_irq for state %d\n", cstate);
		state->enter = intel_idle_irq;
	}
}
/*
 * NOTE(review): extraction damage — the per-state loop header (upstream:
 * a for loop over 'cstate' iterating cpuidle_state_table[]) is missing
 * between the switch statement and the 'break'/'continue' statements below,
 * leaving the braces unbalanced. Restore from upstream intel_idle.c.
 */
staticvoid __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
{ int cstate;
/* Apply model-specific fixups to the native state table first. */
switch (boot_cpu_data.x86_vfm) { case INTEL_IVYBRIDGE_X:
ivt_idle_state_table_update(); break; case INTEL_ATOM_GOLDMONT: case INTEL_ATOM_GOLDMONT_PLUS:
bxt_idle_state_table_update(); break; case INTEL_SKYLAKE:
sklh_idle_state_table_update(); break; case INTEL_SKYLAKE_X:
skx_idle_state_table_update(); break; case INTEL_SAPPHIRERAPIDS_X: case INTEL_EMERALDRAPIDS_X:
spr_idle_state_table_update(); break; case INTEL_ALDERLAKE: case INTEL_ALDERLAKE_L: case INTEL_ATOM_GRACEMONT:
adl_idle_state_table_update(); break; case INTEL_ATOM_SILVERMONT: case INTEL_ATOM_AIRMONT:
byt_cht_auto_demotion_disable(); break;
}
/* A table entry with neither enter callback terminates the table. */
if (!cpuidle_state_table[cstate].enter &&
!cpuidle_state_table[cstate].enter_s2idle) break;
if (!cpuidle_state_table[cstate].enter_dead)
cpuidle_state_table[cstate].enter_dead = intel_idle_enter_dead;
/* If marked as unusable, skip this state. */ if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_UNUSABLE) {
pr_debug("state %s is disabled\n",
cpuidle_state_table[cstate].name); continue;
}
mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags); if (!intel_idle_verify_cstate(mwait_hint)) continue;
/* Structure copy. */
drv->states[drv->state_count] = cpuidle_state_table[cstate];
state = &drv->states[drv->state_count];
if (intel_idle_state_needs_timer_stop(state))
state->flags |= CPUIDLE_FLAG_TIMER_STOP;
drv->state_count++;
}
}
/**
 * intel_idle_cpuidle_driver_init - Create the list of available idle states.
 * @drv: cpuidle driver structure to initialize.
 */
static void __init intel_idle_cpuidle_driver_init(struct cpuidle_driver *drv)
{
	/* State 0 is always the polling state. */
	cpuidle_poll_state_init(drv);

	if (disabled_states_mask & BIT(0))
		drv->states[0].flags |= CPUIDLE_FLAG_OFF;

	drv->state_count = 1;

	/* Prefer the native (cpu-specific) table when one is available. */
	if (icpu && icpu->state_table)
		intel_idle_init_cstates_icpu(drv);
	else
		intel_idle_init_cstates_acpi(drv);
}
/** * intel_idle_cpu_init - Register the target CPU with the cpuidle core. * @cpu: CPU to initialize. * * Register a cpuidle device object for @cpu and update its MSRs in accordance * with the processor model flags.
*/ staticint intel_idle_cpu_init(unsignedint cpu)
/*
 * NOTE(review): extraction damage — the lines after tick_broadcast_enable()
 * (re-reading the per-cpu device and recursing into intel_idle_cpu_init())
 * appear to belong to a separate CPU-online/hotplug callback upstream that
 * got fused into this function. Also note the fused 'elseif' token below.
 * Restore from upstream intel_idle.c.
 */
{ struct cpuidle_device *dev;
dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu);
dev->cpu = cpu;
if (cpuidle_register_device(dev)) {
pr_debug("cpuidle_register_device %d failed!\n", cpu); return -EIO;
}
if (auto_demotion_disable_flags)
auto_demotion_disable();
/* Apply the model-selected C1E promotion policy on this CPU. */
if (c1e_promotion == C1E_PROMOTION_ENABLE)
c1e_promotion_enable(); elseif (c1e_promotion == C1E_PROMOTION_DISABLE)
c1e_promotion_disable();
/* Without ARAT the APIC timer stops in deep C-states; use broadcast. */
if (!boot_cpu_has(X86_FEATURE_ARAT))
tick_broadcast_enable();
/* * Some systems can hotplug a cpu at runtime after * the kernel has booted, we have to initialize the * driver in this case
*/
dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu); if (!dev->registered) return intel_idle_cpu_init(cpu);
return 0;
}
/** * intel_idle_cpuidle_devices_uninit - Unregister all cpuidle devices.
*/ staticvoid __init intel_idle_cpuidle_devices_uninit(void)
/*
 * NOTE(review): extraction damage — the body below does not match the
 * kernel-doc header: it manipulates the C1 auto-(un)demotion bits of
 * MSR_PKG_CST_CONFIG_CONTROL and reads an 'enable' flag and 'msr_val' that
 * are not declared here, so it appears to be the body of a separate
 * C1-demotion helper fused onto this header. Restore from upstream.
 */
{ int i;
rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_val); /* * Enable/disable C1 undemotion along with C1 demotion, as this is the * most sensible configuration in general.
*/ if (enable)
msr_val |= NHM_C1_AUTO_DEMOTE | SNB_C1_AUTO_UNDEMOTE; else
msr_val &= ~(NHM_C1_AUTO_DEMOTE | SNB_C1_AUTO_UNDEMOTE);
wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_val);
}
/*
 * NOTE(review): extraction damage — this is the tail of a sysfs show
 * callback (it emits the C1-demotion bit into 'buf'; DEVICE_ATTR_RW names
 * it intel_c1_demotion); the function header, 'msr_val' declaration and
 * opening brace are missing from this chunk. Restore from upstream.
 */
/* * Read the MSR value for a CPU and assume it is the same for all CPUs. Any other * configuration would be a BIOS bug.
*/
rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_val); return sysfs_emit(buf, "%d\n", !!(msr_val & NHM_C1_AUTO_DEMOTE));
} static DEVICE_ATTR_RW(intel_c1_demotion);
/*
 * NOTE(review): extraction damage — this body starts as the sysfs setup
 * routine but, from the boot_option_idle_override check onward, continues
 * with CPU-matching logic ('id', max_cstate, MWAIT probing) that upstream
 * lives in the module init function; the function never closes in this
 * chunk. Also note the fused 'staticint' token. Restore from upstream.
 */
staticint __init intel_idle_sysfs_init(void)
{ int err;
if (!c1_demotion_supported) return 0;
sysfs_root = bus_get_dev_root(&cpu_subsys); if (!sysfs_root) return 0;
/* Do not load intel_idle at all for now if idle= is passed */ if (boot_option_idle_override != IDLE_NO_OVERRIDE) return -ENODEV;
if (max_cstate == 0) {
pr_debug("disabled\n"); return -EPERM;
}
id = x86_match_cpu(intel_idle_ids); if (id) { if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
pr_debug("Please enable MWAIT in BIOS SETUP\n"); return -ENODEV;
}
} else {
id = x86_match_cpu(intel_mwait_ids); if (!id) return -ENODEV;
}
/*
 * NOTE(review): the following text is a web-extraction artifact (German
 * webpage disclaimer boilerplate), not part of the source file. English
 * translation preserved for reference:
 * "The information on this website has been carefully compiled to the best
 * of our knowledge. However, neither completeness, nor correctness, nor
 * quality of the provided information is guaranteed.
 * Note: The colored syntax highlighting and the measurement are still
 * experimental."
 */