#define ATTR_LEN		16

/* One object for each MCE bank, shared by all CPUs */
struct mce_bank_dev {
	struct device_attribute	attr;			/* device attribute */
	char			attrname[ATTR_LEN];	/* attribute name */
	u8			bank;			/* bank number */
};
static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
/*
 * MCA banks polled by the periodic polling timer for corrected events.
 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
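
/*
 * The range initializer above marks every bank as polled by default. Banks
 * that CMCI later claims (or that get disabled) are cleared from this
 * per-CPU mask, leaving only the banks the timer still has to cover.
 */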
/*
 * MCA banks controlled through firmware first for corrected errors.
 * This is a global list of banks for which we won't enable CMCI and we
 * won't poll. Firmware controls these banks and is responsible for
 * reporting corrected errors through GHES. Uncorrected/recoverable
 * errors are still notified through a machine check.
 */
mce_banks_t mce_banks_ce_disabled;
/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
void mce_prep_record_common(struct mce *m)
{
	m->cpuid	= cpuid_eax(1);
	m->cpuvendor	= boot_cpu_data.x86_vendor;
	m->mcgcap	= native_rdmsrq(MSR_IA32_MCG_CAP);
	/* need the internal __ version to avoid deadlocks */
	m->time		= __ktime_get_real_seconds();
}
		if (m->cs == __KERNEL_CS)
			pr_cont("{%pS}", (void *)(unsigned long)m->ip);
		pr_cont("\n");
	}

	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);
	if (m->ppin)
		pr_cont("PPIN %llx ", m->ppin);

	if (mce_flags.smca) {
		if (m->synd)
			pr_cont("SYND %llx ", m->synd);
		if (err->vendor.amd.synd1)
			pr_cont("SYND1 %llx ", err->vendor.amd.synd1);
		if (err->vendor.amd.synd2)
			pr_cont("SYND2 %llx ", err->vendor.amd.synd2);
		if (m->ipid)
			pr_cont("IPID %llx ", m->ipid);
	}

	pr_cont("\n");
	/*
	 * Note this output is parsed by external tools and old fields
	 * should not be changed.
	 */
	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
		m->microcode);
}
	/*
	 * Allow instrumentation around external facilities usage. Not that it
	 * matters a whole lot since the machine is going to panic anyway.
	 */
	instrumentation_begin();

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_panicked) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_panicked) > 1)
			goto out;
	}
	pending = mce_gen_pool_prepare_records();
	/* First print corrected ones that are still unlogged */
	llist_for_each_entry(l, pending, llnode) {
		struct mce_hw_err *err = &l->err;
		struct mce *m = &err->m;

		if (!(m->status & MCI_STATUS_UC)) {
			print_mce(err);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	/* Now print uncorrected but with the final one last */
	llist_for_each_entry(l, pending, llnode) {
		struct mce_hw_err *err = &l->err;
		struct mce *m = &err->m;

		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || mce_cmp(m, &final->m)) {
			print_mce(err);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	if (final) {
		print_mce(final);
		if (!apei_err)
			apei_err = apei_write_mce(&final->m);
	}
	if (exp)
		pr_emerg(HW_ERR "Machine check: %s\n", exp);

	memmsg = mce_dump_aux_info(&final->m);
	if (memmsg)
		pr_emerg(HW_ERR "Machine check: %s\n", memmsg);

	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mca_cfg.panic_timeout;

		/*
		 * Kdump skips the poisoned page in order to avoid
		 * touching the error bits again. Poison the page even
		 * if the error is fatal and the machine is about to
		 * panic.
		 */
		if (kexec_crash_loaded()) {
			if (final && (final->m.status & MCI_STATUS_ADDRV)) {
				struct page *p;

				p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT);
				if (p)
					SetPageHWPoison(p);
			}
		}

		panic(msg);
	} else
		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);

out:
	instrumentation_end();
}
/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __this_cpu_read(injectm.bank);

	if (msr == mca_cfg.rip_msr)
		return offsetof(struct mce, ip);
	if (msr == mca_msr_reg(bank, MCA_STATUS))
		return offsetof(struct mce, status);
	if (msr == mca_msr_reg(bank, MCA_ADDR))
		return offsetof(struct mce, addr);
	if (msr == mca_msr_reg(bank, MCA_MISC))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}
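
/*
 * msr_to_offset() lets the MSR access wrappers below redirect reads and
 * writes of the MCA MSRs to the matching field of the per-CPU injectm
 * record while a software-injected error is being processed (i.e. while
 * injectm.finished is set).
 */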
void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
{
	if (wrmsr) {
		pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
			 (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
			 regs->ip, (void *)regs->ip);
	} else {
		pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
			 (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
	}

	show_stack_regs(regs);

	panic("MCA architectural violation!\n");

	while (true)
		cpu_relax();
}
/* MSR access wrappers used for error injection */

noinstr u64 mce_rdmsrq(u32 msr)
{
	EAX_EDX_DECLARE_ARGS(val, low, high);

	if (__this_cpu_read(injectm.finished)) {
		int offset;
		u64 ret;

		instrumentation_begin();

		offset = msr_to_offset(msr);
		if (offset < 0)
			ret = 0;
		else
			ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);

		instrumentation_end();

		return ret;
	}

	/*
	 * RDMSR on MCA MSRs should not fault. If they do, this is very much an
	 * architectural violation and needs to be reported to hw vendor. Panic
	 * the box to not allow any further progress.
	 */
	asm volatile("1: rdmsr\n"
		     "2:\n"
		     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE)
		     : EAX_EDX_RET(val, low, high) : "c" (msr));

	return EAX_EDX_VAL(val, low, high);
}
/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into our "mce" struct so that we can use it later to assess
 * the severity of the problem as we read per-bank specific details.
 */
static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs)
{
	struct mce *m;

	/*
	 * Enable instrumentation around mce_prep_record() which calls external
	 * facilities.
	 */
	instrumentation_begin();
	mce_prep_record(err);
	instrumentation_end();

	m = &err->m;
	m->mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
	if (regs) {
		/*
		 * Get the address of the instruction at the time of
		 * the machine check error.
		 */
		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
			m->ip = regs->ip;
			m->cs = regs->cs;

			/*
			 * When in VM86 mode make the cs look like ring 3
			 * always. This is a lie, but it's better than passing
			 * the additional vm86 bit around everywhere.
			 */
			if (v8086_mode(regs))
				m->cs |= 3;
		}
		/* Use accurate RIP reporting if available. */
		if (mca_cfg.rip_msr)
			m->ip = mce_rdmsrq(mca_cfg.rip_msr);
	}
}
bool mce_is_memory_error(struct mce *m)
{
	switch (m->cpuvendor) {
	case X86_VENDOR_AMD:
	case X86_VENDOR_HYGON:
		return amd_mce_is_memory_error(m);

	case X86_VENDOR_INTEL:
	case X86_VENDOR_ZHAOXIN:
		/*
		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
		 *
		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
		 * indicating a memory error. Bit 8 is used for indicating a
		 * cache hierarchy error. The combination of bit 2 and bit 3
		 * is used for indicating a `generic' cache hierarchy error.
		 * But we can't just blindly check the above bits, because if
		 * bit 11 is set, then it is a bus/interconnect error - and
		 * either way the above bits just give more detail on what
		 * bus/interconnect error happened. Note that bit 12 can be
		 * ignored, as it's the "filter" bit.
		 */
		return (m->status & 0xef80) == BIT(7) ||
		       (m->status & 0xef00) == BIT(8) ||
		       (m->status & 0xeffc) == 0xc;

	default:
		return false;
	}
}
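
/*
 * Illustration of the Intel/Zhaoxin decode above (example values only, not
 * an exhaustive list): an MCACOD with bit 7 set and bits 8-11 clear, e.g.
 * 0x0090, matches the first mask and counts as a memory error; 0x0135 has
 * bit 8 set and matches the cache hierarchy test; any code with bit 11 set
 * (bus/interconnect) fails all three masks because bit 11 is part of each.
 */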
/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
static bool mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		mce_work_trigger();

		if (__ratelimit(&ratelimit))
			pr_info(HW_ERR "Machine check events logged\n");

		return true;
	}

	return false;
}

	if (mca_cfg.print_all || !(err->m.kflags))
		__print_mce(err);

	return NOTIFY_DONE;
}
static struct notifier_block mce_default_nb = {
	.notifier_call	= mce_default_notifier,
	/* lowest prio, we want it to run last. */
	.priority	= MCE_PRIO_LOWEST,
};
/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyways, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
	struct mce_hw_err err;
	struct mce *m;
	int i;

	this_cpu_inc(mce_poll_count);

	mce_gather_info(&err, NULL);
	m = &err.m;

	if (flags & MCP_TIMESTAMP)
		m->tsc = rdtsc();

	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		/*
		 * Update storm tracking here, before checking for the
		 * MCI_STATUS_VAL bit. Valid corrected errors count
		 * towards declaring, or maintaining, storm status. No
		 * error in a bank counts towards avoiding, or ending,
		 * storm status.
		 */
		if (!mca_cfg.cmci_disabled)
			mce_track_storm(m);

		/* If this entry is not valid, ignore it */
		if (!(m->status & MCI_STATUS_VAL))
			continue;

		/*
		 * If we are logging everything (at CPU online) or this
		 * is a corrected error, then we must log it.
		 */
		if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC))
			goto log_it;

		/*
		 * Newer Intel systems that support software error
		 * recovery need to make additional checks. Other
		 * CPUs should skip over uncorrected errors, but log
		 * everything else.
		 */
		if (!mca_cfg.ser) {
			if (m->status & MCI_STATUS_UC)
				continue;
			goto log_it;
		}

		/*
		 * Skip anything else. Presumption is that our read of this
		 * bank is racing with a machine check. Leave the log alone
		 * for do_machine_check() to deal with it.
		 */
		continue;

log_it:
		if (flags & MCP_DONTLOG)
			goto clear_it;

		mce_read_aux(&err, i);
		m->severity = mce_severity(m, NULL, NULL, false);
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */

		if (mca_cfg.dont_log_ce && !mce_usable_address(m))
			goto clear_it;

		if (flags & MCP_QUEUE_LOG)
			mce_gen_pool_add(&err);
		else
			mce_log(&err);

clear_it:
		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */
/*
 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
 * Vol 3B Table 15-20). But this confuses both the code that determines
 * whether the machine check occurred in kernel or user mode, and also
 * the severity assessment code. Pretend that EIPV was set, and take the
 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
 */
static __always_inline void
quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
{
	if (bank != 0)
		return;
	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
		return;
	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
			  MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
			  MCACOD)) !=
			 (MCI_STATUS_UC|MCI_STATUS_EN|
			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
			  MCI_STATUS_AR|MCACOD_INSTR))
		return;

	m->mcgstatus |= MCG_STATUS_EIPV;
	m->ip = regs->ip;
	m->cs = regs->cs;
}
/*
 * Disable fast string copy and return from the MCE handler upon the first SRAR
 * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
 * CPUs.
 * The fast string copy instructions ("REP; MOVS*") could consume an
 * uncorrectable memory error in the cache line _right after_ the desired region
 * to copy and raise an MCE with RIP pointing to the instruction _after_ the
 * "REP; MOVS*".
 * This mitigation addresses the issue completely with the caveat of performance
 * degradation on the CPU affected. This is still better than the OS crashing on
 * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
 * kernel context (e.g., copy_page).
 *
 * Returns true when fast string copy on CPU has been disabled.
 */
static noinstr bool quirk_skylake_repmov(void)
{
	u64 mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
	u64 misc_enable = mce_rdmsrq(MSR_IA32_MISC_ENABLE);
	u64 mc1_status;

	/*
	 * Apply the quirk only to local machine checks, i.e., no broadcast
	 * sync is needed.
	 */
	if (!(mcgstatus & MCG_STATUS_LMCES) ||
	    !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
		return false;

		instrumentation_begin();
		pr_err_once("Erratum detected, disable fast string copy instructions.\n");
		instrumentation_end();

		return true;
	}

	return false;
}
/*
 * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption
 * errors. This means mce_gather_info() will not save the "ip" and "cs" registers.
 *
 * However, the context is still valid, so save the "cs" register for later use.
 *
 * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV.
 *
 * The Instruction Fetch Unit is at MCA bank 1 for all affected systems.
 */
static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs)
{
	if (bank != 1)
		return;
	if (!(m->status & MCI_STATUS_POISON))
		return;

	m->cs = regs->cs;
}
/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg,
					  unsigned long *validp, struct pt_regs *regs)
{
	struct mce *m = &err->m;
	char *tmp = *msg;
	int i;

	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
		if (!(m->status & MCI_STATUS_VAL))
			continue;

		arch___set_bit(i, validp);
		if (mce_flags.snb_ifu_quirk)
			quirk_sandybridge_ifu(i, m, regs);

		if (mce_flags.zen_ifu_quirk)
			quirk_zen_ifu(i, m, regs);
/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing equals its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Track which CPUs entered the MCA broadcast synchronization and which not in
 * order to print holdouts.
 */
static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
/*
 * Check if a timeout waiting for other CPUs happened.
 */
static noinstr int mce_timed_out(u64 *t, const char *msg)
{
	int ret = 0;

	/* Enable instrumentation around calls to external facilities */
	instrumentation_begin();

	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_panicked))
		wait_for_panic();
	if (!mca_cfg.monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
			pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
				 cpumask_pr_args(&mce_missing_cpus));
		mce_panic(msg, NULL, NULL);

		ret = 1;
		goto out;
	}
	*t -= SPINUNIT;

out:
	touch_nmi_watchdog();

	instrumentation_end();

	return ret;
}
/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also makes sure always all CPUs' errors are examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPUs). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	struct mce_hw_err *err = NULL;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	int cpu;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu);
		struct mce *mtmp = &etmp->m;

		if (mtmp->severity > global_worst) {
			global_worst = mtmp->severity;
			err = &per_cpu(hw_errs_seen, cpu);
			m = &err->m;
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY) {
		/* call mce_severity() to get "msg" for panic */
		mce_severity(m, NULL, &msg, true);
		mce_panic("Fatal machine check", err, msg);
	}

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let continue the others, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY)
		mce_panic("Fatal machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the hw_errs_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err));
}
static atomic_t global_nwo;
/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static noinstr int mce_start(int *no_way_out)
{
	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
	int order, ret = -1;

	if (!timeout)
		return ret;

	raw_atomic_add(*no_way_out, &global_nwo);
	/*
	 * Rely on the implied barrier below, such that global_nwo
	 * is updated before mce_callin.
	 */
	order = raw_atomic_inc_return(&mce_callin);
	arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);

	/* Enable instrumentation around calls to external facilities */
	instrumentation_begin();

	/*
	 * Wait for everyone.
	 */
	while (raw_atomic_read(&mce_callin) != num_online_cpus()) {
		if (mce_timed_out(&timeout,
				  "Timeout: Not all CPUs entered broadcast exception handler")) {
			raw_atomic_set(&global_nwo, 0);
			goto out;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		raw_atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (raw_atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout,
					  "Timeout: Subject CPUs unable to finish machine check processing")) {
				raw_atomic_set(&global_nwo, 0);
				goto out;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = raw_atomic_read(&global_nwo);

	ret = order;

out:
	instrumentation_end();

	return ret;
}
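
/*
 * Note: the value returned by mce_start() is this CPU's callin order (1
 * means it is the Monarch); -1 means rendezvous was disabled or timed out,
 * in which case the caller relies only on its local no_way_out state.
 */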
/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static noinstr int mce_end(int order)
{
	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
	int ret = -1;

	/* Allow instrumentation around external facilities. */
	instrumentation_begin();

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= num_online_cpus()) {
			if (mce_timed_out(&timeout,
					  "Timeout: Monarch CPU unable to finish machine check processing"))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout,
					  "Timeout: Monarch CPU did not finish machine check processing"))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		ret = 0;
		goto out;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	cpumask_setall(&mce_missing_cpus);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);

out:
	instrumentation_end();

	return ret;
}
static __always_inline void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		if (arch_test_bit(i, toclear))
			mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
	}
}
/*
 * Cases where we avoid rendezvous handler timeout:
 * 1) If this CPU is offline.
 *
 * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
 *  skip those CPUs which remain looping in the 1st kernel - see
 *  crash_nmi_callback().
 *
 * Note: there still is a small window between kexec-ing and the new,
 * kdump kernel establishing a new #MC handler where a broadcasted MCE
 * might not get handled properly.
 */
static noinstr bool mce_check_crashing_cpu(void)
{
	unsigned int cpu = smp_processor_id();
static __always_inline int
__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs,
		struct mce_hw_err *final, unsigned long *toclear,
		unsigned long *valid_banks, int no_way_out, int *worst)
{
	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
	struct mca_config *cfg = &mca_cfg;
	int severity, i, taint = 0;
	struct mce *m = &err->m;

	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		arch___clear_bit(i, toclear);
		if (!arch_test_bit(i, valid_banks))
			continue;

		if (!mce_banks[i].ctl)
			continue;

		m->misc = 0;
		m->addr = 0;
		m->bank = i;

		m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
		if (!(m->status & MCI_STATUS_VAL))
			continue;

		/*
		 * Corrected or non-signaled errors are handled by
		 * machine_check_poll(). Leave them alone, unless this panics.
		 */
		if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
			!no_way_out)
			continue;

		/* Set taint even when machine check was not enabled. */
		taint++;

		severity = mce_severity(m, regs, NULL, true);

		/*
		 * When machine check was for corrected/deferred handler don't
		 * touch, unless we're panicking.
		 */
		if ((severity == MCE_KEEP_SEVERITY ||
		     severity == MCE_UCNA_SEVERITY) && !no_way_out)
			continue;

		arch___set_bit(i, toclear);

		/* Machine check event was not enabled. Clear, but ignore. */
		if (severity == MCE_NO_SEVERITY)
			continue;

		/*
		 * Enable instrumentation around the mce_log() call which is
		 * done in #MC context, where instrumentation is disabled.
		 */
		instrumentation_begin();
		mce_log(err);
		instrumentation_end();
	p->mce_count = 0;
	pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);

	if (!p->mce_ripv)
		flags |= MF_MUST_KILL;

	pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
	ret = memory_failure(pfn, flags);
	if (!ret) {
		set_mce_nospec(pfn);
		sync_core();
		return;
	}

	/*
	 * -EHWPOISON from memory_failure() means that it already sent SIGBUS
	 * to the current process with the proper error info,
	 * -EOPNOTSUPP means hwpoison_filter() filtered the error event,
	 *
	 * In both cases, no further processing is required.
	 */
	if (ret == -EHWPOISON || ret == -EOPNOTSUPP)
		return;

	pr_err("Memory error not recovered");
	kill_me_now(cb);
}

	p->mce_count = 0;
	pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);

	pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
	if (!memory_failure(pfn, 0))
		set_mce_nospec(pfn);
}
	/* First call, save all the details */
	if (count == 1) {
		current->mce_addr = m->addr;
		current->mce_kflags = m->kflags;
		current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
		current->mce_whole_page = whole_page(m);
		current->mce_kill_me.func = func;
	}

	/* Ten is likely overkill. Don't expect more than two faults before task_work() */
	if (count > 10)
		mce_panic("Too many consecutive machine checks while accessing user data",
			  err, msg);

	/* Second or later call, make sure page address matches the one from first call */
	if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
		mce_panic("Consecutive machine checks to different user pages", err, msg);

	/* Do not call task_work_add() more than once */
	if (count > 1)
		return;
/*
 * The actual machine check handler. This only handles real exceptions when
 * something got corrupted coming in through int 18.
 *
 * This is executed in #MC context not subject to normal locking rules.
 * This implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 *
 * Tracing and kprobes are disabled: if we interrupted a kernel context
 * with IF=1, we need to minimize stack usage. There are also recursion
 * issues: if the machine check was due to a failure of the memory
 * backing the user stack, tracing that reads the user stack will cause
 * potentially infinite recursion.
 *
 * Currently, the #MC handler calls out to a number of external facilities
 * and, therefore, allows instrumentation around them. The optimal thing to
 * have would be to do the absolutely minimal work required in #MC context
 * and have instrumentation disabled only around that. Further processing can
 * then happen in process context where instrumentation is allowed. Achieving
 * that requires careful auditing and modifications. Until then, the code
 * allows instrumentation temporarily, where required.
 */
noinstr void do_machine_check(struct pt_regs *regs)
{
	int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
	DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
	struct mce_hw_err *final;
	struct mce_hw_err err;
	char *msg = NULL;
	struct mce *m;
	/*
	 * When no restart IP might need to kill or panic.
	 * Assume the worst for now, but if we find the
	 * severity is MCE_AR_SEVERITY we have other options.
	 */
	if (!(m->mcgstatus & MCG_STATUS_RIPV))
		kill_current_task = 1;
	/*
	 * Check if this MCE is signaled to only this logical processor,
	 * on Intel, Zhaoxin only.
	 */
	if (m->cpuvendor == X86_VENDOR_INTEL ||
	    m->cpuvendor == X86_VENDOR_ZHAOXIN)
		lmce = m->mcgstatus & MCG_STATUS_LMCES;

	/*
	 * Local machine check may already know that we have to panic.
	 * Broadcast machine check begins rendezvous in mce_start()
	 * Go through all banks in exclusion of the other CPUs. This way we
	 * don't report duplicated events on shared banks because the first one
	 * to see it will clear it.
	 */
	if (lmce) {
		if (no_way_out)
			mce_panic("Fatal local machine check", &err, msg);
	} else {
		order = mce_start(&no_way_out);
	}

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (!lmce) {
		if (mce_end(order) < 0) {
			if (!no_way_out)
				no_way_out = worst >= MCE_PANIC_SEVERITY;

			if (no_way_out)
				mce_panic("Fatal machine check on current CPU", &err, msg);
		}
	} else {
		/*
		 * If there was a fatal machine check we should have
		 * already called mce_panic earlier in this function.
		 * Since we re-read the banks, we might have found
		 * something new. Check again to see if we found a
		 * fatal error. We call "mce_severity()" again to
		 * make sure we have the right "msg".
		 */
		if (worst >= MCE_PANIC_SEVERITY) {
			mce_severity(m, regs, &msg, true);
			mce_panic("Local fatal machine check!", &err, msg);
		}
	}

	/*
	 * Enable instrumentation around the external facilities like task_work_add()
	 * (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this
	 * properly would need a lot more involved reorganization.
	 */
	instrumentation_begin();

	if (taint)
		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

	if (worst != MCE_AR_SEVERITY && !kill_current_task)
		goto out;

	/* Fault was in user mode and we need to take some action */
	if ((m->cs & 3) == 3) {
		/* If this triggers there is no way to recover. Die hard. */
		BUG_ON(!on_thread_stack() || !user_mode(regs));

		if (!mce_usable_address(m))
			queue_task_work(&err, msg, kill_me_now);
		else
			queue_task_work(&err, msg, kill_me_maybe);

	} else if (m->mcgstatus & MCG_STATUS_SEAM_NR) {
		/*
		 * Saved RIP on stack makes it look like the machine check
		 * was taken in the kernel on the instruction following
		 * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates
		 * that the machine check was taken inside SEAM non-root
		 * mode. CPU core has already marked that guest as dead.
		 * It is OK for the kernel to resume execution at the
		 * apparent point of the machine check as the fault did
		 * not occur there. Mark the page as poisoned so it won't
		 * be added to free list when the guest is terminated.
		 */
		if (mce_usable_address(m)) {
			struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT);

			if (p)
				SetPageHWPoison(p);
		}
	} else {
		/*
		 * Handle an MCE which has happened in kernel space but from
		 * which the kernel can recover: ex_has_fault_handler() has
		 * already verified that the rIP at which the error happened is
		 * a rIP from which the kernel can recover (by jumping to
		 * recovery code specified in _ASM_EXTABLE_FAULT()) and the
		 * corresponding exception handler which would do that is the
		 * proper one.
		 */
		if (m->kflags & MCE_IN_KERNEL_RECOV) {
			if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
				mce_panic("Failed kernel mode recovery", &err, msg);
		}

		if (m->kflags & MCE_IN_KERNEL_COPYIN)
			queue_task_work(&err, msg, kill_me_never);
	}
#ifndef CONFIG_MEMORY_FAILURE
int memory_failure(unsigned long pfn, int flags)
{
	/* mce_severity() should not hand us an ACTION_REQUIRED error */
	BUG_ON(flags & MF_ACTION_REQUIRED);
	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
	       pfn);

	return 0;
}
#endif
/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static unsigned long check_interval = INITIAL_CHECK_INTERVAL;

	if (mce_available(this_cpu_ptr(&cpu_info)))
		mc_poll_banks();

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the polling
	 * interval, otherwise increase the polling interval.
	 */
	if (mce_notify_irq())
		iv = max(iv / 2, (unsigned long) HZ/100);
	else
		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
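
	/*
	 * Example of the adaptive interval above: after a run that logged an
	 * event, the next poll fires in half the current interval (with a
	 * floor of HZ/100 jiffies, i.e. 10ms); after a run that found
	 * nothing, the interval doubles, capped at check_interval seconds.
	 */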
/*
 * When a storm starts on any bank on this CPU, switch to polling
 * once per second. When the storm ends, revert to the default
 * polling interval.
 */
void mce_timer_kick(bool storm)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);

	mce_set_storm_mode(storm);

	if (storm)
		__start_timer(t, HZ);
	else
		__this_cpu_write(mce_next_interval, check_interval * HZ);
}
/* Must not be called in IRQ context where timer_delete_sync() can deadlock */
static void mce_timer_delete_all(void)
{
	int cpu;

	for (i = 0; i < n_banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		/*
		 * Init them all, __mcheck_cpu_apply_quirks() is going to apply
		 * the required vendor quirks before
		 * __mcheck_cpu_init_clear_banks() does the final bank setup.
		 */
		b->ctl = -1ULL;
		b->init = true;
	}
}
/*
 * Initialize Machine Checks for a CPU.
 */
static void __mcheck_cpu_cap_init(void)
{
	u64 cap;
	u8 b;

	rdmsrq(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;

	if (b > MAX_NR_BANKS) {
		pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
			smp_processor_id(), MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	this_cpu_write(mce_num_banks, b);

	__mcheck_cpu_mce_banks_init();

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
	/*
	 * Log the machine checks left over from the previous reset. Log them
	 * only, do not start processing them. That will happen in mcheck_late_init()
	 * when all consumers have been registered on the notifier chain.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks);

	cr4_set_bits(X86_CR4_MCE);

	rdmsrq(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
}
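
/*
 * Note: it is the CR4.MCE write in the init path above that actually turns
 * on machine check exceptions for this CPU; the preceding poll only queues
 * up whatever errors were left over from before boot for later logging.
 */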
static void __mcheck_cpu_init_clear_banks(void)
{
	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
	int i;

	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		struct mce_bank *b = &mce_banks[i];

/*
 * Do a final check to see if there are any unused/RAZ banks.
 *
 * This must be done after the banks have been initialized and any quirks have
 * been applied.
 *
 * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
 * Otherwise, a user who disables a bank will not be able to re-enable it
 * without a system reboot.
 */
static void __mcheck_cpu_check_banks(void)
{
	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
	u64 msrval;
	int i;

	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		struct mce_bank *b = &mce_banks[i];
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
		/*
		 * disable GART TBL walk error reporting, which
		 * trips off incorrectly with the IOMMU & 3ware
		 * & Cerberus:
		 */
		clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
	}

	if (c->x86 < 0x11 && mca_cfg.bootlog < 0) {
		/*
		 * Lots of broken BIOS around that don't clear them
		 * by default and leave crap in there. Don't log:
		 */
		mca_cfg.bootlog = 0;
	}

	/*
	 * Various K7s with broken bank 0 around. Always disable
	 * by default.
	 */
	if (c->x86 == 6 && this_cpu_read(mce_num_banks))
		mce_banks[0].ctl = 0;

	/*
	 * overflow_recov is supported for F15h Models 00h-0fh
	 * even though we don't have a CPUID bit for it.
	 */
	if (c->x86 == 0x15 && c->x86_model <= 0xf)
		mce_flags.overflow_recov = 1;
	/* Older CPUs (prior to family 6) don't need quirks. */
	if (c->x86_vfm < INTEL_PENTIUM_PRO)
		return;

	/*
	 * SDM documents that on family 6 bank 0 should not be written
	 * because it aliases to another special BIOS controlled
	 * register.
	 * But it's not aliased anymore on model 0x1a+
	 * Don't ignore bank 0 completely because there could be a
	 * valid event later, merely don't write CTL0.
	 */
	if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks))
		mce_banks[0].init = false;

	/*
	 * All newer Intel systems support MCE broadcasting. Enable
	 * synchronization with a one second timeout.
	 */
	if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0)
		mca_cfg.monarch_timeout = USEC_PER_SEC;

	/*
	 * There are also broken BIOSes on some Pentium M and
	 * earlier systems:
	 */
	if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0)
		mca_cfg.bootlog = 0;

	if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
		mce_flags.snb_ifu_quirk = 1;

	/*
	 * Skylake, Cascade Lake and Cooper Lake require a quirk on
	 * rep movs.
	 */
	if (c->x86_vfm == INTEL_SKYLAKE_X)
		mce_flags.skx_repmov_quirk = 1;
}
static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c)
{
	/*
	 * All newer Zhaoxin CPUs support MCE broadcasting. Enable
	 * synchronization with a one second timeout.
	 */
	if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
		if (mca_cfg.monarch_timeout < 0)
			mca_cfg.monarch_timeout = USEC_PER_SEC;
	}
}
/* Add per CPU specific workarounds here */
static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
	struct mca_config *cfg = &mca_cfg;

	switch (c->x86_vendor) {
	case X86_VENDOR_UNKNOWN:
		pr_info("unknown CPU type - not enabling MCE support\n");
		return false;
	case X86_VENDOR_AMD:
		apply_quirks_amd(c);
		break;
	case X86_VENDOR_INTEL:
		apply_quirks_intel(c);
		break;
	case X86_VENDOR_ZHAOXIN:
		apply_quirks_zhaoxin(c);
		break;
	}

	if (cfg->monarch_timeout < 0)
		cfg->monarch_timeout = 0;
	if (cfg->bootlog != 0)
		cfg->panic_timeout = 30;

	return true;
}
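
/*
 * A false return from __mcheck_cpu_apply_quirks() makes mcheck_cpu_init()
 * mark MCE support as disabled (mca_cfg.disabled), so none of the later
 * per-CPU init steps are run.
 */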
static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return false;

	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		mce_flags.p5 = 1;
		return true;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		mce_flags.winchip = 1;
		return true;
	default:
		return false;
	}

	return false;
}
/*
 * Init basic CPU features needed for early decoding of MCEs.
 */
static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
{
	if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
		mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
		mce_flags.succor	 = !!cpu_has(c, X86_FEATURE_SUCCOR);
		mce_flags.smca		 = !!cpu_has(c, X86_FEATURE_SMCA);
		mce_flags.amd_threshold	 = 1;
	}
}
	/*
	 * These CPUs have MCA bank 8 which reports only one error type called
	 * SVAD (System View Address Decoder). The reporting of that error is
	 * controlled by IA32_MC8.CTL.0.
	 *
	 * If enabled, prefetching on these CPUs will cause SVAD MCE when
	 * virtual machines start and result in a system panic. Always disable
	 * bank 8 SVAD error by default.
	 */
	if ((c->x86 == 7 && c->x86_model == 0x1b) ||
	    (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
		if (this_cpu_read(mce_num_banks) > 8)
			mce_banks[8].ctl = 0;
	}
#ifdef CONFIG_X86_FRED
/*
 * When occurred on different ring level, i.e., from user or kernel
 * context, #MCE needs to be handled on different stack: User #MCE
 * on current task stack, while kernel #MCE on a dedicated stack.
 *
 * This is exactly how FRED event delivery invokes an exception
 * handler: ring 3 event on level 0 stack, i.e., current task stack;
 * ring 0 event on the #MCE dedicated stack specified in the
 * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry
 * stub doesn't do stack switch.
 */
DEFINE_FREDENTRY_MCE(exc_machine_check)
{
	unsigned long dr7;
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void mcheck_cpu_init(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return;

	if (__mcheck_cpu_ancient_init(c))
		return;

	if (!mce_available(c))
		return;

	__mcheck_cpu_cap_init();

	if (!__mcheck_cpu_apply_quirks(c)) {
		mca_cfg.disabled = 1;
		return;
	}

	if (!mce_gen_pool_init()) {
		mca_cfg.disabled = 1;
		pr_emerg("Couldn't allocate MCE records pool!\n");
		return;
	}
/*
 * Called for each booted CPU to clear some machine checks opt-ins
 */
void mcheck_cpu_clear(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return;

	if (!mce_available(c))
		return;

	/*
	 * Possibly to clear general settings generic to x86
	 * __mcheck_cpu_clear_generic(c);
	 */
	__mcheck_cpu_clear_vendor(c);
}
static void __mce_disable_bank(void *arg)
{
	int bank = *((int *)arg);

	__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
	cmci_disable_bank(bank);
}

void mce_disable_bank(int bank)
{
	if (bank >= this_cpu_read(mce_num_banks)) {
		pr_warn(FW_BUG "Ignoring request to disable invalid MCA bank %d.\n",
			bank);
		return;
	}
	set_bit(bank, mce_banks_ce_disabled);
	on_each_cpu(__mce_disable_bank, &bank, 1);
}
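
/*
 * Used for firmware-first (GHES) corrected error handling: a bank recorded
 * in mce_banks_ce_disabled is removed from every CPU's polling mask and has
 * CMCI disabled for it, so the OS neither polls that bank nor receives
 * corrected-error interrupts from it.
 */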
/*
 * mce=off Disables machine check
 * mce=no_cmci Disables CMCI
 * mce=no_lmce Disables LMCE
 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
 * mce=print_all Print all machine check logs to console
 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h and older.
 * mce=nobootlog Don't log MCEs from before booting.
 * mce=bios_cmci_threshold Don't program the CMCI threshold
 * mce=recovery force enable copy_mc_fragile()
 */
static int __init mcheck_enable(char *str)
{
	struct mca_config *cfg = &mca_cfg;
/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static void mce_disable_error_reporting(void)
{
	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
	int i;

	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrq(mca_msr_reg(i, MCA_CTL), 0);
	}
	return;
}
static void vendor_disable_error_reporting(void)
{
	/*
	 * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
	 * MSRs are socket-wide. Disabling them for just a single offlined CPU
	 * is bad, since it will inhibit reporting for all shared resources on
	 * the socket like the last level cache (LLC), the integrated memory
	 * controller (iMC), etc.
	 */
	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
	    boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
	    boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
		return;

	mce_disable_error_reporting();
}
/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_syscore_resume(void)
{
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
	__mcheck_cpu_init_clear_banks();
}