// SPDX-License-Identifier: GPL-2.0 /* * Implement the AER root port service driver. The driver registers an IRQ * handler. When a root port triggers an AER interrupt, the IRQ handler * collects Root Port status and schedules work. * * Copyright (C) 2006 Intel Corp. * Tom Long Nguyen (tom.l.nguyen@intel.com) * Zhang Yanmin (yanmin.zhang@intel.com) * * (C) Copyright 2009 Hewlett-Packard Development Company, L.P. * Andrew Patterson <andrew.patterson@hp.com>
*/
/*
 * Fields for all AER capable devices. They indicate the errors
 * "as seen by this device". Note that this may mean that if an
 * Endpoint is causing problems, the AER counters may increment
 * at its link partner (e.g. Root Port) because the errors will be
 * "seen" by the link partner and not the problematic Endpoint
 * itself (which may report all counters as 0 as it never saw any
 * problems).
 */

/* Counters for different types of correctable errors */
u64 dev_cor_errs[AER_MAX_TYPEOF_COR_ERRS];
/* Counters for different types of fatal uncorrectable errors */
u64 dev_fatal_errs[AER_MAX_TYPEOF_UNCOR_ERRS];
/* Counters for different types of nonfatal uncorrectable errors */
u64 dev_nonfatal_errs[AER_MAX_TYPEOF_UNCOR_ERRS];
/* Total number of ERR_COR sent by this device */
u64 dev_total_cor_errs;
/* Total number of ERR_FATAL sent by this device */
u64 dev_total_fatal_errs;
/* Total number of ERR_NONFATAL sent by this device */
u64 dev_total_nonfatal_errs;

/*
 * Fields for Root Ports & Root Complex Event Collectors only; these
 * indicate the total number of ERR_COR, ERR_FATAL, and ERR_NONFATAL
 * messages received by the Root Port / Event Collector, INCLUDING the
 * ones that are generated internally (by the Root Port itself).
 */
u64 rootport_total_cor_errs;
u64 rootport_total_fatal_errs;
u64 rootport_total_nonfatal_errs;
/*
 * ECRC policy values.
 *
 * Each macro must sit on its own line: a #define extends to the end of the
 * line, so chaining several on one line folds the later directives into the
 * first macro's replacement text and leaves the others undefined.
 */
#define ECRC_POLICY_DEFAULT 0		/* ECRC set by BIOS */
#define ECRC_POLICY_OFF 1		/* ECRC off for performance */
#define ECRC_POLICY_ON 2		/* ECRC on for data integrity */
/**
 * pcie_set_ecrc_checking - apply the global ECRC policy to one device
 * @dev: the PCI device
 *
 * Does nothing unless pcie_aer_is_native() reports true for @dev. With
 * ECRC_POLICY_OFF the device's ECRC checking is disabled, with
 * ECRC_POLICY_ON it is enabled; every other policy value (including
 * ECRC_POLICY_DEFAULT) leaves the device untouched.
 */
void pcie_set_ecrc_checking(struct pci_dev *dev)
{
	int policy;

	if (!pcie_aer_is_native(dev))
		return;

	/* Sample the policy once, exactly as a switch on it would. */
	policy = ecrc_policy;

	if (policy == ECRC_POLICY_OFF)
		disable_ecrc_checking(dev);
	else if (policy == ECRC_POLICY_ON)
		enable_ecrc_checking(dev);
}
/** * pcie_ecrc_get_policy - parse kernel command-line ecrc option * @str: ECRC policy from kernel command line to use
*/ void pcie_ecrc_get_policy(char *str)
{ int i;
i = match_string(ecrc_policy_str, ARRAY_SIZE(ecrc_policy_str), str); if (i < 0) return;
/* Clear status bits for ERR_FATAL errors only */
pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &status);
pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_SEVER, &sev);
status &= sev; if (status)
pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status);
}
/** * pci_aer_raw_clear_status - Clear AER error registers. * @dev: the PCI device * * Clear AER error status registers unconditionally, regardless of * whether they're owned by firmware or the OS. * * Return: 0 on success, or negative on failure.
*/ int pci_aer_raw_clear_status(struct pci_dev *dev)
{ int aer = dev->aer_cap;
u32 status; int port_type;
/*
 * Short names for correctable error bits, indexed by bit position.
 * A NULL entry means this table has no name for that bit position.
 * The table deliberately spans all 32 bit positions.
 */
static const char *aer_correctable_error_string[] = {
	"RxErr",			/* Bit Position 0	*/
	NULL,				/* Bit Position 1	*/
	NULL,				/* Bit Position 2	*/
	NULL,				/* Bit Position 3	*/
	NULL,				/* Bit Position 4	*/
	NULL,				/* Bit Position 5	*/
	"BadTLP",			/* Bit Position 6	*/
	"BadDLLP",			/* Bit Position 7	*/
	"Rollover",			/* Bit Position 8	*/
	NULL,				/* Bit Position 9	*/
	NULL,				/* Bit Position 10	*/
	NULL,				/* Bit Position 11	*/
	"Timeout",			/* Bit Position 12	*/
	"NonFatalErr",			/* Bit Position 13	*/
	"CorrIntErr",			/* Bit Position 14	*/
	"HeaderOF",			/* Bit Position 15	*/
	NULL,				/* Bit Position 16	*/
	NULL,				/* Bit Position 17	*/
	NULL,				/* Bit Position 18	*/
	NULL,				/* Bit Position 19	*/
	NULL,				/* Bit Position 20	*/
	NULL,				/* Bit Position 21	*/
	NULL,				/* Bit Position 22	*/
	NULL,				/* Bit Position 23	*/
	NULL,				/* Bit Position 24	*/
	NULL,				/* Bit Position 25	*/
	NULL,				/* Bit Position 26	*/
	NULL,				/* Bit Position 27	*/
	NULL,				/* Bit Position 28	*/
	NULL,				/* Bit Position 29	*/
	NULL,				/* Bit Position 30	*/
	NULL,				/* Bit Position 31	*/
};
/*
 * Short names for uncorrectable error bits, indexed by bit position.
 * A NULL entry means this table has no name for that bit position.
 */
static const char *aer_uncorrectable_error_string[] = {
	"Undefined",			/* Bit Position 0	*/
	NULL,				/* Bit Position 1	*/
	NULL,				/* Bit Position 2	*/
	NULL,				/* Bit Position 3	*/
	"DLP",				/* Bit Position 4	*/
	"SDES",				/* Bit Position 5	*/
	NULL,				/* Bit Position 6	*/
	NULL,				/* Bit Position 7	*/
	NULL,				/* Bit Position 8	*/
	NULL,				/* Bit Position 9	*/
	NULL,				/* Bit Position 10	*/
	NULL,				/* Bit Position 11	*/
	"TLP",				/* Bit Position 12	*/
	"FCP",				/* Bit Position 13	*/
	"CmpltTO",			/* Bit Position 14	*/
	"CmpltAbrt",			/* Bit Position 15	*/
	"UnxCmplt",			/* Bit Position 16	*/
	"RxOF",				/* Bit Position 17	*/
	"MalfTLP",			/* Bit Position 18	*/
	"ECRC",				/* Bit Position 19	*/
	"UnsupReq",			/* Bit Position 20	*/
	"ACSViol",			/* Bit Position 21	*/
	"UncorrIntErr",			/* Bit Position 22	*/
	"BlockedTLP",			/* Bit Position 23	*/
	"AtomicOpBlocked",		/* Bit Position 24	*/
	"TLPBlockedErr",		/* Bit Position 25	*/
	"PoisonTLPBlocked",		/* Bit Position 26	*/
	"DMWrReqBlocked",		/* Bit Position 27	*/
	"IDECheck",			/* Bit Position 28	*/
	"MisIDETLP",			/* Bit Position 29	*/
	"PCRC_CHECK",			/* Bit Position 30	*/
	"TLPXlatBlocked",		/* Bit Position 31	*/
};
if (aer_severity != AER_CORRECTABLE)
aer_printk(info.level, dev, "aer_uncor_severity: 0x%08x\n",
aer->uncor_severity);
if (tlp_header_valid)
pcie_print_tlp_log(dev, &aer->header_log, info.level,
dev_fmt(" "));
}
EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
/**
 * add_error_device - list device to be handled
 * @e_info: pointer to error info
 * @dev: pointer to pci_dev to be added
 *
 * Return: 0 on success, -ENOSPC when @e_info already lists
 * AER_MAX_MULTI_ERR_DEVICES devices.
 */
static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
{
	int i = e_info->error_dev_num;

	if (i >= AER_MAX_MULTI_ERR_DEVICES)
		return -ENOSPC;

	/*
	 * Actually record the device: consumers iterate e_info->dev[] up to
	 * e_info->error_dev_num, so without these two statements nothing is
	 * ever listed.
	 *
	 * NOTE(review): pci_dev_get() takes a reference on @dev; the code
	 * that finishes handling each e_info->dev[] entry is expected to
	 * drop it with pci_dev_put() - confirm against the consumer.
	 */
	e_info->dev[i] = pci_dev_get(dev);
	e_info->error_dev_num++;

	/*
	 * Ratelimit AER log messages.  "dev" is either the source
	 * identified by the root's Error Source ID or it has an unmasked
	 * error logged in its own AER Capability.  Messages are emitted
	 * when "ratelimit_print[i]" is non-zero.  If we will print detail
	 * for a downstream device, make sure we print the Error Source ID
	 * from the root as well.
	 */
	if (aer_ratelimit(dev, e_info->severity)) {
		e_info->ratelimit_print[i] = 1;
		e_info->root_ratelimit_print = 1;
	}

	return 0;
}
/** * is_error_source - check whether the device is source of reported error * @dev: pointer to pci_dev to be checked * @e_info: pointer to reported error info
*/ staticbool is_error_source(struct pci_dev *dev, struct aer_err_info *e_info)
{ int aer = dev->aer_cap;
u32 status, mask;
u16 reg16;
/* * When bus ID is equal to 0, it might be a bad ID * reported by Root Port.
*/ if ((PCI_BUS_NUM(e_info->id) != 0) &&
!(dev->bus->bus_flags & PCI_BUS_FLAGS_NO_AERSID)) { /* Device ID match? */ if (e_info->id == pci_dev_id(dev)) returntrue;
/* Continue ID comparing if there is no multiple error */ if (!e_info->multi_error_valid) returnfalse;
}
/* * When either * 1) bus ID is equal to 0. Some ports might lose the bus * ID of error source id; * 2) bus flag PCI_BUS_FLAGS_NO_AERSID is set * 3) There are multiple errors and prior ID comparing fails; * We check AER status registers to find possible reporter.
*/ if (atomic_read(&dev->enable_cnt) == 0) returnfalse;
/* Check if AER is enabled */
pcie_capability_read_word(dev, PCI_EXP_DEVCTL, ®16); if (!(reg16 & PCI_EXP_AER_FLAGS)) returnfalse;
if (!aer) returnfalse;
/* Check if error is recorded */ if (e_info->severity == AER_CORRECTABLE) {
pci_read_config_dword(dev, aer + PCI_ERR_COR_STATUS, &status);
pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK, &mask);
} else {
pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &status);
pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, &mask);
} if (status & ~mask) returntrue;
if (is_error_source(dev, e_info)) { /* List this device */ if (add_error_device(e_info, dev)) { /* We cannot handle more... Stop iteration */
pci_err(dev, "Exceeded max supported (%d) devices with errors logged\n",
AER_MAX_MULTI_ERR_DEVICES); return 1;
}
/* If there is only a single error, stop iteration */ if (!e_info->multi_error_valid) return 1;
} return 0;
}
/** * find_source_device - search through device hierarchy for source device * @parent: pointer to Root Port pci_dev data structure * @e_info: including detailed error information such as ID * * Return: true if found. * * Invoked by DPC when error is detected at the Root Port. * Caller of this function must set id, severity, and multi_error_valid of * struct aer_err_info pointed by @e_info properly. This function must fill * e_info->error_dev_num and e_info->dev[], based on the given information.
*/ staticbool find_source_device(struct pci_dev *parent, struct aer_err_info *e_info)
{ struct pci_dev *dev = parent; int result;
/* Must reset in this function */
e_info->error_dev_num = 0;
/* Is Root Port an agent that sends error message? */
result = find_device_iter(dev, e_info); if (result) returntrue;
if (!e_info->error_dev_num) returnfalse; returntrue;
}
#ifdef CONFIG_PCIEAER_CXL
/** * pci_aer_unmask_internal_errors - unmask internal errors * @dev: pointer to the pci_dev data structure * * Unmask internal errors in the Uncorrectable and Correctable Error * Mask registers. * * Note: AER must be enabled and supported by the device which must be * checked in advance, e.g. with pcie_aer_is_native().
*/ staticvoid pci_aer_unmask_internal_errors(struct pci_dev *dev)
{ int aer = dev->aer_cap;
u32 mask;
staticbool is_cxl_mem_dev(struct pci_dev *dev)
{ /* * The capability, status, and control fields in Device 0, * Function 0 DVSEC control the CXL functionality of the * entire device (CXL 3.0, 8.1.3).
*/ if (dev->devfn != PCI_DEVFN(0, 0)) returnfalse;
/* * CXL Memory Devices must have the 502h class code set (CXL * 3.0, 8.1.12.1).
*/ if ((dev->class >> 8) != PCI_CLASS_MEMORY_CXL) returnfalse;
if (info->severity == AER_CORRECTABLE) { if (err_handler->cor_error_detected)
err_handler->cor_error_detected(dev);
} elseif (err_handler->error_detected) { if (info->severity == AER_NONFATAL)
err_handler->error_detected(dev, pci_channel_io_normal); elseif (info->severity == AER_FATAL)
err_handler->error_detected(dev, pci_channel_io_frozen);
}
out:
device_unlock(&dev->dev); return 0;
}
/*
 * cxl_rch_handle_error - forward RCEC internal errors to associated devices
 * @dev: device that reported the error
 * @info: error information
 *
 * Only acts when @dev is a Root Complex Event Collector and
 * is_internal_error() is true for @info; in that case
 * cxl_rch_handle_error_iter() is run via pcie_walk_rcec().
 */
static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info)
{
	/*
	 * Internal errors of an RCEC indicate an AER error in an
	 * RCH's downstream port. Check and handle them in the CXL.mem
	 * device driver.
	 */
	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC &&
	    is_internal_error(info))
		pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info);
}
/** * pci_aer_handle_error - handle logging error into an event log * @dev: pointer to pci_dev data structure of error source device * @info: comprehensive error information * * Invoked when an error being detected by Root Port.
*/ staticvoid pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info)
{ int aer = dev->aer_cap;
if (info->severity == AER_CORRECTABLE) { /* * Correctable error does not need software intervention. * No need to go through error recovery process.
*/ if (aer)
pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS,
info->status); if (pcie_aer_is_native(dev)) { struct pci_driver *pdrv = dev->driver;
while (kfifo_get(&aer_recover_ring, &entry)) {
pdev = pci_get_domain_bus_and_slot(entry.domain, entry.bus,
entry.devfn); if (!pdev) {
pr_err_ratelimited("%04x:%02x:%02x.%x: no pci_dev found\n",
entry.domain, entry.bus,
PCI_SLOT(entry.devfn),
PCI_FUNC(entry.devfn)); continue;
}
pci_print_aer(pdev, entry.severity, entry.regs);
/* * Memory for aer_capability_regs(entry.regs) is being * allocated from the ghes_estatus_pool to protect it from * overwriting when multiple sections are present in the * error status. Thus free the same after processing the * data.
*/
ghes_estatus_pool_region_free((unsignedlong)entry.regs, sizeof(struct aer_capability_regs));
/*
 * Mutual exclusion for writers of aer_recover_ring. The reader side does
 * not need the lock: there is only one reader, and no lock is needed
 * between the reader and a writer.
 */
static DEFINE_SPINLOCK(aer_recover_ring_lock);
/* Work item whose handler is aer_recover_work_func(). */
static DECLARE_WORK(aer_recover_work, aer_recover_work_func);
if (kfifo_in_spinlocked(&aer_recover_ring, &entry, 1,
&aer_recover_ring_lock))
schedule_work(&aer_recover_work); else
pr_err("buffer overflow in recovery for %04x:%02x:%02x.%x\n",
domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
}
EXPORT_SYMBOL_GPL(aer_recover_queue); #endif
/** * aer_get_device_error_info - read error status from dev and store it to info * @info: pointer to structure to store the error record * @i: index into info->dev[] * * Return: 1 on success, 0 on error. * * Note that @info is reused among all error devices. Clear fields properly.
*/ int aer_get_device_error_info(struct aer_err_info *info, int i)
{ struct pci_dev *dev; int type, aer;
u32 aercc;
if (i >= AER_MAX_MULTI_ERR_DEVICES) return 0;
dev = info->dev[i];
aer = dev->aer_cap;
type = pci_pcie_type(dev);
/* Must reset in this function */
info->status = 0;
info->tlp_header_valid = 0;
/* The device might not support AER */ if (!aer) return 0;
if (info->severity == AER_CORRECTABLE) {
pci_read_config_dword(dev, aer + PCI_ERR_COR_STATUS,
&info->status);
pci_read_config_dword(dev, aer + PCI_ERR_COR_MASK,
&info->mask); if (!(info->status & ~info->mask)) return 0;
} elseif (type == PCI_EXP_TYPE_ROOT_PORT ||
type == PCI_EXP_TYPE_RC_EC ||
type == PCI_EXP_TYPE_DOWNSTREAM ||
info->severity == AER_NONFATAL) {
/* Link is still healthy for IO reads */
pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS,
&info->status);
pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_MASK,
&info->mask); if (!(info->status & ~info->mask)) return 0;
/* Get First Error Pointer */
pci_read_config_dword(dev, aer + PCI_ERR_CAP, &aercc);
info->first_error = PCI_ERR_CAP_FEP(aercc);
/*
 * aer_process_err_devices - report, then handle, every listed error device
 * @e_info: error info whose dev[] / error_dev_num have been filled in
 *
 * Makes two passes over e_info->dev[], each stopping at error_dev_num or at
 * the first NULL entry. Error info is re-read per device in both passes via
 * aer_get_device_error_info(); entries for which it returns 0 are skipped.
 */
static inline void aer_process_err_devices(struct aer_err_info *e_info)
{
	int i;

	/* Report all before handling them, to not lose records by reset etc. */
	for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
		if (aer_get_device_error_info(e_info, i))
			aer_print_error(e_info, i);
	}
	for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
		if (aer_get_device_error_info(e_info, i))
			handle_error_source(e_info->dev[i], e_info);
	}
}
/**
 * aer_isr_one_error_type - consume a Correctable or Uncorrectable Error
 *			    detected by Root Port or RCEC
 * @root: pointer to Root Port or RCEC that signaled AER interrupt
 * @info: pointer to AER error info
 */
static void aer_isr_one_error_type(struct pci_dev *root,
				   struct aer_err_info *info)
{
	bool found;

	found = find_source_device(root, info);

	/*
	 * If we're going to log error messages, we've already set
	 * "info->root_ratelimit_print" and "info->ratelimit_print[i]" to
	 * non-zero (which enables printing) because this is either an
	 * ERR_FATAL or we found a device with an error logged in its AER
	 * Capability.
	 *
	 * If we didn't find the Error Source device, at least log the
	 * Requester ID from the ERR_* Message received by the Root Port or
	 * RCEC, ratelimited by the RP or RCEC.
	 */
	if (info->root_ratelimit_print ||
	    (!found && aer_ratelimit(root, info->severity)))
		aer_print_source(root, info, found);

	if (found)
		aer_process_err_devices(info);
}
/**
 * aer_isr_one_error - consume error(s) signaled by an AER interrupt from
 *		       Root Port or RCEC
 * @root: pointer to Root Port or RCEC that signaled AER interrupt
 * @e_src: pointer to an error source
 *
 * Builds a separate struct aer_err_info for the correctable and the
 * uncorrectable part of @e_src->status and processes each in turn.
 */
static void aer_isr_one_error(struct pci_dev *root,
			      struct aer_err_source *e_src)
{
	u32 status = e_src->status;

	pci_rootport_aer_stats_incr(root, e_src);

	/*
	 * There is a possibility that both correctable error and
	 * uncorrectable error being logged. Report correctable error first.
	 */
	if (status & PCI_ERR_ROOT_COR_RCV) {
		int multi = status & PCI_ERR_ROOT_MULTI_COR_RCV;
		struct aer_err_info e_info = {
			.id = ERR_COR_ID(e_src->id),
			.severity = AER_CORRECTABLE,
			.level = KERN_WARNING,
			.multi_error_valid = multi ? 1 : 0,
		};

		aer_isr_one_error_type(root, &e_info);
	}

	if (status & PCI_ERR_ROOT_UNCOR_RCV) {
		int fatal = status & PCI_ERR_ROOT_FATAL_RCV;
		int multi = status & PCI_ERR_ROOT_MULTI_UNCOR_RCV;
		struct aer_err_info e_info = {
			.id = ERR_UNCOR_ID(e_src->id),
			.severity = fatal ? AER_FATAL : AER_NONFATAL,
			.level = KERN_ERR,
			.multi_error_valid = multi ? 1 : 0,
		};

		aer_isr_one_error_type(root, &e_info);
	}
}
/** * aer_isr - consume errors detected by Root Port * @irq: IRQ assigned to Root Port * @context: pointer to Root Port data structure * * Invoked, as DPC, when Root Port records new detected error
*/ static irqreturn_t aer_isr(int irq, void *context)
{ struct pcie_device *dev = (struct pcie_device *)context; struct aer_rpc *rpc = get_service_data(dev); struct aer_err_source e_src;
if (kfifo_is_empty(&rpc->aer_fifo)) return IRQ_NONE;
while (kfifo_get(&rpc->aer_fifo, &e_src))
aer_isr_one_error(rpc->rpd, &e_src); return IRQ_HANDLED;
}
/** * aer_irq - Root Port's ISR * @irq: IRQ assigned to Root Port * @context: pointer to Root Port data structure * * Invoked when Root Port detects AER messages.
*/ static irqreturn_t aer_irq(int irq, void *context)
{ struct pcie_device *pdev = (struct pcie_device *)context; struct aer_rpc *rpc = get_service_data(pdev); struct pci_dev *rp = rpc->rpd; int aer = rp->aer_cap; struct aer_err_source e_src = {};
/** * aer_root_reset - reset Root Port hierarchy, RCEC, or RCiEP * @dev: pointer to Root Port, RCEC, or RCiEP * * Invoked by Port Bus driver when performing reset.
*/ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
{ int type = pci_pcie_type(dev); struct pci_dev *root; int aer; struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
u32 reg32; int rc;
/* * Only Root Ports and RCECs have AER Root Command and Root Status * registers. If "dev" is an RCiEP, the relevant registers are in * the RCEC.
*/ if (type == PCI_EXP_TYPE_RC_END)
root = dev->rcec; else
root = pcie_find_root_port(dev);
/* * If the platform retained control of AER, an RCiEP may not have * an RCEC visible to us, so dev->rcec ("root") may be NULL. In * that case, firmware is responsible for these registers.
*/
aer = root ? root->aer_cap : 0;
if ((host->native_aer || pcie_ports_native) && aer)
aer_disable_irq(root);
if (type == PCI_EXP_TYPE_RC_EC || type == PCI_EXP_TYPE_RC_END) {
rc = pcie_reset_flr(dev, PCI_RESET_DO_RESET); if (!rc)
pci_info(dev, "has been reset\n"); else
pci_info(dev, "not reset (no FLR support: %d)\n", rc);
} else {
rc = pci_bus_error_reset(dev);
pci_info(dev, "%s Port link has been reset (%d)\n",
pci_is_root_bus(dev->bus) ? "Root" : "Downstream", rc);
}
/**
 * pcie_aer_init - register AER service driver
 *
 * Invoked when AER service driver is loaded.
 *
 * Return: -ENXIO when pci_aer_available() reports false, otherwise the
 * result of pcie_port_service_register().
 */
int __init pcie_aer_init(void)
{
	if (pci_aer_available())
		return pcie_port_service_register(&aerdriver);

	return -ENXIO;
}
Messung V0.5
¤ Dauer der Verarbeitung: 0.19 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.