/**
 * DOC: cxl core pci
 *
 * Compute Express Link protocols are layered on top of PCIe. CXL core provides
 * a set of helpers for CXL interactions which occur via PCIe.
 */
/*
 * Seconds to wait for a device to report media ready; runtime tunable
 * via the module parameter (mode 0644).
 */
static unsigned short media_ready_timeout = 60;
module_param(media_ready_timeout, ushort, 0644);
MODULE_PARM_DESC(media_ready_timeout, "seconds to wait for media ready");
/*
 * Context carried across a bus walk while enumerating downstream ports.
 * (Presumably filled in by a pci_walk_bus() callback — the walk itself is
 * outside this chunk.)
 */
struct cxl_walk_context {
	struct pci_bus *bus;	/* PCI bus being inspected */
	struct cxl_port *port;	/* CXL port the dports belong to */
	int type;		/* expected PCI_EXP_TYPE_* of matching ports */
	int error;		/* first error encountered during the walk */
	int count;		/* number of dports enumerated so far */
};
/**
 * devm_cxl_port_enumerate_dports - enumerate downstream ports of the upstream port
 * @port: cxl_port whose ->uport_dev is the upstream of dports to be enumerated
 *
 * Returns a positive number of dports enumerated or a negative error
 * code.
 */
int devm_cxl_port_enumerate_dports(struct cxl_port *port)
{
	struct pci_bus *bus = cxl_port_to_pci_bus(port);
	struct cxl_walk_context ctx;
	int type;

	/* No PCI bus backing this port: nothing to enumerate. */
	if (!bus)
		return -ENXIO;

	/* Root buses host Root Ports; otherwise expect Switch Downstream Ports. */
	if (pci_is_root_bus(bus))
		type = PCI_EXP_TYPE_ROOT_PORT;
	else
		type = PCI_EXP_TYPE_DOWNSTREAM;

	/*
	 * NOTE(review): 'ctx' is read below without ever being initialized,
	 * and 'type' is computed but never consumed. The bus walk that
	 * populates ctx (and uses type) appears to be missing from this
	 * chunk -- confirm against the full file before changing anything.
	 */
	if (ctx.count == 0)
		return -ENODEV;
	if (ctx.error)
		return ctx.error;
	return ctx.count;
}
EXPORT_SYMBOL_NS_GPL(devm_cxl_port_enumerate_dports, "CXL");
/*
 * Poll the DVSEC CXL Range Size Low register for range @id until the
 * MEM_INFO_VALID bit is observed. Two reads are made, one second apart,
 * before giving up with -ETIMEDOUT.
 */
static int cxl_dvsec_mem_range_valid(struct cxl_dev_state *cxlds, int id)
{
	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
	int d = cxlds->cxl_dvsec;
	bool valid = false;
	int rc, attempt;
	u32 temp;

	if (id > CXL_DVSEC_RANGE_MAX)
		return -EINVAL;

	/* Check MEM INFO VALID bit first, give up after 1s */
	for (attempt = 0; attempt < 2; attempt++) {
		rc = pci_read_config_dword(pdev,
					   d + CXL_DVSEC_RANGE_SIZE_LOW(id),
					   &temp);
		if (rc)
			return rc;

		valid = FIELD_GET(CXL_DVSEC_MEM_INFO_VALID, temp);
		if (valid)
			break;
		msleep(1000);
	}

	if (!valid) {
		dev_err(&pdev->dev,
			"Timeout awaiting memory range %d valid after 1s.\n",
			id);
		return -ETIMEDOUT;
	}

	return 0;
}
/*
 * Poll the DVSEC CXL Range Size Low register for range @id until the
 * MEM_ACTIVE bit is observed, one read per second for up to
 * @media_ready_timeout seconds (60 by default).
 */
static int cxl_dvsec_mem_range_active(struct cxl_dev_state *cxlds, int id)
{
	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
	int d = cxlds->cxl_dvsec;
	bool active = false;
	int remaining = media_ready_timeout;
	int rc;
	u32 temp;

	if (id > CXL_DVSEC_RANGE_MAX)
		return -EINVAL;

	/* Check MEM ACTIVE bit, up to 60s timeout by default */
	while (remaining) {
		rc = pci_read_config_dword(pdev,
					   d + CXL_DVSEC_RANGE_SIZE_LOW(id),
					   &temp);
		if (rc)
			return rc;

		active = FIELD_GET(CXL_DVSEC_MEM_ACTIVE, temp);
		if (active)
			break;
		msleep(1000);
		remaining--;
	}

	if (!active) {
		dev_err(&pdev->dev,
			"timeout awaiting memory active after %d seconds\n",
			media_ready_timeout);
		return -ETIMEDOUT;
	}

	return 0;
}
/*
 * Wait up to @media_ready_timeout for the device to report memory
 * active.
 */
int cxl_await_media_ready(struct cxl_dev_state *cxlds)
{
	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
	int d = cxlds->cxl_dvsec;
	int rc, i, hdm_count;
	u64 md_status;
	u16 cap;

	/* Read the DVSEC capability to learn how many HDM ranges exist. */
	rc = pci_read_config_word(pdev,
				  d + CXL_DVSEC_CAP_OFFSET, &cap);
	if (rc)
		return rc;

	hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap);

	/* First wait for each range's MEM_INFO_VALID ... */
	for (i = 0; i < hdm_count; i++) {
		rc = cxl_dvsec_mem_range_valid(cxlds, i);
		if (rc)
			return rc;
	}

	/* ... then for each range to report MEM_ACTIVE. */
	for (i = 0; i < hdm_count; i++) {
		rc = cxl_dvsec_mem_range_active(cxlds, i);
		if (rc)
			return rc;
	}

	/* Finally confirm the memory device status register reports ready. */
	md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
	if (!CXLMDEV_READY(md_status))
		return -EIO;
	/*
	 * NOTE(review): the success return and closing brace of this
	 * function appear to be missing from this chunk.
	 */
/*
 * Decode the DVSEC CXL Range registers into @info. Returns 0 on success
 * (including the mem-disabled case) or a negative error code.
 */
int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds,
			struct cxl_endpoint_dvsec_info *info)
{
	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
	struct device *dev = cxlds->dev;
	int hdm_count, rc, i, ranges = 0;
	int d = cxlds->cxl_dvsec;
	u16 cap, ctrl;

	if (!d) {
		dev_dbg(dev, "No DVSEC Capability\n");
		return -ENXIO;
	}

	rc = pci_read_config_word(pdev, d + CXL_DVSEC_CAP_OFFSET, &cap);
	if (rc)
		return rc;

	if (!(cap & CXL_DVSEC_MEM_CAPABLE)) {
		dev_dbg(dev, "Not MEM Capable\n");
		return -ENXIO;
	}

	/*
	 * It is not allowed by spec for MEM.capable to be set and have 0 legacy
	 * HDM decoders (values > 2 are also undefined as of CXL 2.0). As this
	 * driver is for a spec defined class code which must be CXL.mem
	 * capable, there is no point in continuing to enable CXL.mem.
	 */
	hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap);
	if (!hdm_count || hdm_count > 2)
		return -EINVAL;

	/*
	 * The current DVSEC values are moot if the memory capability is
	 * disabled, and they will remain moot after the HDM Decoder
	 * capability is enabled.
	 */
	rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl);
	if (rc)
		return rc;

	info->mem_enabled = FIELD_GET(CXL_DVSEC_MEM_ENABLE, ctrl);
	if (!info->mem_enabled)
		return 0;

	for (i = 0; i < hdm_count; i++) {
		u64 base, size;
		u32 temp;

		rc = cxl_dvsec_mem_range_valid(cxlds, i);
		if (rc)
			return rc;

		/* Assemble the 64-bit size from high/low dwords. */
		rc = pci_read_config_dword(
			pdev, d + CXL_DVSEC_RANGE_SIZE_HIGH(i), &temp);
		if (rc)
			return rc;

		size = (u64)temp << 32;

		rc = pci_read_config_dword(
			pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(i), &temp);
		if (rc)
			return rc;

		size |= temp & CXL_DVSEC_MEM_SIZE_LOW_MASK;
		/* A zero-sized range carries no capacity; skip it. */
		if (!size) {
			continue;
		}

		/* Assemble the 64-bit base from high/low dwords. */
		rc = pci_read_config_dword(
			pdev, d + CXL_DVSEC_RANGE_BASE_HIGH(i), &temp);
		if (rc)
			return rc;

		base = (u64)temp << 32;

		rc = pci_read_config_dword(
			pdev, d + CXL_DVSEC_RANGE_BASE_LOW(i), &temp);
		if (rc)
			return rc;
	/*
	 * NOTE(review): truncated here -- the low-base OR-in, the
	 * recording of the decoded range into @info, the 'ranges'
	 * accounting, the success return, and the closing brace appear
	 * to be missing from this chunk ('base' and 'ranges' are
	 * otherwise unused as shown).
	 */
/**
 * cxl_hdm_decode_init() - Setup HDM decoding for the endpoint
 * @cxlds: Device state
 * @cxlhdm: Mapped HDM decoder Capability
 * @info: Cached DVSEC range registers info
 *
 * Try to enable the endpoint's HDM Decoder Capability
 */
int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
			struct cxl_endpoint_dvsec_info *info)
{
	void __iomem *hdm = cxlhdm->regs.hdm_decoder;
	struct cxl_port *port = cxlhdm->port;
	struct device *dev = cxlds->dev;
	struct cxl_port *root;
	int i, rc, allowed;
	u32 global_ctrl = 0;

	if (hdm)
		global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET);

	/*
	 * If the HDM Decoder Capability is already enabled then assume
	 * that some other agent like platform firmware set it up.
	 */
	if (global_ctrl & CXL_HDM_DECODER_ENABLE || (!hdm && info->mem_enabled))
		return devm_cxl_enable_mem(&port->dev, cxlds);

	/*
	 * If the HDM Decoder Capability does not exist and DVSEC was
	 * not setup, the DVSEC based emulation cannot be used.
	 */
	if (!hdm)
		return -ENODEV;

	/* The HDM Decoder Capability exists but is globally disabled. */

	/*
	 * If the DVSEC CXL Range registers are not enabled, just
	 * enable and use the HDM Decoder Capability registers.
	 */
	if (!info->mem_enabled) {
		rc = devm_cxl_enable_hdm(&port->dev, cxlhdm);
		if (rc)
			return rc;

		return devm_cxl_enable_mem(&port->dev, cxlds);
	}

	/*
	 * Per CXL 2.0 Section 8.1.3.8.3 and 8.1.3.8.4 DVSEC CXL Range 1 Base
	 * [High,Low] when HDM operation is enabled the range register values
	 * are ignored by the device, but the spec also recommends matching the
	 * DVSEC Range 1,2 to HDM Decoder Range 0,1. So, non-zero info->ranges
	 * are expected even though Linux does not require or maintain that
	 * match. Check if at least one DVSEC range is enabled and allowed by
	 * the platform. That is, the DVSEC range must be covered by a locked
	 * platform window (CFMWS). Fail otherwise as the endpoint's decoders
	 * cannot be used.
	 */
	/* Walk up the port topology to the CXL root. */
	root = to_cxl_port(port->dev.parent);
	while (!is_cxl_root(root) && is_cxl_port(root->dev.parent))
		root = to_cxl_port(root->dev.parent);
	if (!is_cxl_root(root)) {
		dev_err(dev, "Failed to acquire root port for HDM enable\n");
		return -ENODEV;
	}

	/* Count DVSEC ranges covered by a platform decoder under the root. */
	for (i = 0, allowed = 0; i < info->ranges; i++) {
		struct device *cxld_dev;

		cxld_dev = device_find_child(&root->dev, &info->dvsec_range[i],
					     dvsec_range_allowed);
		if (!cxld_dev) {
			dev_dbg(dev, "DVSEC Range%d denied by platform\n", i);
			continue;
		}
		dev_dbg(dev, "DVSEC Range%d allowed by platform\n", i);
		/* device_find_child() took a reference; drop it. */
		put_device(cxld_dev);
		allowed++;
	}

	if (!allowed) {
		dev_err(dev, "Range register decodes outside platform defined CXL ranges.\n");
		return -ENXIO;
	}
	/*
	 * NOTE(review): truncated here -- the success path and closing
	 * brace of this function appear to be missing from this chunk.
	 */
	/*
	 * NOTE(review): orphaned fragment -- the enclosing function's
	 * signature and the declarations of 'sum', 'i', 'data', and
	 * 'size' are missing from this chunk. As shown, it sums 'size'
	 * elements of 'data' and returns the total (likely a checksum
	 * helper).
	 */
	for (sum = 0, i = 0; i < size; i++)
		sum += data[i];
	return sum;
}
/**
 * read_cdat_data - Read the CDAT data on this port
 * @port: Port to read data from
 *
 * This call will sleep waiting for responses from the DOE mailbox.
 */
void read_cdat_data(struct cxl_port *port)
{
	struct device *uport = port->uport_dev;
	struct device *dev = &port->dev;
	struct pci_doe_mb *doe_mb;
	struct pci_dev *pdev = NULL;
	struct cxl_memdev *cxlmd;
	struct cdat_doe_rsp *buf;
	size_t table_length, length;
	int rc;

	/*
	 * NOTE(review): 'pdev' is still NULL when passed to
	 * pci_find_doe_mailbox() below, and 'uport'/'cxlmd' are unused as
	 * shown -- the code that derives pdev from the uport device
	 * appears to be missing from this chunk.
	 */
	doe_mb = pci_find_doe_mailbox(pdev, PCI_VENDOR_ID_CXL,
				      CXL_DOE_PROTOCOL_TABLE_ACCESS);
	if (!doe_mb) {
		dev_dbg(dev, "No CDAT mailbox\n");
		return;
	}

	port->cdat_available = true;

	if (cxl_cdat_get_length(dev, doe_mb, &length)) {
		dev_dbg(dev, "No CDAT length\n");
		return;
	}

	/*
	 * The begin of the CDAT buffer needs space for additional 4
	 * bytes for the DOE header. Table data starts afterwards.
	 */
	buf = devm_kzalloc(dev, sizeof(*buf) + length, GFP_KERNEL);
	if (!buf)
		goto err;

	table_length = length;

	rc = cxl_cdat_read_table(dev, doe_mb, buf, &length);
	if (rc)
		goto err;
	/*
	 * NOTE(review): truncated here -- the success path, the 'err'
	 * label targeted by the gotos above, and the closing brace
	 * appear to be missing from this chunk.
	 */
	/*
	 * NOTE(review): orphaned fragment -- the enclosing function's
	 * signature and the declarations of 'i', 'log_u32_size',
	 * 'log_addr', and 'addr' are missing from this chunk. As shown,
	 * it copies log_u32_size 32-bit words from MMIO at 'addr' into
	 * 'log_addr' using readl() (presumably a header-log copy helper).
	 */
	for (i = 0; i < log_u32_size; i++) {
		*log_addr = readl(addr);
		log_addr++;
		addr += sizeof(u32);
	}
}
/*
 * Log the state of the RAS status registers and prepare them to log the
 * next error status. Return 1 if reset needed.
 */
static bool __cxl_handle_ras(struct cxl_dev_state *cxlds,
			     void __iomem *ras_base)
{
	u32 hl[CXL_HEADERLOG_SIZE_U32];
	void __iomem *addr;
	u32 status;
	u32 fe;

	if (!ras_base)
		return false;

	addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
	status = readl(addr);
	/* No uncorrectable error bits set: nothing to handle. */
	if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
		return false;

	/* If multiple errors, log header points to first error from ctrl reg */
	if (hweight32(status) > 1) {
		void __iomem *rcc_addr =
			ras_base + CXL_RAS_CAP_CONTROL_OFFSET;

		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
				   readl(rcc_addr)));
	} else {
		fe = status;
	}
	/*
	 * NOTE(review): truncated here -- the header-log copy (likely
	 * the consumer of 'hl' and 'fe'), the status clear, the return,
	 * and the closing brace appear to be missing from this chunk.
	 */
/*
 * Copy the AER capability registers using 32 bit read accesses.
 * This is necessary because RCRB AER capability is MMIO mapped. Clear the
 * status after copying.
 *
 * @aer_base: base address of AER capability block in RCRB
 * @aer_regs: destination for copying AER capability
 */
static bool cxl_rch_get_aer_info(void __iomem *aer_base,
				 struct aer_capability_regs *aer_regs)
{
	int read_cnt = sizeof(struct aer_capability_regs) / sizeof(u32);
	u32 *aer_regs_buf = (u32 *)aer_regs;
	int n;

	if (!aer_base)
		return false;

	/* Use readl() to guarantee 32-bit accesses */
	for (n = 0; n < read_cnt; n++)
		aer_regs_buf[n] = readl(aer_base + n * sizeof(u32));
	/*
	 * NOTE(review): truncated here -- the status clear promised in
	 * the header comment, the success return, and the closing brace
	 * appear to be missing from this chunk.
	 */
/* Get AER severity. Return false if there is no error. */
static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs,
				     int *severity)
{
	/* Unmasked uncorrectable error: classify as fatal or non-fatal. */
	if (aer_regs->uncor_status & ~aer_regs->uncor_mask) {
		if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV)
			*severity = AER_FATAL;
		else
			*severity = AER_NONFATAL;
		return true;
	}

	/* Unmasked correctable error. */
	if (aer_regs->cor_status & ~aer_regs->cor_mask) {
		*severity = AER_CORRECTABLE;
		return true;
	}
	/*
	 * NOTE(review): the 'return false' fall-through and closing
	 * brace appear to be missing from this chunk.
	 */
	/*
	 * NOTE(review): fragment of cxl_error_detected() (per the
	 * EXPORT_SYMBOL below) -- the function signature and the
	 * declarations of 'cxlds', 'dev', 'pdev', 'ue', and 'state' are
	 * missing from this chunk, as is the context for the stray
	 * closing brace after the cxl_handle_endpoint_ras() call.
	 */
	if (cxlds->rcd)
		cxl_handle_rdport_errors(cxlds);
	/*
	 * A frozen channel indicates an impending reset which is fatal to
	 * CXL.mem operation, and will likely crash the system. On the off
	 * chance the situation is recoverable dump the status of the RAS
	 * capability registers and bounce the active state of the memdev.
	 */
	ue = cxl_handle_endpoint_ras(cxlds);
	}

	switch (state) {
	case pci_channel_io_normal:
		/* Uncorrectable error: force driver rebind via reset. */
		if (ue) {
			device_release_driver(dev);
			return PCI_ERS_RESULT_NEED_RESET;
		}
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
		dev_warn(&pdev->dev,
			 "%s: frozen state error detected, disable CXL.mem\n",
			 dev_name(dev));
		device_release_driver(dev);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		dev_warn(&pdev->dev,
			 "failure state error detected, request disconnect\n");
		return PCI_ERS_RESULT_DISCONNECT;
	}
	return PCI_ERS_RESULT_NEED_RESET;
}
EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
/*
 * Flit size in bytes for latency calculation: 256B when
 * cxl_pci_flit_256() reports so, legacy 68B otherwise.
 */
static int cxl_flit_size(struct pci_dev *pdev)
{
	return cxl_pci_flit_256(pdev) ? 256 : 68;
}
/**
 * cxl_pci_get_latency - calculate the link latency for the PCIe link
 * @pdev: PCI device
 *
 * return: calculated latency or 0 for no latency
 *
 * CXL Memory Device SW Guide v1.0 2.11.4 Link latency calculation
 * Link latency = LinkPropagationLatency + FlitLatency + RetimerLatency
 * LinkProgationLatency is negligible, so 0 will be used
 * RetimerLatency is assumed to be negligible and 0 will be used
 * FlitLatency = FlitSize / LinkBandwidth
 * FlitSize is defined by spec. CXL rev3.0 4.2.1.
 * 68B flit is used up to 32GT/s. >32GT/s, 256B flit size is used.
 * The FlitLatency is converted to picoseconds.
 */
long cxl_pci_get_latency(struct pci_dev *pdev)
{
	long bw;

	/*
	 * NOTE(review): 'bw' is never assigned and the loop below over
	 * 'c[i]' (undeclared here) does not match this function's
	 * documented latency computation -- it looks like a fragment of
	 * a different (bandwidth-filling) function was spliced in and
	 * the real body is missing from this chunk.
	 */
	for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
		c[i].read_bandwidth = bw;
		c[i].write_bandwidth = bw;
	}

	return 0;
}
/*
 * Set max timeout such that platforms will optimize GPF flow to avoid
 * the implied worst-case scenario delays. On a sane platform, all
 * devices should always complete GPF within the energy budget of
 * the GPF flow. The kernel does not have enough information to pick
 * anything better than "maximize timeouts and hope it works".
 *
 * A misbehaving device could block forward progress of GPF for all
 * the other devices, exhausting the energy budget of the platform.
 * However, the spec seems to assume that moving on from slow to respond
 * devices is a virtue. It is not possible to know that, in actuality,
 * the slow to respond device is *the* most critical device in the
 * system to wait.
 */
#define GPF_TIMEOUT_BASE_MAX 2
#define GPF_TIMEOUT_SCALE_MAX 7 /* 10 seconds */
/*
 * NOTE(review): the following German disclaimer appears to be web-page
 * boilerplate accidentally captured along with this source chunk; it is
 * not part of the driver and should be removed upstream. English
 * translation, preserved for reference:
 * "The information on this website was carefully compiled to the best
 * of our knowledge. However, neither completeness, nor correctness, nor
 * quality of the provided information is guaranteed.
 * Remark: the colored syntax highlighting and the measurement are still
 * experimental."
 */