Mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git (synced 2025-09-18 22:14:16 +00:00)
cxl/ras: Fix CPER handler device confusion
By inspection, cxl_cper_handle_prot_err() makes a series of fragile
assumptions that can lead to crashes:
1/ It assumes that the endpoints identified in the record are CXL type-3
devices; nothing guarantees that.
2/ It assumes that the device is bound to the cxl_pci driver; nothing
guarantees that.
3/ Minor: it holds the device lock over the switch-port tracing for no
reason, as the trace is generated entirely from data in the record.
Correct those by checking that the PCIe endpoint parents a cxl_memdev
before assuming the format of the driver data, and by moving the lock to
where it is required. Consequently, this also makes the implementation
ready for CXL accelerators that are not bound to cxl_pci.
Fixes: 36f257e3b0 ("acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors")
Cc: Terry Bowman <terry.bowman@amd.com>
Cc: Li Ming <ming.li@zohomail.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Reviewed-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Li Ming <ming.li@zohomail.com>
Link: https://patch.msgid.link/20250612192043.2254617-1-dan.j.williams@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
parent a403fe6c0b
commit 3c70ec71ab
1 changed file with 27 additions and 20 deletions
@@ -31,40 +31,38 @@ static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
 					       ras_cap.header_log);
 }
 
-static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
-				  struct cxl_ras_capability_regs ras_cap)
+static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd,
+					 struct cxl_ras_capability_regs ras_cap)
 {
 	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
-	struct cxl_dev_state *cxlds;
 
-	cxlds = pci_get_drvdata(pdev);
-	if (!cxlds)
-		return;
-
-	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+	trace_cxl_aer_correctable_error(cxlmd, status);
 }
 
-static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
-				    struct cxl_ras_capability_regs ras_cap)
+static void
+cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd,
+			       struct cxl_ras_capability_regs ras_cap)
 {
 	u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
-	struct cxl_dev_state *cxlds;
 	u32 fe;
 
-	cxlds = pci_get_drvdata(pdev);
-	if (!cxlds)
-		return;
-
 	if (hweight32(status) > 1)
 		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
 				   ras_cap.cap_control));
 	else
 		fe = status;
 
-	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
+	trace_cxl_aer_uncorrectable_error(cxlmd, status, fe,
 					  ras_cap.header_log);
 }
 
+static int match_memdev_by_parent(struct device *dev, const void *uport)
+{
+	if (is_cxl_memdev(dev) && dev->parent == uport)
+		return 1;
+	return 0;
+}
+
 static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
 {
 	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
@@ -73,13 +71,12 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
 		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
 					    data->prot_err.agent_addr.bus,
 					    devfn);
+	struct cxl_memdev *cxlmd;
 	int port_type;
 
 	if (!pdev)
 		return;
 
-	guard(device)(&pdev->dev);
-
 	port_type = pci_pcie_type(pdev);
 	if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
 	    port_type == PCI_EXP_TYPE_DOWNSTREAM ||
@@ -92,10 +89,20 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
 		return;
 	}
 
+	guard(device)(&pdev->dev);
+	if (!pdev->dev.driver)
+		return;
+
+	struct device *mem_dev __free(put_device) = bus_find_device(
+		&cxl_bus_type, NULL, pdev, match_memdev_by_parent);
+	if (!mem_dev)
+		return;
+
+	cxlmd = to_cxl_memdev(mem_dev);
 	if (data->severity == AER_CORRECTABLE)
-		cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
+		cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap);
 	else
-		cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
+		cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);
 }
 
 static void cxl_cper_prot_err_work_fn(struct work_struct *work)
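Note on the resulting control flow: the new tail of cxl_cper_handle_prot_err() relies on the scope-based cleanup helpers, where guard(device)(&pdev->dev) holds the device lock for the rest of the function (keeping the driver from unbinding while the record is traced) and __free(put_device) drops the reference returned by bus_find_device() when mem_dev goes out of scope. As a rough, illustrative sketch only (not part of the patch), the same logic open-coded with plain driver-core calls would read approximately:

	/*
	 * Illustrative open-coded equivalent of the guard()/__free() usage
	 * above; assumes mem_dev is declared as a plain struct device *.
	 */
	device_lock(&pdev->dev);		/* what guard(device) acquires */
	if (!pdev->dev.driver)
		goto out_unlock;

	/* bus_find_device() returns a referenced device or NULL */
	mem_dev = bus_find_device(&cxl_bus_type, NULL, pdev,
				  match_memdev_by_parent);
	if (!mem_dev)
		goto out_unlock;

	cxlmd = to_cxl_memdev(mem_dev);
	if (data->severity == AER_CORRECTABLE)
		cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap);
	else
		cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);

	put_device(mem_dev);			/* what __free(put_device) releases */
out_unlock:
	device_unlock(&pdev->dev);		/* what guard(device) drops at scope exit */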