linux/drivers/cxl/core/ras.c
Dan Williams 3c70ec71ab cxl/ras: Fix CPER handler device confusion
By inspection, cxl_cper_handle_prot_err() is making a series of fragile
assumptions that can lead to crashes:

1/ It assumes that an endpoint identified in the record is a CXL type-3
   device; nothing guarantees that.

2/ It assumes that the device is bound to the cxl_pci driver; nothing
   guarantees that.

3/ Minor: it holds the device lock over the switch-port tracing for no
   reason, as the trace is generated entirely from data in the record.

Correct those by checking that the PCIe endpoint parents a cxl_memdev
before assuming the format of the driver data, and move the lock to where
it is required. Consequently, this also makes the implementation ready for
CXL accelerators that are not bound to cxl_pci.
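
The lookup the fix relies on is the standard driver-model pattern: walk the
CXL bus with bus_find_device() and match the child whose parent is the
endpoint's struct device, taking a reference on a successful match. A minimal
sketch of that pattern follows; the wrapper name endpoint_to_memdev() is
illustrative only, and the concrete version appears in the file below.

/* Illustrative helper (not part of the patch): map a PCIe endpoint to
 * the cxl_memdev it parents, if one is registered on the CXL bus. */
static int match_memdev_by_parent(struct device *dev, const void *endpoint)
{
	return is_cxl_memdev(dev) && dev->parent == endpoint;
}

static struct cxl_memdev *endpoint_to_memdev(struct pci_dev *pdev)
{
	struct device *dev = bus_find_device(&cxl_bus_type, NULL, &pdev->dev,
					     match_memdev_by_parent);

	if (!dev)
		return NULL;

	/* bus_find_device() took a reference; caller drops it via put_device() */
	return to_cxl_memdev(dev);
}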

Fixes: 36f257e3b0 ("acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors")
Cc: Terry Bowman <terry.bowman@amd.com>
Cc: Li Ming <ming.li@zohomail.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Reviewed-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Li Ming <ming.li@zohomail.com>
Link: https://patch.msgid.link/20250612192043.2254617-1-dan.j.williams@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
2025-06-13 09:02:04 -07:00


// SPDX-License-Identifier: GPL-2.0-only
/* Copyright(c) 2025 AMD Corporation. All rights reserved. */
#include <linux/pci.h>
#include <linux/aer.h>
#include <cxl/event.h>
#include <cxlmem.h>
#include "trace.h"

static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
					      struct cxl_ras_capability_regs ras_cap)
{
	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;

	trace_cxl_port_aer_correctable_error(&pdev->dev, status);
}

static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
						struct cxl_ras_capability_regs ras_cap)
{
	u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
	u32 fe;

	/*
	 * When more than one uncorrectable error is latched, the First
	 * Error Pointer in the RAS Capability Control register selects
	 * which one to report as the primary error.
	 */
	if (hweight32(status) > 1)
		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
				   ras_cap.cap_control));
	else
		fe = status;

	trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
					       ras_cap.header_log);
}

static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd,
					 struct cxl_ras_capability_regs ras_cap)
{
	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;

	trace_cxl_aer_correctable_error(cxlmd, status);
}

static void
cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd,
			       struct cxl_ras_capability_regs ras_cap)
{
	u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
	u32 fe;

	/* As above, use the First Error Pointer when multiple errors are latched */
	if (hweight32(status) > 1)
		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
				   ras_cap.cap_control));
	else
		fe = status;

	trace_cxl_aer_uncorrectable_error(cxlmd, status, fe,
					  ras_cap.header_log);
}

/* Match a cxl_memdev whose parent is the given PCIe endpoint device */
static int match_memdev_by_parent(struct device *dev, const void *uport)
{
	if (is_cxl_memdev(dev) && dev->parent == uport)
		return 1;
	return 0;
}

static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
{
	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
				       data->prot_err.agent_addr.function);
	struct pci_dev *pdev __free(pci_dev_put) =
		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
					    data->prot_err.agent_addr.bus,
					    devfn);
	struct cxl_memdev *cxlmd;
	int port_type;

	if (!pdev)
		return;

	/*
	 * Switch and root ports are traced purely from record data, so no
	 * device lock or driver state is needed for them.
	 */
	port_type = pci_pcie_type(pdev);
	if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
	    port_type == PCI_EXP_TYPE_DOWNSTREAM ||
	    port_type == PCI_EXP_TYPE_UPSTREAM) {
		if (data->severity == AER_CORRECTABLE)
			cxl_cper_trace_corr_port_prot_err(pdev, data->ras_cap);
		else
			cxl_cper_trace_uncorr_port_prot_err(pdev, data->ras_cap);

		return;
	}

	/*
	 * Endpoint handling depends on driver state; hold the device lock
	 * so the driver cannot detach during the lookup and trace.
	 */
	guard(device)(&pdev->dev);
	if (!pdev->dev.driver)
		return;

	/* The memdev's parent is the endpoint's struct device */
	struct device *mem_dev __free(put_device) = bus_find_device(
		&cxl_bus_type, NULL, &pdev->dev, match_memdev_by_parent);
	if (!mem_dev)
		return;

	cxlmd = to_cxl_memdev(mem_dev);
	if (data->severity == AER_CORRECTABLE)
		cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap);
	else
		cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);
}

static void cxl_cper_prot_err_work_fn(struct work_struct *work)
{
	struct cxl_cper_prot_err_work_data wd;

	while (cxl_cper_prot_err_kfifo_get(&wd))
		cxl_cper_handle_prot_err(&wd);
}
static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);

int cxl_ras_init(void)
{
	return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
}

void cxl_ras_exit(void)
{
	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
	cancel_work_sync(&cxl_cper_prot_err_work);
}
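
cxl_ras_init() and cxl_ras_exit() are intended to be called from the CXL
core's module init and exit paths. The wiring below is a minimal sketch under
that assumption; the cxl_core_init()/cxl_core_exit() bodies shown here are
illustrative only and omit everything else the real init path does.

/* Illustrative module wiring only (assumed call sites) */
static int __init cxl_core_init(void)
{
	return cxl_ras_init();
}

static void __exit cxl_core_exit(void)
{
	cxl_ras_exit();
}

module_init(cxl_core_init);
module_exit(cxl_core_exit);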