acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors

When PCIe AER is in FW-First mode, the OS should process CXL Protocol
errors from CPER records. Introduce support for handling and logging
CXL Protocol errors.

The existing trace events cxl_aer_uncorrectable_error and
cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse
them to trace FW-First Protocol errors.

Since the CXL code must be called from process context and GHES runs
in interrupt context, use workqueues for processing.
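
A minimal sketch of that deferral pattern (illustrative only; the
example_* names are placeholders, not part of this patch):

	#include <linux/interrupt.h>
	#include <linux/workqueue.h>

	static void example_work_fn(struct work_struct *work)
	{
		/* Process context: safe to sleep, take mutexes, call CXL code */
	}
	static DECLARE_WORK(example_work, example_work_fn);

	static irqreturn_t example_irq_handler(int irq, void *dev_id)
	{
		/* Interrupt context: only queue the work, never sleep here */
		schedule_work(&example_work);
		return IRQ_HANDLED;
	}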

Similar to CXL CPER event handling, use a kfifo to hand off the
errors, as it simplifies queue processing by providing lock-free FIFO
operations.
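
The kfifo pattern in play, as a sketch under the same single-producer,
single-consumer assumption (example_* names are illustrative):

	#include <linux/kfifo.h>
	#include <linux/printk.h>

	struct example_rec {
		int severity;
	};

	#define EXAMPLE_FIFO_DEPTH 8	/* kfifo depth must be a power of two */
	static DEFINE_KFIFO(example_fifo, struct example_rec, EXAMPLE_FIFO_DEPTH);

	/* Producer, interrupt context: kfifo_put() never sleeps and returns
	 * 0 when the fifo is full, so overflow is reported, not waited out. */
	static void example_post(struct example_rec *rec)
	{
		if (!kfifo_put(&example_fifo, *rec))
			pr_err_ratelimited("example fifo overflow\n");
	}

	/* Consumer, work function: drain until empty. With one producer and
	 * one consumer, kfifo needs no lock around put/get. */
	static void example_drain(struct work_struct *work)
	{
		struct example_rec rec;

		while (kfifo_get(&example_fifo, &rec))
			; /* handle rec */
	}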

Add the ability for the CXL subsystem to register a workqueue to
process CXL CPER Protocol errors.
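
Consumer-side usage then mirrors the drivers/cxl/core/ras.c addition
below (hedged sketch; example_* names are placeholders):

	static void example_prot_err_work_fn(struct work_struct *work)
	{
		struct cxl_cper_prot_err_work_data wd;

		while (cxl_cper_prot_err_kfifo_get(&wd))
			; /* trace/log wd */
	}
	static DECLARE_WORK(example_prot_err_work, example_prot_err_work_fn);

	static int __init example_init(void)
	{
		/* Only one consumer may register; a second gets -EINVAL */
		return cxl_cper_register_prot_err_work(&example_prot_err_work);
	}

	static void __exit example_exit(void)
	{
		cxl_cper_unregister_prot_err_work(&example_prot_err_work);
		cancel_work_sync(&example_prot_err_work);
	}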

[DJ: return cxl_cper_register_prot_err_work() directly in cxl_ras_init()]

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Li Ming <ming.li@zohomail.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Link: https://patch.msgid.link/20250310223839.31342-2-Smita.KoralahalliChannabasappa@amd.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Smita Koralahalli, 2025-03-10 22:38:38 +00:00; committed by Dave Jiang
commit 36f257e3b0 (parent 315c2f0b53)
7 changed files with 158 additions and 0 deletions

drivers/acpi/apei/ghes.c

@@ -674,6 +674,15 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
	schedule_work(&entry->work);
}

/* Room for 8 entries */
#define CXL_CPER_PROT_ERR_FIFO_DEPTH 8
static DEFINE_KFIFO(cxl_cper_prot_err_fifo, struct cxl_cper_prot_err_work_data,
		    CXL_CPER_PROT_ERR_FIFO_DEPTH);

/* Synchronize schedule_work() with cxl_cper_prot_err_work changes */
static DEFINE_SPINLOCK(cxl_cper_prot_err_work_lock);

struct work_struct *cxl_cper_prot_err_work;

static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
				   int severity)
{
@@ -700,6 +709,11 @@ static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER))
		pr_warn(FW_WARN "CXL CPER no device serial number\n");

	guard(spinlock_irqsave)(&cxl_cper_prot_err_work_lock);

	if (!cxl_cper_prot_err_work)
		return;

	switch (prot_err->agent_type) {
	case RCD:
	case DEVICE:
@@ -721,9 +735,44 @@ static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
				   prot_err->agent_type);
		return;
	}

	if (!kfifo_put(&cxl_cper_prot_err_fifo, wd)) {
		pr_err_ratelimited("CXL CPER kfifo overflow\n");
		return;
	}

	schedule_work(cxl_cper_prot_err_work);
#endif
}

int cxl_cper_register_prot_err_work(struct work_struct *work)
{
	if (cxl_cper_prot_err_work)
		return -EINVAL;

	guard(spinlock)(&cxl_cper_prot_err_work_lock);
	cxl_cper_prot_err_work = work;
	return 0;
}
EXPORT_SYMBOL_NS_GPL(cxl_cper_register_prot_err_work, "CXL");

int cxl_cper_unregister_prot_err_work(struct work_struct *work)
{
	if (cxl_cper_prot_err_work != work)
		return -EINVAL;

	guard(spinlock)(&cxl_cper_prot_err_work_lock);
	cxl_cper_prot_err_work = NULL;
	return 0;
}
EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_prot_err_work, "CXL");

int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
{
	return kfifo_get(&cxl_cper_prot_err_fifo, wd);
}
EXPORT_SYMBOL_NS_GPL(cxl_cper_prot_err_kfifo_get, "CXL");

/* Room for 8 entries for each of the 4 event log queues */
#define CXL_CPER_FIFO_DEPTH 32
DEFINE_KFIFO(cxl_cper_fifo, struct cxl_cper_work_data, CXL_CPER_FIFO_DEPTH);

drivers/cxl/core/Makefile

@@ -14,5 +14,6 @@ cxl_core-y += pci.o
cxl_core-y += hdm.o
cxl_core-y += pmu.o
cxl_core-y += cdat.o
cxl_core-y += ras.o
cxl_core-$(CONFIG_TRACING) += trace.o
cxl_core-$(CONFIG_CXL_REGION) += region.o

drivers/cxl/core/core.h

@@ -115,4 +115,7 @@ bool cxl_need_node_perf_attrs_update(int nid);
int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
					struct access_coordinate *c);

int cxl_ras_init(void);
void cxl_ras_exit(void);

#endif /* __CXL_CORE_H__ */

drivers/cxl/core/port.c

@@ -2339,8 +2339,14 @@ static __init int cxl_core_init(void)
	if (rc)
		goto err_region;

	rc = cxl_ras_init();
	if (rc)
		goto err_ras;

	return 0;

err_ras:
	cxl_region_exit();
err_region:
	bus_unregister(&cxl_bus_type);
err_bus:
@@ -2352,6 +2358,7 @@ err_wq:

static void cxl_core_exit(void)
{
	cxl_ras_exit();
	cxl_region_exit();
	bus_unregister(&cxl_bus_type);
	destroy_workqueue(cxl_bus_wq);

drivers/cxl/core/ras.c (new file, 82 lines)

@@ -0,0 +1,82 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright(c) 2025 AMD Corporation. All rights reserved. */

#include <linux/pci.h>
#include <linux/aer.h>
#include <cxl/event.h>
#include <cxlmem.h>
#include "trace.h"

static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
					 struct cxl_ras_capability_regs ras_cap)
{
	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
	struct cxl_dev_state *cxlds;

	cxlds = pci_get_drvdata(pdev);
	if (!cxlds)
		return;

	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
}

static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
					   struct cxl_ras_capability_regs ras_cap)
{
	u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
	struct cxl_dev_state *cxlds;
	u32 fe;

	cxlds = pci_get_drvdata(pdev);
	if (!cxlds)
		return;

	if (hweight32(status) > 1)
		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
				   ras_cap.cap_control));
	else
		fe = status;

	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
					  ras_cap.header_log);
}

static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
{
	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
				       data->prot_err.agent_addr.function);
	struct pci_dev *pdev __free(pci_dev_put) =
		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
					    data->prot_err.agent_addr.bus,
					    devfn);

	if (!pdev)
		return;

	guard(device)(&pdev->dev);

	if (data->severity == AER_CORRECTABLE)
		cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
	else
		cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
}

static void cxl_cper_prot_err_work_fn(struct work_struct *work)
{
	struct cxl_cper_prot_err_work_data wd;

	while (cxl_cper_prot_err_kfifo_get(&wd))
		cxl_cper_handle_prot_err(&wd);
}
static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);

int cxl_ras_init(void)
{
	return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
}

void cxl_ras_exit(void)
{
	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
	cancel_work_sync(&cxl_cper_prot_err_work);
}

include/cxl/event.h

@@ -254,6 +254,9 @@ struct cxl_cper_prot_err_work_data {
int cxl_cper_register_work(struct work_struct *work);
int cxl_cper_unregister_work(struct work_struct *work);
int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd);
int cxl_cper_register_prot_err_work(struct work_struct *work);
int cxl_cper_unregister_prot_err_work(struct work_struct *work);
int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd);
#else
static inline int cxl_cper_register_work(struct work_struct *work)
{
@@ -268,6 +271,18 @@ static inline int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
{
	return 0;
}

static inline int cxl_cper_register_prot_err_work(struct work_struct *work)
{
	return 0;
}

static inline int cxl_cper_unregister_prot_err_work(struct work_struct *work)
{
	return 0;
}

static inline int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
{
	return 0;
}
#endif

#endif /* _LINUX_CXL_EVENT_H */

tools/testing/cxl/Kbuild

@@ -61,6 +61,7 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o
cxl_core-y += $(CXL_CORE_SRC)/hdm.o
cxl_core-y += $(CXL_CORE_SRC)/pmu.o
cxl_core-y += $(CXL_CORE_SRC)/cdat.o
cxl_core-y += $(CXL_CORE_SRC)/ras.o
cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
cxl_core-y += config_check.o