mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-04-13 09:59:31 +00:00

When PCIe AER is in FW-First, OS should process CXL Protocol errors from CPER records. Introduce support for handling and logging CXL Protocol errors. The defined trace events cxl_aer_uncorrectable_error and cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse them to trace FW-First Protocol errors. Since the CXL code is required to be called from process context and GHES is in interrupt context, use workqueues for processing. Similar to CXL CPER event handling, use kfifo to handle errors as it simplifies queue processing by providing lock free fifo operations. Add the ability for the CXL sub-system to register a workqueue to process CXL CPER protocol errors. [DJ: return cxl_cper_register_prot_err_work() directly in cxl_ras_init()] Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> Reviewed-by: Li Ming <ming.li@zohomail.com> Reviewed-by: Alison Schofield <alison.schofield@intel.com> Reviewed-by: Ira Weiny <ira.weiny@intel.com> Reviewed-by: Tony Luck <tony.luck@intel.com> Link: https://patch.msgid.link/20250310223839.31342-2-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang <dave.jiang@intel.com>
82 lines
2 KiB
C
82 lines
2 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/* Copyright(c) 2025 AMD Corporation. All rights reserved. */
|
|
|
|
#include <linux/pci.h>
|
|
#include <linux/aer.h>
|
|
#include <cxl/event.h>
|
|
#include <cxlmem.h>
|
|
#include "trace.h"
|
|
|
|
static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
|
|
struct cxl_ras_capability_regs ras_cap)
|
|
{
|
|
u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
|
|
struct cxl_dev_state *cxlds;
|
|
|
|
cxlds = pci_get_drvdata(pdev);
|
|
if (!cxlds)
|
|
return;
|
|
|
|
trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
|
|
}
|
|
|
|
static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
|
|
struct cxl_ras_capability_regs ras_cap)
|
|
{
|
|
u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
|
|
struct cxl_dev_state *cxlds;
|
|
u32 fe;
|
|
|
|
cxlds = pci_get_drvdata(pdev);
|
|
if (!cxlds)
|
|
return;
|
|
|
|
if (hweight32(status) > 1)
|
|
fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
|
|
ras_cap.cap_control));
|
|
else
|
|
fe = status;
|
|
|
|
trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
|
|
ras_cap.header_log);
|
|
}
|
|
|
|
/*
 * Resolve one dequeued CPER protocol-error record to its PCI device and
 * dispatch it to the matching trace helper by severity.
 *
 * Runs in process context (workqueue), which is why the GHES interrupt
 * path defers records here before any PCI/CXL lookups are attempted.
 */
static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
{
	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
				       data->prot_err.agent_addr.function);
	/*
	 * pci_get_domain_bus_and_slot() returns a referenced pdev;
	 * __free(pci_dev_put) drops that reference automatically on every
	 * exit path. The cleanup attribute must stay on the declaration
	 * with its initializer so scope-exit ordering is correct.
	 */
	struct pci_dev *pdev __free(pci_dev_put) =
		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
					    data->prot_err.agent_addr.bus,
					    devfn);

	/* Device described by the CPER record is not (or no longer) present */
	if (!pdev)
		return;

	/*
	 * Hold the device lock for the rest of this scope so the driver
	 * cannot unbind (invalidating drvdata) while the error is traced.
	 */
	guard(device)(&pdev->dev);

	if (data->severity == AER_CORRECTABLE)
		cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
	else
		cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
}
static void cxl_cper_prot_err_work_fn(struct work_struct *work)
|
|
{
|
|
struct cxl_cper_prot_err_work_data wd;
|
|
|
|
while (cxl_cper_prot_err_kfifo_get(&wd))
|
|
cxl_cper_handle_prot_err(&wd);
|
|
}
|
|
static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
|
|
|
|
int cxl_ras_init(void)
|
|
{
|
|
return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
|
|
}
|
|
|
|
/*
 * Tear down FW-First protocol-error handling. Unregister from the CPER
 * path first so no new work can be queued, then cancel_work_sync() waits
 * for any handler execution already in flight to finish — this ordering
 * is required; reversing it could leave work requeued after cancel.
 */
void cxl_ras_exit(void)
{
	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
	cancel_work_sync(&cxl_cper_prot_err_work);
}