From 188e9529a606f35c57e34cf860b99bc2b191b5f4 Mon Sep 17 00:00:00 2001
From: Dan Williams
Date: Mon, 3 Feb 2025 20:24:06 -0800
Subject: [PATCH 01/39] cxl: Remove the CXL_DECODER_MIXED mistake

CXL_DECODER_MIXED is a safety mechanism introduced for the case where platform firmware has programmed an endpoint decoder that straddles a DPA partition boundary. While the kernel is careful to only allocate DPA capacity within a single partition, there is no guarantee that platform firmware, or anything that touched the device before the current kernel, gets that right.

However, __cxl_dpa_reserve() will never get to the CXL_DECODER_MIXED designation because of the way it tracks partition boundaries. A request_resource() that spans ->ram_res and ->pmem_res fails with the following signature:

    __cxl_dpa_reserve: cxl_port endpoint15: decoder15.0: failed to reserve allocation

CXL_DECODER_MIXED is dead defensive programming after the driver has already given up on the device. It has never offered any protection in practice, so just delete it.

Reviewed-by: Ira Weiny Reviewed-by: Jonathan Cameron Reviewed-by: Alejandro Lucero Reviewed-by: Dave Jiang Signed-off-by: Dan Williams Reviewed-by: Fan Ni Tested-by: Alejandro Lucero Link: https://patch.msgid.link/173864304660.668823.17000888505587850279.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dave Jiang
---
 drivers/cxl/core/hdm.c | 6 +++--- drivers/cxl/core/region.c | 12 ------------ drivers/cxl/cxl.h | 4 +--- 3 files changed, 4 insertions(+), 18 deletions(-)

diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 50e6a45b30ba..b7f6a2d69f78 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c
@@ -332,9 +332,9 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, else if (resource_contains(&cxlds->ram_res, res)) cxled->mode = CXL_DECODER_RAM; else { - dev_warn(dev, "decoder%d.%d: %pr mixed mode not supported\n", - port->id, cxled->cxld.id, cxled->dpa_res); - cxled->mode = CXL_DECODER_MIXED; + dev_warn(dev, "decoder%d.%d: %pr does not map any partition\n", + port->id, cxled->cxld.id, res); + cxled->mode = CXL_DECODER_NONE; } port->hdm_end++;
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index e8d11a988fd9..34aa3d45fafa 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c
@@ -2738,18 +2738,6 @@ static int poison_by_decoder(struct device *dev, void *arg) if (!cxled->dpa_res || !resource_size(cxled->dpa_res)) return rc; - /* - * Regions are only created with single mode decoders: pmem or ram. - * Linux does not support mixed mode decoders. This means that - * reading poison per endpoint decoder adheres to the requirement - * that poison reads of pmem and ram must be separated.
- * CXL 3.0 Spec 8.2.9.8.4.1 - */ - if (cxled->mode == CXL_DECODER_MIXED) { - dev_dbg(dev, "poison list read unsupported in mixed mode\n"); - return rc; - } - cxlmd = cxled_to_memdev(cxled); if (cxled->skip) { offset = cxled->dpa_res->start - cxled->skip; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index bbbaa0d0a670..9fd1524ed150 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -380,7 +380,6 @@ enum cxl_decoder_mode { CXL_DECODER_NONE, CXL_DECODER_RAM, CXL_DECODER_PMEM, - CXL_DECODER_MIXED, CXL_DECODER_DEAD, }; @@ -390,10 +389,9 @@ static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode) [CXL_DECODER_NONE] = "none", [CXL_DECODER_RAM] = "ram", [CXL_DECODER_PMEM] = "pmem", - [CXL_DECODER_MIXED] = "mixed", }; - if (mode >= CXL_DECODER_NONE && mode <= CXL_DECODER_MIXED) + if (mode >= CXL_DECODER_NONE && mode < CXL_DECODER_DEAD) return names[mode]; return "mixed"; } From d77ca6c2b52508c0d2e673e801aec342e5cdbece Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 3 Feb 2025 20:24:12 -0800 Subject: [PATCH 02/39] cxl: Introduce to_{ram,pmem}_{res,perf}() helpers In preparation for consolidating all DPA partition information into an array of DPA metadata, introduce helpers that hide the layout of the current data. I.e. make the eventual replacement of ->ram_res, ->pmem_res, ->ram_perf, and ->pmem_perf with a new DPA metadata array a no-op for code paths that consume that information, and reduce the noise of follow-on patches. The end goal is to consolidate all DPA information in 'struct cxl_dev_state', but for now the helpers just make it appear that all DPA metadata is relative to @cxlds. As the conversion to generic partition metadata walking is completed, these helpers will naturally be eliminated, or reduced in scope. 
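For example, the pmem_size_show() conversion in this patch swaps open-coded field access for the helper, and every other converted call site follows the same pattern:

    /* before */
    unsigned long long len = resource_size(&cxlds->pmem_res);

    /* after */
    unsigned long long len = cxl_pmem_size(cxlds);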
Cc: Alejandro Lucero Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang Signed-off-by: Dan Williams Reviewed-by: Jonathan Cameron Reviewed-by: Fan Ni Tested-by: Alejandro Lucero Link: https://patch.msgid.link/173864305238.668823.16553986866633608541.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/cdat.c | 71 +++++++++++++++++++++++------------- drivers/cxl/core/hdm.c | 26 +++++++------ drivers/cxl/core/mbox.c | 18 +++++---- drivers/cxl/core/memdev.c | 42 +++++++++++---------- drivers/cxl/core/region.c | 10 +++-- drivers/cxl/cxlmem.h | 58 +++++++++++++++++++++++++---- drivers/cxl/mem.c | 2 +- tools/testing/cxl/test/cxl.c | 25 +++++++------ 8 files changed, 162 insertions(+), 90 deletions(-) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index 8153f8d83a16..797baad483cb 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -258,27 +258,36 @@ static void update_perf_entry(struct device *dev, struct dsmas_entry *dent, static void cxl_memdev_set_qos_class(struct cxl_dev_state *cxlds, struct xarray *dsmas_xa) { - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); struct device *dev = cxlds->dev; - struct range pmem_range = { - .start = cxlds->pmem_res.start, - .end = cxlds->pmem_res.end, - }; - struct range ram_range = { - .start = cxlds->ram_res.start, - .end = cxlds->ram_res.end, - }; struct dsmas_entry *dent; unsigned long index; + const struct resource *partition[] = { + to_ram_res(cxlds), + to_pmem_res(cxlds), + }; + struct cxl_dpa_perf *perf[] = { + to_ram_perf(cxlds), + to_pmem_perf(cxlds), + }; xa_for_each(dsmas_xa, index, dent) { - if (resource_size(&cxlds->ram_res) && - range_contains(&ram_range, &dent->dpa_range)) - update_perf_entry(dev, dent, &mds->ram_perf); - else if (resource_size(&cxlds->pmem_res) && - range_contains(&pmem_range, &dent->dpa_range)) - update_perf_entry(dev, dent, &mds->pmem_perf); - else + bool found = false; + + for (int i = 0; i < ARRAY_SIZE(partition); i++) { + const struct resource *res = partition[i]; + struct range range = { + .start = res->start, + .end = res->end, + }; + + if (range_contains(&range, &dent->dpa_range)) { + update_perf_entry(dev, dent, perf[i]); + found = true; + break; + } + } + + if (!found) dev_dbg(dev, "no partition for dsmas dpa: %pra\n", &dent->dpa_range); } @@ -304,6 +313,9 @@ static int match_cxlrd_qos_class(struct device *dev, void *data) static void reset_dpa_perf(struct cxl_dpa_perf *dpa_perf) { + if (!dpa_perf) + return; + *dpa_perf = (struct cxl_dpa_perf) { .qos_class = CXL_QOS_CLASS_INVALID, }; @@ -312,6 +324,9 @@ static void reset_dpa_perf(struct cxl_dpa_perf *dpa_perf) static bool cxl_qos_match(struct cxl_port *root_port, struct cxl_dpa_perf *dpa_perf) { + if (!dpa_perf) + return false; + if (dpa_perf->qos_class == CXL_QOS_CLASS_INVALID) return false; @@ -346,7 +361,8 @@ static int match_cxlrd_hb(struct device *dev, void *data) static int cxl_qos_class_verify(struct cxl_memdev *cxlmd) { struct cxl_dev_state *cxlds = cxlmd->cxlds; - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + struct cxl_dpa_perf *ram_perf = to_ram_perf(cxlds), + *pmem_perf = to_pmem_perf(cxlds); struct cxl_port *root_port; int rc; @@ -359,17 +375,17 @@ static int cxl_qos_class_verify(struct cxl_memdev *cxlmd) root_port = &cxl_root->port; /* Check that the QTG IDs are all sane between end device and root decoders */ - if (!cxl_qos_match(root_port, &mds->ram_perf)) - reset_dpa_perf(&mds->ram_perf); - if (!cxl_qos_match(root_port, &mds->pmem_perf)) - 
reset_dpa_perf(&mds->pmem_perf); + if (!cxl_qos_match(root_port, ram_perf)) + reset_dpa_perf(ram_perf); + if (!cxl_qos_match(root_port, pmem_perf)) + reset_dpa_perf(pmem_perf); /* Check to make sure that the device's host bridge is under a root decoder */ rc = device_for_each_child(&root_port->dev, cxlmd->endpoint->host_bridge, match_cxlrd_hb); if (!rc) { - reset_dpa_perf(&mds->ram_perf); - reset_dpa_perf(&mds->pmem_perf); + reset_dpa_perf(ram_perf); + reset_dpa_perf(pmem_perf); } return rc; @@ -574,20 +590,23 @@ static struct cxl_dpa_perf *cxled_get_dpa_perf(struct cxl_endpoint_decoder *cxle enum cxl_decoder_mode mode) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); + struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_dpa_perf *perf; switch (mode) { case CXL_DECODER_RAM: - perf = &mds->ram_perf; + perf = to_ram_perf(cxlds); break; case CXL_DECODER_PMEM: - perf = &mds->pmem_perf; + perf = to_pmem_perf(cxlds); break; default: return ERR_PTR(-EINVAL); } + if (!perf) + return ERR_PTR(-EINVAL); + if (!dpa_perf_contains(perf, cxled->dpa_res)) return ERR_PTR(-EINVAL); diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index b7f6a2d69f78..48b26d3dad26 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -327,9 +327,9 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, cxled->dpa_res = res; cxled->skip = skipped; - if (resource_contains(&cxlds->pmem_res, res)) + if (resource_contains(to_pmem_res(cxlds), res)) cxled->mode = CXL_DECODER_PMEM; - else if (resource_contains(&cxlds->ram_res, res)) + else if (resource_contains(to_ram_res(cxlds), res)) cxled->mode = CXL_DECODER_RAM; else { dev_warn(dev, "decoder%d.%d: %pr does not map any partition\n", @@ -442,11 +442,11 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled, * Only allow modes that are supported by the current partition * configuration */ - if (mode == CXL_DECODER_PMEM && !resource_size(&cxlds->pmem_res)) { + if (mode == CXL_DECODER_PMEM && !cxl_pmem_size(cxlds)) { dev_dbg(dev, "no available pmem capacity\n"); return -ENXIO; } - if (mode == CXL_DECODER_RAM && !resource_size(&cxlds->ram_res)) { + if (mode == CXL_DECODER_RAM && !cxl_ram_size(cxlds)) { dev_dbg(dev, "no available ram capacity\n"); return -ENXIO; } @@ -464,6 +464,8 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size) struct device *dev = &cxled->cxld.dev; resource_size_t start, avail, skip; struct resource *p, *last; + const struct resource *ram_res = to_ram_res(cxlds); + const struct resource *pmem_res = to_pmem_res(cxlds); int rc; down_write(&cxl_dpa_rwsem); @@ -480,37 +482,37 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size) goto out; } - for (p = cxlds->ram_res.child, last = NULL; p; p = p->sibling) + for (p = ram_res->child, last = NULL; p; p = p->sibling) last = p; if (last) free_ram_start = last->end + 1; else - free_ram_start = cxlds->ram_res.start; + free_ram_start = ram_res->start; - for (p = cxlds->pmem_res.child, last = NULL; p; p = p->sibling) + for (p = pmem_res->child, last = NULL; p; p = p->sibling) last = p; if (last) free_pmem_start = last->end + 1; else - free_pmem_start = cxlds->pmem_res.start; + free_pmem_start = pmem_res->start; if (cxled->mode == CXL_DECODER_RAM) { start = free_ram_start; - avail = cxlds->ram_res.end - start + 1; + avail = ram_res->end - start + 1; skip = 0; } else if (cxled->mode == CXL_DECODER_PMEM) { resource_size_t skip_start, skip_end; start = 
free_pmem_start; - avail = cxlds->pmem_res.end - start + 1; + avail = pmem_res->end - start + 1; skip_start = free_ram_start; /* * If some pmem is already allocated, then that allocation * already handled the skip. */ - if (cxlds->pmem_res.child && - skip_start == cxlds->pmem_res.child->start) + if (pmem_res->child && + skip_start == pmem_res->child->start) skip_end = skip_start - 1; else skip_end = start - 1; diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 548564c770c0..3502f1633ad2 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1270,24 +1270,26 @@ static int add_dpa_res(struct device *dev, struct resource *parent, int cxl_mem_create_range_info(struct cxl_memdev_state *mds) { struct cxl_dev_state *cxlds = &mds->cxlds; + struct resource *ram_res = to_ram_res(cxlds); + struct resource *pmem_res = to_pmem_res(cxlds); struct device *dev = cxlds->dev; int rc; if (!cxlds->media_ready) { cxlds->dpa_res = DEFINE_RES_MEM(0, 0); - cxlds->ram_res = DEFINE_RES_MEM(0, 0); - cxlds->pmem_res = DEFINE_RES_MEM(0, 0); + *ram_res = DEFINE_RES_MEM(0, 0); + *pmem_res = DEFINE_RES_MEM(0, 0); return 0; } cxlds->dpa_res = DEFINE_RES_MEM(0, mds->total_bytes); if (mds->partition_align_bytes == 0) { - rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0, + rc = add_dpa_res(dev, &cxlds->dpa_res, ram_res, 0, mds->volatile_only_bytes, "ram"); if (rc) return rc; - return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res, + return add_dpa_res(dev, &cxlds->dpa_res, pmem_res, mds->volatile_only_bytes, mds->persistent_only_bytes, "pmem"); } @@ -1298,11 +1300,11 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds) return rc; } - rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0, + rc = add_dpa_res(dev, &cxlds->dpa_res, ram_res, 0, mds->active_volatile_bytes, "ram"); if (rc) return rc; - return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res, + return add_dpa_res(dev, &cxlds->dpa_res, pmem_res, mds->active_volatile_bytes, mds->active_persistent_bytes, "pmem"); } @@ -1450,8 +1452,8 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev) mds->cxlds.reg_map.host = dev; mds->cxlds.reg_map.resource = CXL_RESOURCE_NONE; mds->cxlds.type = CXL_DEVTYPE_CLASSMEM; - mds->ram_perf.qos_class = CXL_QOS_CLASS_INVALID; - mds->pmem_perf.qos_class = CXL_QOS_CLASS_INVALID; + to_ram_perf(&mds->cxlds)->qos_class = CXL_QOS_CLASS_INVALID; + to_pmem_perf(&mds->cxlds)->qos_class = CXL_QOS_CLASS_INVALID; return mds; } diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index ae3dfcbe8938..c5f8320ed330 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -80,7 +80,7 @@ static ssize_t ram_size_show(struct device *dev, struct device_attribute *attr, { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; - unsigned long long len = resource_size(&cxlds->ram_res); + unsigned long long len = resource_size(to_ram_res(cxlds)); return sysfs_emit(buf, "%#llx\n", len); } @@ -93,7 +93,7 @@ static ssize_t pmem_size_show(struct device *dev, struct device_attribute *attr, { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; - unsigned long long len = resource_size(&cxlds->pmem_res); + unsigned long long len = cxl_pmem_size(cxlds); return sysfs_emit(buf, "%#llx\n", len); } @@ -198,16 +198,20 @@ static int cxl_get_poison_by_memdev(struct cxl_memdev *cxlmd) int rc = 0; /* CXL 3.0 Spec 8.2.9.8.4.1 Separate pmem and ram poison requests */ - if 
(resource_size(&cxlds->pmem_res)) { - offset = cxlds->pmem_res.start; - length = resource_size(&cxlds->pmem_res); + if (cxl_pmem_size(cxlds)) { + const struct resource *res = to_pmem_res(cxlds); + + offset = res->start; + length = resource_size(res); rc = cxl_mem_get_poison(cxlmd, offset, length, NULL); if (rc) return rc; } - if (resource_size(&cxlds->ram_res)) { - offset = cxlds->ram_res.start; - length = resource_size(&cxlds->ram_res); + if (cxl_ram_size(cxlds)) { + const struct resource *res = to_ram_res(cxlds); + + offset = res->start; + length = resource_size(res); rc = cxl_mem_get_poison(cxlmd, offset, length, NULL); /* * Invalid Physical Address is not an error for @@ -409,9 +413,8 @@ static ssize_t pmem_qos_class_show(struct device *dev, { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); - return sysfs_emit(buf, "%d\n", mds->pmem_perf.qos_class); + return sysfs_emit(buf, "%d\n", to_pmem_perf(cxlds)->qos_class); } static struct device_attribute dev_attr_pmem_qos_class = @@ -428,9 +431,8 @@ static ssize_t ram_qos_class_show(struct device *dev, { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); - return sysfs_emit(buf, "%d\n", mds->ram_perf.qos_class); + return sysfs_emit(buf, "%d\n", to_ram_perf(cxlds)->qos_class); } static struct device_attribute dev_attr_ram_qos_class = @@ -466,11 +468,11 @@ static umode_t cxl_ram_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); struct cxl_memdev *cxlmd = to_cxl_memdev(dev); - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); + struct cxl_dpa_perf *perf = to_ram_perf(cxlmd->cxlds); - if (a == &dev_attr_ram_qos_class.attr) - if (mds->ram_perf.qos_class == CXL_QOS_CLASS_INVALID) - return 0; + if (a == &dev_attr_ram_qos_class.attr && + (!perf || perf->qos_class == CXL_QOS_CLASS_INVALID)) + return 0; return a->mode; } @@ -485,11 +487,11 @@ static umode_t cxl_pmem_visible(struct kobject *kobj, struct attribute *a, int n { struct device *dev = kobj_to_dev(kobj); struct cxl_memdev *cxlmd = to_cxl_memdev(dev); - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); + struct cxl_dpa_perf *perf = to_pmem_perf(cxlmd->cxlds); - if (a == &dev_attr_pmem_qos_class.attr) - if (mds->pmem_perf.qos_class == CXL_QOS_CLASS_INVALID) - return 0; + if (a == &dev_attr_pmem_qos_class.attr && + (!perf || perf->qos_class == CXL_QOS_CLASS_INVALID)) + return 0; return a->mode; } diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 34aa3d45fafa..6706a1b79549 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2701,7 +2701,7 @@ static int cxl_get_poison_unmapped(struct cxl_memdev *cxlmd, if (ctx->mode == CXL_DECODER_RAM) { offset = ctx->offset; - length = resource_size(&cxlds->ram_res) - offset; + length = cxl_ram_size(cxlds) - offset; rc = cxl_mem_get_poison(cxlmd, offset, length, NULL); if (rc == -EFAULT) rc = 0; @@ -2713,9 +2713,11 @@ static int cxl_get_poison_unmapped(struct cxl_memdev *cxlmd, length = resource_size(&cxlds->dpa_res) - offset; if (!length) return 0; - } else if (resource_size(&cxlds->pmem_res)) { - offset = cxlds->pmem_res.start; - length = resource_size(&cxlds->pmem_res); + } else if (cxl_pmem_size(cxlds)) { + const struct resource *res = to_pmem_res(cxlds); + + offset = res->start; + length = resource_size(res); } else { return 0; } 
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 2a25d1957ddb..78e92e24d7b5 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -423,8 +423,8 @@ struct cxl_dpa_perf { * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH) * @media_ready: Indicate whether the device media is usable * @dpa_res: Overall DPA resource tree for the device - * @pmem_res: Active Persistent memory capacity configuration - * @ram_res: Active Volatile memory capacity configuration + * @_pmem_res: Active Persistent memory capacity configuration + * @_ram_res: Active Volatile memory capacity configuration * @serial: PCIe Device Serial Number * @type: Generic Memory Class device or Vendor Specific Memory device * @cxl_mbox: CXL mailbox context @@ -438,13 +438,41 @@ struct cxl_dev_state { bool rcd; bool media_ready; struct resource dpa_res; - struct resource pmem_res; - struct resource ram_res; + struct resource _pmem_res; + struct resource _ram_res; u64 serial; enum cxl_devtype type; struct cxl_mailbox cxl_mbox; }; +static inline struct resource *to_ram_res(struct cxl_dev_state *cxlds) +{ + return &cxlds->_ram_res; +} + +static inline struct resource *to_pmem_res(struct cxl_dev_state *cxlds) +{ + return &cxlds->_pmem_res; +} + +static inline resource_size_t cxl_ram_size(struct cxl_dev_state *cxlds) +{ + const struct resource *res = to_ram_res(cxlds); + + if (!res) + return 0; + return resource_size(res); +} + +static inline resource_size_t cxl_pmem_size(struct cxl_dev_state *cxlds) +{ + const struct resource *res = to_pmem_res(cxlds); + + if (!res) + return 0; + return resource_size(res); +} + static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox) { return dev_get_drvdata(cxl_mbox->host); @@ -471,8 +499,8 @@ static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox) * @active_persistent_bytes: sum of hard + soft persistent * @next_volatile_bytes: volatile capacity change pending device reset * @next_persistent_bytes: persistent capacity change pending device reset - * @ram_perf: performance data entry matched to RAM partition - * @pmem_perf: performance data entry matched to PMEM partition + * @_ram_perf: performance data entry matched to RAM partition + * @_pmem_perf: performance data entry matched to PMEM partition * @event: event log driver state * @poison: poison driver state info * @security: security driver state info @@ -496,8 +524,8 @@ struct cxl_memdev_state { u64 next_volatile_bytes; u64 next_persistent_bytes; - struct cxl_dpa_perf ram_perf; - struct cxl_dpa_perf pmem_perf; + struct cxl_dpa_perf _ram_perf; + struct cxl_dpa_perf _pmem_perf; struct cxl_event_state event; struct cxl_poison_state poison; @@ -505,6 +533,20 @@ struct cxl_memdev_state { struct cxl_fw_state fw; }; +static inline struct cxl_dpa_perf *to_ram_perf(struct cxl_dev_state *cxlds) +{ + struct cxl_memdev_state *mds = container_of(cxlds, typeof(*mds), cxlds); + + return &mds->_ram_perf; +} + +static inline struct cxl_dpa_perf *to_pmem_perf(struct cxl_dev_state *cxlds) +{ + struct cxl_memdev_state *mds = container_of(cxlds, typeof(*mds), cxlds); + + return &mds->_pmem_perf; +} + static inline struct cxl_memdev_state * to_cxl_memdev_state(struct cxl_dev_state *cxlds) { diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 2f03a4d5606e..9675243bd05b 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -152,7 +152,7 @@ static int cxl_mem_probe(struct device *dev) return -ENXIO; } - if (resource_size(&cxlds->pmem_res) && 
IS_ENABLED(CONFIG_CXL_PMEM)) { + if (cxl_pmem_size(cxlds) && IS_ENABLED(CONFIG_CXL_PMEM)) { rc = devm_cxl_add_nvdimm(parent_port, cxlmd); if (rc) { if (rc == -ENODEV)
diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index cc8948f49117..9654513bbccf 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c
@@ -1000,25 +1000,28 @@ static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port) find_cxl_root(port); struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; - struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); struct access_coordinate ep_c[ACCESS_COORDINATE_MAX]; - struct range pmem_range = { - .start = cxlds->pmem_res.start, - .end = cxlds->pmem_res.end, + const struct resource *partition[] = { + to_ram_res(cxlds), + to_pmem_res(cxlds), }; - struct range ram_range = { - .start = cxlds->ram_res.start, - .end = cxlds->ram_res.end, + struct cxl_dpa_perf *perf[] = { + to_ram_perf(cxlds), + to_pmem_perf(cxlds), }; if (!cxl_root) return; - if (range_len(&ram_range)) - dpa_perf_setup(port, &ram_range, &mds->ram_perf); + for (int i = 0; i < ARRAY_SIZE(partition); i++) { + const struct resource *res = partition[i]; + struct range range = { + .start = res->start, + .end = res->end, + }; - if (range_len(&pmem_range)) - dpa_perf_setup(port, &pmem_range, &mds->pmem_perf); + dpa_perf_setup(port, &range, perf[i]); + } cxl_memdev_update_perf(cxlmd);

From 8e4c411c533f79407bbc970011aaa73ac602a9d1 Mon Sep 17 00:00:00 2001
From: Dan Williams
Date: Mon, 3 Feb 2025 20:24:18 -0800
Subject: [PATCH 03/39] cxl: Introduce 'struct cxl_dpa_partition' and 'struct cxl_range_info'

The pending efforts to add CXL Accelerator (type-2) device [1], and Dynamic Capacity (DCD) support [2], tripped on the no-longer-fit-for-purpose design in the CXL subsystem for tracking device-physical-address (DPA) metadata. Trip hazards include:

- CXL Memory Devices need to consider a PMEM partition, but Accelerator devices with CXL.mem likely do not in the common case.

- CXL Memory Devices enumerate DPA through Memory Device mailbox commands like Partition Info, Accelerator devices do not.

- CXL Memory Devices that support DCD support more than 2 partitions. Some of the driver algorithms are awkward to expand to > 2 partition cases.

- DPA performance data is a general capability that can be shared with accelerators, so tracking it in 'struct cxl_memdev_state' is no longer suitable.

- Hardcoded assumptions around the PMEM partition always being index-1 if RAM is zero-sized or PMEM is zero-sized.

- 'enum cxl_decoder_mode' is sometimes a partition id and sometimes a memory property; it should be phased out in favor of a partition id, with the memory property coming from the partition info.

Towards cleaning up those issues and allowing a smoother landing for the aforementioned pending efforts, introduce a 'struct cxl_dpa_partition' array to 'struct cxl_dev_state', and 'struct cxl_range_info' as a shared way for Memory Devices and Accelerators to initialize the DPA information in 'struct cxl_dev_state'.

For now, split a new cxl_dpa_setup() from cxl_mem_create_range_info() to get the new data structure initialized, and clean up some qos_class init. Follow-on patches will go further to use the new data structure to clean up algorithms that are better suited to loop over all possible partitions.

cxl_dpa_setup() follows the locking expectations of mutating the device DPA map, and is suitable for Accelerator drivers to use.
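As a rough sketch of that Accelerator usage (illustrative only, not part of this patch; @total_bytes stands in for however a driver discovers its device capacity), a single volatile partition would be conveyed to the core like so:

    struct cxl_dpa_info info = {
        .size = total_bytes,
        .part = {
            {
                .range = { .start = 0, .end = total_bytes - 1 },
                .mode = CXL_PARTMODE_RAM,
            },
        },
        .nr_partitions = 1,
    };

    rc = cxl_dpa_setup(cxlds, &info);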
Accelerators likely only have one hardcoded 'ram' partition to convey to the cxl_core. Link: http://lore.kernel.org/20241230214445.27602-1-alejandro.lucero-palau@amd.com [1] Link: http://lore.kernel.org/20241210-dcd-type2-upstream-v8-0-812852504400@intel.com [2] Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Alejandro Lucero Signed-off-by: Dan Williams Reviewed-by: Jonathan Cameron Tested-by: Alejandro Lucero Link: https://patch.msgid.link/173864305827.668823.13978794102080021276.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/cdat.c | 15 ++---- drivers/cxl/core/hdm.c | 88 +++++++++++++++++++++++++++++++++- drivers/cxl/core/mbox.c | 68 +++++++++----------------- drivers/cxl/core/memdev.c | 2 +- drivers/cxl/cxlmem.h | 93 +++++++++++++++++++++++++----------- drivers/cxl/pci.c | 7 ++- tools/testing/cxl/test/cxl.c | 15 ++---- tools/testing/cxl/test/mem.c | 7 ++- 8 files changed, 195 insertions(+), 100 deletions(-) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index 797baad483cb..33cabe4aa026 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -261,27 +261,20 @@ static void cxl_memdev_set_qos_class(struct cxl_dev_state *cxlds, struct device *dev = cxlds->dev; struct dsmas_entry *dent; unsigned long index; - const struct resource *partition[] = { - to_ram_res(cxlds), - to_pmem_res(cxlds), - }; - struct cxl_dpa_perf *perf[] = { - to_ram_perf(cxlds), - to_pmem_perf(cxlds), - }; xa_for_each(dsmas_xa, index, dent) { bool found = false; - for (int i = 0; i < ARRAY_SIZE(partition); i++) { - const struct resource *res = partition[i]; + for (int i = 0; i < cxlds->nr_partitions; i++) { + struct resource *res = &cxlds->part[i].res; struct range range = { .start = res->start, .end = res->end, }; if (range_contains(&range, &dent->dpa_range)) { - update_perf_entry(dev, dent, perf[i]); + update_perf_entry(dev, dent, + &cxlds->part[i].perf); found = true; break; } diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 48b26d3dad26..ea4e792f789f 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -327,9 +327,9 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, cxled->dpa_res = res; cxled->skip = skipped; - if (resource_contains(to_pmem_res(cxlds), res)) + if (to_pmem_res(cxlds) && resource_contains(to_pmem_res(cxlds), res)) cxled->mode = CXL_DECODER_PMEM; - else if (resource_contains(to_ram_res(cxlds), res)) + else if (to_ram_res(cxlds) && resource_contains(to_ram_res(cxlds), res)) cxled->mode = CXL_DECODER_RAM; else { dev_warn(dev, "decoder%d.%d: %pr does not map any partition\n", @@ -342,6 +342,90 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, return 0; } +static int add_dpa_res(struct device *dev, struct resource *parent, + struct resource *res, resource_size_t start, + resource_size_t size, const char *type) +{ + int rc; + + *res = (struct resource) { + .name = type, + .start = start, + .end = start + size - 1, + .flags = IORESOURCE_MEM, + }; + if (resource_size(res) == 0) { + dev_dbg(dev, "DPA(%s): no capacity\n", res->name); + return 0; + } + rc = request_resource(parent, res); + if (rc) { + dev_err(dev, "DPA(%s): failed to track %pr (%d)\n", res->name, + res, rc); + return rc; + } + + dev_dbg(dev, "DPA(%s): %pr\n", res->name, res); + + return 0; +} + +static const char *cxl_mode_name(enum cxl_partition_mode mode) +{ + switch (mode) { + case CXL_PARTMODE_RAM: + return "ram"; + case CXL_PARTMODE_PMEM: + return "pmem"; + default: + return ""; + }; +} + +/* if this 
fails the caller must destroy @cxlds, there is no recovery */ +int cxl_dpa_setup(struct cxl_dev_state *cxlds, const struct cxl_dpa_info *info) +{ + struct device *dev = cxlds->dev; + + guard(rwsem_write)(&cxl_dpa_rwsem); + + if (cxlds->nr_partitions) + return -EBUSY; + + if (!info->size || !info->nr_partitions) { + cxlds->dpa_res = DEFINE_RES_MEM(0, 0); + cxlds->nr_partitions = 0; + return 0; + } + + cxlds->dpa_res = DEFINE_RES_MEM(0, info->size); + + for (int i = 0; i < info->nr_partitions; i++) { + const struct cxl_dpa_part_info *part = &info->part[i]; + int rc; + + cxlds->part[i].perf.qos_class = CXL_QOS_CLASS_INVALID; + cxlds->part[i].mode = part->mode; + + /* Require ordered + contiguous partitions */ + if (i) { + const struct cxl_dpa_part_info *prev = &info->part[i - 1]; + + if (prev->range.end + 1 != part->range.start) + return -EINVAL; + } + rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->part[i].res, + part->range.start, range_len(&part->range), + cxl_mode_name(part->mode)); + if (rc) + return rc; + cxlds->nr_partitions++; + } + + return 0; +} +EXPORT_SYMBOL_GPL(cxl_dpa_setup); + int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, resource_size_t base, resource_size_t len, resource_size_t skipped) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 3502f1633ad2..62bb3653362f 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1241,57 +1241,39 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd) return rc; } -static int add_dpa_res(struct device *dev, struct resource *parent, - struct resource *res, resource_size_t start, - resource_size_t size, const char *type) +static void add_part(struct cxl_dpa_info *info, u64 start, u64 size, enum cxl_partition_mode mode) { - int rc; + int i = info->nr_partitions; - res->name = type; - res->start = start; - res->end = start + size - 1; - res->flags = IORESOURCE_MEM; - if (resource_size(res) == 0) { - dev_dbg(dev, "DPA(%s): no capacity\n", res->name); - return 0; - } - rc = request_resource(parent, res); - if (rc) { - dev_err(dev, "DPA(%s): failed to track %pr (%d)\n", res->name, - res, rc); - return rc; - } + if (size == 0) + return; - dev_dbg(dev, "DPA(%s): %pr\n", res->name, res); - - return 0; + info->part[i].range = (struct range) { + .start = start, + .end = start + size - 1, + }; + info->part[i].mode = mode; + info->nr_partitions++; } -int cxl_mem_create_range_info(struct cxl_memdev_state *mds) +int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info) { struct cxl_dev_state *cxlds = &mds->cxlds; - struct resource *ram_res = to_ram_res(cxlds); - struct resource *pmem_res = to_pmem_res(cxlds); struct device *dev = cxlds->dev; int rc; if (!cxlds->media_ready) { - cxlds->dpa_res = DEFINE_RES_MEM(0, 0); - *ram_res = DEFINE_RES_MEM(0, 0); - *pmem_res = DEFINE_RES_MEM(0, 0); + info->size = 0; return 0; } - cxlds->dpa_res = DEFINE_RES_MEM(0, mds->total_bytes); + info->size = mds->total_bytes; if (mds->partition_align_bytes == 0) { - rc = add_dpa_res(dev, &cxlds->dpa_res, ram_res, 0, - mds->volatile_only_bytes, "ram"); - if (rc) - return rc; - return add_dpa_res(dev, &cxlds->dpa_res, pmem_res, - mds->volatile_only_bytes, - mds->persistent_only_bytes, "pmem"); + add_part(info, 0, mds->volatile_only_bytes, CXL_PARTMODE_RAM); + add_part(info, mds->volatile_only_bytes, + mds->persistent_only_bytes, CXL_PARTMODE_PMEM); + return 0; } rc = cxl_mem_get_partition_info(mds); @@ -1300,15 +1282,13 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds) return rc; } - rc = 
add_dpa_res(dev, &cxlds->dpa_res, ram_res, 0, - mds->active_volatile_bytes, "ram"); - if (rc) - return rc; - return add_dpa_res(dev, &cxlds->dpa_res, pmem_res, - mds->active_volatile_bytes, - mds->active_persistent_bytes, "pmem"); + add_part(info, 0, mds->active_volatile_bytes, CXL_PARTMODE_RAM); + add_part(info, mds->active_volatile_bytes, mds->active_persistent_bytes, + CXL_PARTMODE_PMEM); + + return 0; } -EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, "CXL"); +EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL"); int cxl_set_timestamp(struct cxl_memdev_state *mds) { @@ -1452,8 +1432,6 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev) mds->cxlds.reg_map.host = dev; mds->cxlds.reg_map.resource = CXL_RESOURCE_NONE; mds->cxlds.type = CXL_DEVTYPE_CLASSMEM; - to_ram_perf(&mds->cxlds)->qos_class = CXL_QOS_CLASS_INVALID; - to_pmem_perf(&mds->cxlds)->qos_class = CXL_QOS_CLASS_INVALID; return mds; } diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index c5f8320ed330..be0eb57086e1 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -80,7 +80,7 @@ static ssize_t ram_size_show(struct device *dev, struct device_attribute *attr, { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; - unsigned long long len = resource_size(to_ram_res(cxlds)); + unsigned long long len = cxl_ram_size(cxlds); return sysfs_emit(buf, "%#llx\n", len); } diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 78e92e24d7b5..e33b2d5efed9 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -97,6 +97,24 @@ int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, resource_size_t base, resource_size_t len, resource_size_t skipped); +enum cxl_partition_mode { + CXL_PARTMODE_RAM, + CXL_PARTMODE_PMEM, +}; + +#define CXL_NR_PARTITIONS_MAX 2 + +struct cxl_dpa_info { + u64 size; + struct cxl_dpa_part_info { + struct range range; + enum cxl_partition_mode mode; + } part[CXL_NR_PARTITIONS_MAX]; + int nr_partitions; +}; + +int cxl_dpa_setup(struct cxl_dev_state *cxlds, const struct cxl_dpa_info *info); + static inline struct cxl_ep *cxl_ep_load(struct cxl_port *port, struct cxl_memdev *cxlmd) { @@ -408,6 +426,18 @@ struct cxl_dpa_perf { int qos_class; }; +/** + * struct cxl_dpa_partition - DPA partition descriptor + * @res: shortcut to the partition in the DPA resource tree (cxlds->dpa_res) + * @perf: performance attributes of the partition from CDAT + * @mode: operation mode for the DPA capacity, e.g. ram, pmem, dynamic... 
+ */ +struct cxl_dpa_partition { + struct resource res; + struct cxl_dpa_perf perf; + enum cxl_partition_mode mode; +}; + /** * struct cxl_dev_state - The driver device state * @@ -423,8 +453,8 @@ struct cxl_dpa_perf { * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH) * @media_ready: Indicate whether the device media is usable * @dpa_res: Overall DPA resource tree for the device - * @_pmem_res: Active Persistent memory capacity configuration - * @_ram_res: Active Volatile memory capacity configuration + * @part: DPA partition array + * @nr_partitions: Number of DPA partitions * @serial: PCIe Device Serial Number * @type: Generic Memory Class device or Vendor Specific Memory device * @cxl_mbox: CXL mailbox context @@ -438,21 +468,47 @@ struct cxl_dev_state { bool rcd; bool media_ready; struct resource dpa_res; - struct resource _pmem_res; - struct resource _ram_res; + struct cxl_dpa_partition part[CXL_NR_PARTITIONS_MAX]; + unsigned int nr_partitions; u64 serial; enum cxl_devtype type; struct cxl_mailbox cxl_mbox; }; -static inline struct resource *to_ram_res(struct cxl_dev_state *cxlds) + +/* Static RAM is only expected at partition 0. */ +static inline const struct resource *to_ram_res(struct cxl_dev_state *cxlds) { - return &cxlds->_ram_res; + if (cxlds->part[0].mode != CXL_PARTMODE_RAM) + return NULL; + return &cxlds->part[0].res; } -static inline struct resource *to_pmem_res(struct cxl_dev_state *cxlds) +/* + * Static PMEM may be at partition index 0 when there is no static RAM + * capacity. + */ +static inline const struct resource *to_pmem_res(struct cxl_dev_state *cxlds) { - return &cxlds->_pmem_res; + for (int i = 0; i < cxlds->nr_partitions; i++) + if (cxlds->part[i].mode == CXL_PARTMODE_PMEM) + return &cxlds->part[i].res; + return NULL; +} + +static inline struct cxl_dpa_perf *to_ram_perf(struct cxl_dev_state *cxlds) +{ + if (cxlds->part[0].mode != CXL_PARTMODE_RAM) + return NULL; + return &cxlds->part[0].perf; +} + +static inline struct cxl_dpa_perf *to_pmem_perf(struct cxl_dev_state *cxlds) +{ + for (int i = 0; i < cxlds->nr_partitions; i++) + if (cxlds->part[i].mode == CXL_PARTMODE_PMEM) + return &cxlds->part[i].perf; + return NULL; } static inline resource_size_t cxl_ram_size(struct cxl_dev_state *cxlds) @@ -499,8 +555,6 @@ static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox) * @active_persistent_bytes: sum of hard + soft persistent * @next_volatile_bytes: volatile capacity change pending device reset * @next_persistent_bytes: persistent capacity change pending device reset - * @_ram_perf: performance data entry matched to RAM partition - * @_pmem_perf: performance data entry matched to PMEM partition * @event: event log driver state * @poison: poison driver state info * @security: security driver state info @@ -524,29 +578,12 @@ struct cxl_memdev_state { u64 next_volatile_bytes; u64 next_persistent_bytes; - struct cxl_dpa_perf _ram_perf; - struct cxl_dpa_perf _pmem_perf; - struct cxl_event_state event; struct cxl_poison_state poison; struct cxl_security_state security; struct cxl_fw_state fw; }; -static inline struct cxl_dpa_perf *to_ram_perf(struct cxl_dev_state *cxlds) -{ - struct cxl_memdev_state *mds = container_of(cxlds, typeof(*mds), cxlds); - - return &mds->_ram_perf; -} - -static inline struct cxl_dpa_perf *to_pmem_perf(struct cxl_dev_state *cxlds) -{ - struct cxl_memdev_state *mds = container_of(cxlds, typeof(*mds), cxlds); - - return &mds->_pmem_perf; -} - static inline struct cxl_memdev_state * 
to_cxl_memdev_state(struct cxl_dev_state *cxlds) { @@ -860,7 +897,7 @@ int cxl_internal_send_cmd(struct cxl_mailbox *cxl_mbox, int cxl_dev_state_identify(struct cxl_memdev_state *mds); int cxl_await_media_ready(struct cxl_dev_state *cxlds); int cxl_enumerate_cmds(struct cxl_memdev_state *mds); -int cxl_mem_create_range_info(struct cxl_memdev_state *mds); +int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info); struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev); void set_exclusive_cxl_commands(struct cxl_memdev_state *mds, unsigned long *cmds); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index a96e54c6259e..b2c943a4de0a 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -903,6 +903,7 @@ __ATTRIBUTE_GROUPS(cxl_rcd); static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct pci_host_bridge *host_bridge = pci_find_host_bridge(pdev->bus); + struct cxl_dpa_info range_info = { 0 }; struct cxl_memdev_state *mds; struct cxl_dev_state *cxlds; struct cxl_register_map map; @@ -993,7 +994,11 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) return rc; - rc = cxl_mem_create_range_info(mds); + rc = cxl_mem_dpa_fetch(mds, &range_info); + if (rc) + return rc; + + rc = cxl_dpa_setup(cxlds, &range_info); if (rc) return rc; diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 9654513bbccf..083a66a52731 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -1001,26 +1001,19 @@ static void mock_cxl_endpoint_parse_cdat(struct cxl_port *port) struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; struct access_coordinate ep_c[ACCESS_COORDINATE_MAX]; - const struct resource *partition[] = { - to_ram_res(cxlds), - to_pmem_res(cxlds), - }; - struct cxl_dpa_perf *perf[] = { - to_ram_perf(cxlds), - to_pmem_perf(cxlds), - }; if (!cxl_root) return; - for (int i = 0; i < ARRAY_SIZE(partition); i++) { - const struct resource *res = partition[i]; + for (int i = 0; i < cxlds->nr_partitions; i++) { + struct resource *res = &cxlds->part[i].res; + struct cxl_dpa_perf *perf = &cxlds->part[i].perf; struct range range = { .start = res->start, .end = res->end, }; - dpa_perf_setup(port, &range, perf[i]); + dpa_perf_setup(port, &range, perf); } cxl_memdev_update_perf(cxlmd); diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 8d731bd63988..495199238335 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1494,6 +1494,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) struct cxl_dev_state *cxlds; struct cxl_mockmem_data *mdata; struct cxl_mailbox *cxl_mbox; + struct cxl_dpa_info range_info = { 0 }; int rc; mdata = devm_kzalloc(dev, sizeof(*mdata), GFP_KERNEL); @@ -1554,7 +1555,11 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) if (rc) return rc; - rc = cxl_mem_create_range_info(mds); + rc = cxl_mem_dpa_fetch(mds, &range_info); + if (rc) + return rc; + + rc = cxl_dpa_setup(cxlds, &range_info); if (rc) return rc; From 991d98f17d31644826977e49477544987000a08a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 3 Feb 2025 20:24:24 -0800 Subject: [PATCH 04/39] cxl: Make cxl_dpa_alloc() DPA partition number agnostic cxl_dpa_alloc() is a hard coded nest of assumptions around PMEM allocations being distinct from RAM allocations in specific ways when in practice the allocation rules are only relative to DPA partition index. 
The rules for cxl_dpa_alloc() are: - allocations can only come from 1 partition - if allocating at partition-index-N, all free space in partitions less than partition-index-N must be skipped over Use the new 'struct cxl_dpa_partition' array to support allocation with an arbitrary number of DPA partitions on the device. A follow-on patch can go further to cleanup 'enum cxl_decoder_mode' concept and supersede it with looking up the memory properties from partition metadata. Until then cxl_part_mode() temporarily bridges code that looks up partitions by @cxled->mode. Reviewed-by: Ira Weiny Reviewed-by: Alejandro Lucero Reviewed-by: Dave Jiang Signed-off-by: Dan Williams Reviewed-by: Jonathan Cameron Tested-by: Alejandro Lucero Link: https://patch.msgid.link/173864306400.668823.12143134425285426523.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/hdm.c | 205 +++++++++++++++++++++++++++++------------ drivers/cxl/cxlmem.h | 14 +++ 2 files changed, 159 insertions(+), 60 deletions(-) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index ea4e792f789f..78ecb88bad7e 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -223,6 +223,37 @@ void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds) } EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, "CXL"); +/* See request_skip() kernel-doc */ +static resource_size_t __adjust_skip(struct cxl_dev_state *cxlds, + const resource_size_t skip_base, + const resource_size_t skip_len, + const char *requester) +{ + const resource_size_t skip_end = skip_base + skip_len - 1; + + for (int i = 0; i < cxlds->nr_partitions; i++) { + const struct resource *part_res = &cxlds->part[i].res; + resource_size_t adjust_start, adjust_end, size; + + adjust_start = max(skip_base, part_res->start); + adjust_end = min(skip_end, part_res->end); + + if (adjust_end < adjust_start) + continue; + + size = adjust_end - adjust_start + 1; + + if (!requester) + __release_region(&cxlds->dpa_res, adjust_start, size); + else if (!__request_region(&cxlds->dpa_res, adjust_start, size, + requester, 0)) + return adjust_start - skip_base; + } + + return skip_len; +} +#define release_skip(c, b, l) __adjust_skip((c), (b), (l), NULL) + /* * Must be called in a context that synchronizes against this decoder's * port ->remove() callback (like an endpoint decoder sysfs attribute) @@ -241,7 +272,7 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled) skip_start = res->start - cxled->skip; __release_region(&cxlds->dpa_res, res->start, resource_size(res)); if (cxled->skip) - __release_region(&cxlds->dpa_res, skip_start, cxled->skip); + release_skip(cxlds, skip_start, cxled->skip); cxled->skip = 0; cxled->dpa_res = NULL; put_device(&cxled->cxld.dev); @@ -268,6 +299,58 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled) __cxl_dpa_release(cxled); } +/** + * request_skip() - Track DPA 'skip' in @cxlds->dpa_res resource tree + * @cxlds: CXL.mem device context that parents @cxled + * @cxled: Endpoint decoder establishing new allocation that skips lower DPA + * @skip_base: DPA < start of new DPA allocation (DPAnew) + * @skip_len: @skip_base + @skip_len == DPAnew + * + * DPA 'skip' arises from out-of-sequence DPA allocation events relative + * to free capacity across multiple partitions. It is a wasteful event + * as usable DPA gets thrown away, but if a deployment has, for example, + * a dual RAM+PMEM device, wants to use PMEM, and has unallocated RAM + * DPA, the free RAM DPA must be sacrificed to start allocating PMEM. 
+ * See third "Implementation Note" in CXL 3.1 8.2.4.19.13 "Decoder + * Protection" for more details. + * + * A 'skip' always covers the last allocated DPA in a previous partition + * to the start of the current partition to allocate. Allocations never + * start in the middle of a partition, and allocations are always + * de-allocated in reverse order (see cxl_dpa_free(), or natural devm + * unwind order from forced in-order allocation). + * + * If @cxlds->nr_partitions was guaranteed to be <= 2 then the 'skip' + * would always be contained to a single partition. Given + * @cxlds->nr_partitions may be > 2 it results in cases where the 'skip' + * might span "tail capacity of partition[0], all of partition[1], ..., + * all of partition[N-1]" to support allocating from partition[N]. That + * in turn interacts with the partition 'struct resource' boundaries + * within @cxlds->dpa_res whereby 'skip' requests need to be divided by + * partition. I.e. this is a quirk of using a 'struct resource' tree to + * detect range conflicts while also tracking partition boundaries in + * @cxlds->dpa_res. + */ +static int request_skip(struct cxl_dev_state *cxlds, + struct cxl_endpoint_decoder *cxled, + const resource_size_t skip_base, + const resource_size_t skip_len) +{ + resource_size_t skipped = __adjust_skip(cxlds, skip_base, skip_len, + dev_name(&cxled->cxld.dev)); + + if (skipped == skip_len) + return 0; + + dev_dbg(cxlds->dev, + "%s: failed to reserve skipped space (%pa %pa %pa)\n", + dev_name(&cxled->cxld.dev), &skip_base, &skip_len, &skipped); + + release_skip(cxlds, skip_base, skipped); + + return -EBUSY; +} + static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, resource_size_t base, resource_size_t len, resource_size_t skipped)
@@ -276,7 +359,9 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, struct cxl_port *port = cxled_to_port(cxled); struct cxl_dev_state *cxlds = cxlmd->cxlds; struct device *dev = &port->dev; + enum cxl_decoder_mode mode; struct resource *res; + int rc; lockdep_assert_held_write(&cxl_dpa_rwsem);
@@ -305,14 +390,9 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, } if (skipped) { - res = __request_region(&cxlds->dpa_res, base - skipped, skipped, - dev_name(&cxled->cxld.dev), 0); - if (!res) { - dev_dbg(dev, - "decoder%d.%d: failed to reserve skipped space\n", - port->id, cxled->cxld.id); - return -EBUSY; - } + rc = request_skip(cxlds, cxled, base - skipped, skipped); + if (rc) + return rc; } res = __request_region(&cxlds->dpa_res, base, len, dev_name(&cxled->cxld.dev), 0);
@@ -320,22 +400,23 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n", port->id, cxled->cxld.id); if (skipped) - __release_region(&cxlds->dpa_res, base - skipped, - skipped); + release_skip(cxlds, base - skipped, skipped); return -EBUSY; } cxled->dpa_res = res; cxled->skip = skipped; - if (to_pmem_res(cxlds) && resource_contains(to_pmem_res(cxlds), res)) - cxled->mode = CXL_DECODER_PMEM; - else if (to_ram_res(cxlds) && resource_contains(to_ram_res(cxlds), res)) - cxled->mode = CXL_DECODER_RAM; - else { + mode = CXL_DECODER_NONE; + for (int i = 0; i < cxlds->nr_partitions; i++) + if (resource_contains(&cxlds->part[i].res, res)) { + mode = cxl_part_mode(cxlds->part[i].mode); + break; + } + + if (mode == CXL_DECODER_NONE) dev_warn(dev, "decoder%d.%d: %pr does not map any partition\n", port->id, cxled->cxld.id, res); - cxled->mode = CXL_DECODER_NONE; - } + cxled->mode = mode; port->hdm_end++; get_device(&cxled->cxld.dev);
@@ -542,15 +623,13 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - resource_size_t free_ram_start, free_pmem_start; struct cxl_port *port = cxled_to_port(cxled); struct cxl_dev_state *cxlds = cxlmd->cxlds; struct device *dev = &cxled->cxld.dev; - resource_size_t start, avail, skip; + struct resource *res, *prev = NULL; + resource_size_t start, avail, skip, skip_start; struct resource *p, *last; - const struct resource *ram_res = to_ram_res(cxlds); - const struct resource *pmem_res = to_pmem_res(cxlds); - int rc; + int part, rc; down_write(&cxl_dpa_rwsem); if (cxled->cxld.region) {
@@ -566,47 +645,53 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size) goto out; } - for (p = ram_res->child, last = NULL; p; p = p->sibling) - last = p; - if (last) - free_ram_start = last->end + 1; - else - free_ram_start = ram_res->start; + part = -1; + for (int i = 0; i < cxlds->nr_partitions; i++) { + if (cxled->mode == cxl_part_mode(cxlds->part[i].mode)) { + part = i; + break; + } + } - for (p = pmem_res->child, last = NULL; p; p = p->sibling) - last = p; - if (last) - free_pmem_start = last->end + 1; - else - free_pmem_start = pmem_res->start; - - if (cxled->mode == CXL_DECODER_RAM) { - start = free_ram_start; - avail = ram_res->end - start + 1; - skip = 0; - } else if (cxled->mode == CXL_DECODER_PMEM) { - resource_size_t skip_start, skip_end; - - start = free_pmem_start; - avail = pmem_res->end - start + 1; - skip_start = free_ram_start; - - /* - * If some pmem is already allocated, then that allocation - * already handled the skip. - */ - if (pmem_res->child && - skip_start == pmem_res->child->start) - skip_end = skip_start - 1; - else - skip_end = start - 1; - skip = skip_end - skip_start + 1; - } else { - dev_dbg(dev, "mode not set\n"); - rc = -EINVAL; + if (part < 0) { + rc = -EBUSY; goto out; } + res = &cxlds->part[part].res; + for (p = res->child, last = NULL; p; p = p->sibling) + last = p; + if (last) + start = last->end + 1; + else + start = res->start; + + /* + * To allocate at partition N, a skip needs to be calculated for all + * unallocated space at lower partition indices. + * + * If a partition has any allocations, the search can end because a + * previous cxl_dpa_alloc() invocation is assumed to have accounted for + * all previous partitions.
+ */ + skip_start = CXL_RESOURCE_NONE; + for (int i = part; i; i--) { + prev = &cxlds->part[i - 1].res; + for (p = prev->child, last = NULL; p; p = p->sibling) + last = p; + if (last) { + skip_start = last->end + 1; + break; + } + skip_start = prev->start; + } + + avail = res->end - start + 1; + if (skip_start == CXL_RESOURCE_NONE) + skip = 0; + else + skip = res->start - skip_start; + if (size > avail) { dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size, cxl_decoder_mode_name(cxled->mode), &avail); diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index e33b2d5efed9..9fe615d96573 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -529,6 +529,20 @@ static inline resource_size_t cxl_pmem_size(struct cxl_dev_state *cxlds) return resource_size(res); } +/* + * Translate the operational mode of memory capacity with the + * operational mode of a decoder + * TODO: kill 'enum cxl_decoder_mode' to obviate this helper + */ +static inline enum cxl_decoder_mode cxl_part_mode(enum cxl_partition_mode mode) +{ + if (mode == CXL_PARTMODE_RAM) + return CXL_DECODER_RAM; + if (mode == CXL_PARTMODE_PMEM) + return CXL_DECODER_PMEM; + return CXL_DECODER_NONE; +} + static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox) { return dev_get_drvdata(cxl_mbox->host); From be5cbd0840275c68b3b7d0685d7acc26436c0d99 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 3 Feb 2025 20:24:29 -0800 Subject: [PATCH 05/39] cxl: Kill enum cxl_decoder_mode Now that the operational mode of DPA capacity (ram vs pmem... etc) is tracked in the partition, and no code paths have dependencies on the mode implying the partition index, the ambiguous 'enum cxl_decoder_mode' can be cleaned up, specifically this ambiguity on whether the operation mode implied anything about the partition order. Endpoint decoders simply reference their assigned partition where the operational mode can be retrieved as partition mode. With this in place PMEM can now be partition0 which happens today when the RAM capacity size is zero. Dynamic RAM can appear above PMEM when DCD arrives, etc. Code sequences that hard coded the "PMEM after RAM" assumption can now just iterate partitions and consult the partition mode after the fact. 
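I.e. something of this shape (an illustrative sketch, not a hunk from this patch; handle_pmem() is a made-up helper):

    for (int i = 0; i < cxlds->nr_partitions; i++) {
        struct cxl_dpa_partition *part = &cxlds->part[i];

        if (part->mode == CXL_PARTMODE_PMEM)
            handle_pmem(&part->res); /* hypothetical pmem-only path */
    }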
Reviewed-by: Ira Weiny Reviewed-by: Alejandro Lucero Reviewed-by: Dave Jiang Signed-off-by: Dan Williams Reviewed-by: Jonathan Cameron Tested-by: Alejandro Lucero Link: https://patch.msgid.link/173864306972.668823.3327008645125276726.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dave Jiang
---
 drivers/cxl/core/cdat.c | 18 ++---- drivers/cxl/core/core.h | 4 +- drivers/cxl/core/hdm.c | 68 +++++++++---------- drivers/cxl/core/memdev.c | 15 +---- drivers/cxl/core/port.c | 21 ++++-- drivers/cxl/core/region.c | 133 +++++++++++++++++++++----------------- drivers/cxl/cxl.h | 37 +++-------- drivers/cxl/cxlmem.h | 19 ------ 8 files changed, 137 insertions(+), 178 deletions(-)

diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index 33cabe4aa026..f96ae28022b0 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c
@@ -579,23 +579,15 @@ static bool dpa_perf_contains(struct cxl_dpa_perf *perf, return range_contains(&perf->dpa_range, &dpa); } -static struct cxl_dpa_perf *cxled_get_dpa_perf(struct cxl_endpoint_decoder *cxled, - enum cxl_decoder_mode mode) +static struct cxl_dpa_perf *cxled_get_dpa_perf(struct cxl_endpoint_decoder *cxled) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_dpa_perf *perf; - switch (mode) { - case CXL_DECODER_RAM: - perf = to_ram_perf(cxlds); - break; - case CXL_DECODER_PMEM: - perf = to_pmem_perf(cxlds); - break; - default: + if (cxled->part < 0) return ERR_PTR(-EINVAL); - } + perf = &cxlds->part[cxled->part].perf; if (!perf) return ERR_PTR(-EINVAL);
@@ -659,7 +651,7 @@ static int cxl_endpoint_gather_bandwidth(struct cxl_region *cxlr, if (cxlds->rcd) return -ENODEV; - perf = cxled_get_dpa_perf(cxled, cxlr->mode); + perf = cxled_get_dpa_perf(cxled); if (IS_ERR(perf)) return PTR_ERR(perf);
@@ -1065,7 +1057,7 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr, lockdep_assert_held(&cxl_dpa_rwsem); - perf = cxled_get_dpa_perf(cxled, cxlr->mode); + perf = cxled_get_dpa_perf(cxled); if (IS_ERR(perf)) return;
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 800466f96a68..22dac79c5192 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h
@@ -72,8 +72,8 @@ void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, resource_size_t length); struct dentry *cxl_debugfs_create_dir(const char *dir); -int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled, - enum cxl_decoder_mode mode); +int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled, + enum cxl_partition_mode mode); int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size); int cxl_dpa_free(struct cxl_endpoint_decoder *cxled); resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled);
diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 78ecb88bad7e..d705dec1471e 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c
@@ -359,7 +359,6 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, struct cxl_port *port = cxled_to_port(cxled); struct cxl_dev_state *cxlds = cxlmd->cxlds; struct device *dev = &port->dev; - enum cxl_decoder_mode mode; struct resource *res; int rc;
@@ -406,17 +405,21 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, cxled->dpa_res = res; cxled->skip = skipped; - mode = CXL_DECODER_NONE; - for (int i = 0; i < cxlds->nr_partitions; i++) - if (resource_contains(&cxlds->part[i].res, res)) { - mode = cxl_part_mode(cxlds->part[i].mode); - break; - } + /* + * When allocating new capacity, ->part is already set, when + * discovering decoder settings at initial enumeration, ->part + * is not set. + */ + if (cxled->part < 0) + for (int i = 0; i < cxlds->nr_partitions; i++) + if (resource_contains(&cxlds->part[i].res, res)) { + cxled->part = i; + break; + } - if (mode == CXL_DECODER_NONE) + if (cxled->part < 0) dev_warn(dev, "decoder%d.%d: %pr does not map any partition\n", port->id, cxled->cxld.id, res); - cxled->mode = mode; port->hdm_end++; get_device(&cxled->cxld.dev);
@@ -583,40 +586,33 @@ out: return rc; } -int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled, - enum cxl_decoder_mode mode) +int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled, + enum cxl_partition_mode mode) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct cxl_dev_state *cxlds = cxlmd->cxlds; struct device *dev = &cxled->cxld.dev; - - switch (mode) { - case CXL_DECODER_RAM: - case CXL_DECODER_PMEM: - break; - default: - dev_dbg(dev, "unsupported mode: %d\n", mode); - return -EINVAL; - } + int part; guard(rwsem_write)(&cxl_dpa_rwsem); if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) return -EBUSY; - /* - * Only allow modes that are supported by the current partition - * configuration - */ - if (mode == CXL_DECODER_PMEM && !cxl_pmem_size(cxlds)) { - dev_dbg(dev, "no available pmem capacity\n"); - return -ENXIO; + for (part = 0; part < cxlds->nr_partitions; part++) + if (cxlds->part[part].mode == mode) + break; + + if (part >= cxlds->nr_partitions) { + dev_dbg(dev, "unsupported mode: %d\n", mode); + return -EINVAL; } - if (mode == CXL_DECODER_RAM && !cxl_ram_size(cxlds)) { - dev_dbg(dev, "no available ram capacity\n"); + + if (!resource_size(&cxlds->part[part].res)) { + dev_dbg(dev, "no available capacity for mode: %d\n", mode); return -ENXIO; } - cxled->mode = mode; + cxled->part = part; return 0; }
@@ -645,15 +641,9 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size) goto out; } - part = -1; - for (int i = 0; i < cxlds->nr_partitions; i++) { - if (cxled->mode == cxl_part_mode(cxlds->part[i].mode)) { - part = i; - break; - } - } - + part = cxled->part; if (part < 0) { + dev_dbg(dev, "partition not set\n"); rc = -EBUSY; goto out; }
@@ -694,7 +684,7 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size) if (size > avail) { dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size, - cxl_decoder_mode_name(cxled->mode), &avail); + res->name, &avail); rc = -ENOSPC; goto out; }
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index be0eb57086e1..615cbd861f66 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c
@@ -198,17 +198,8 @@ static int cxl_get_poison_by_memdev(struct cxl_memdev *cxlmd) int rc = 0; /* CXL 3.0 Spec 8.2.9.8.4.1 Separate pmem and ram poison requests */ - if (cxl_pmem_size(cxlds)) { - const struct resource *res = to_pmem_res(cxlds); - - offset = res->start; - length = resource_size(res); - rc = cxl_mem_get_poison(cxlmd, offset, length, NULL); - if (rc) - return rc; - } - if (cxl_ram_size(cxlds)) { - const struct resource *res = to_ram_res(cxlds); + for (int i = 0; i < cxlds->nr_partitions; i++) { + const struct resource *res = &cxlds->part[i].res; offset = res->start; length = resource_size(res);
@@ -217,7 +208,7 @@ * Invalid Physical Address is not an error for * volatile addresses. Device support is optional.
*/ - if (rc == -EFAULT) + if (rc == -EFAULT && cxlds->part[i].mode == CXL_PARTMODE_RAM) rc = 0; } return rc; diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 78a5c2c25982..fc24c5bc9f2a 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -194,25 +194,35 @@ static ssize_t mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + /* without @cxl_dpa_rwsem, make sure @part is not reloaded */ + int part = READ_ONCE(cxled->part); + const char *desc; - return sysfs_emit(buf, "%s\n", cxl_decoder_mode_name(cxled->mode)); + if (part < 0) + desc = "none"; + else + desc = cxlds->part[part].res.name; + + return sysfs_emit(buf, "%s\n", desc); } static ssize_t mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); - enum cxl_decoder_mode mode; + enum cxl_partition_mode mode; ssize_t rc; if (sysfs_streq(buf, "pmem")) - mode = CXL_DECODER_PMEM; + mode = CXL_PARTMODE_PMEM; else if (sysfs_streq(buf, "ram")) - mode = CXL_DECODER_RAM; + mode = CXL_PARTMODE_RAM; else return -EINVAL; - rc = cxl_dpa_set_mode(cxled, mode); + rc = cxl_dpa_set_part(cxled, mode); if (rc) return rc; @@ -1899,6 +1909,7 @@ struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port) return ERR_PTR(-ENOMEM); cxled->pos = -1; + cxled->part = -1; cxld = &cxled->cxld; rc = cxl_decoder_init(port, cxld); if (rc) { diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 6706a1b79549..84ce625b8591 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -144,7 +144,7 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, rc = down_read_interruptible(&cxl_region_rwsem); if (rc) return rc; - if (cxlr->mode != CXL_DECODER_PMEM) + if (cxlr->mode != CXL_PARTMODE_PMEM) rc = sysfs_emit(buf, "\n"); else rc = sysfs_emit(buf, "%pUb\n", &p->uuid); @@ -441,7 +441,7 @@ static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a, * Support tooling that expects to find a 'uuid' attribute for all * regions regardless of mode. 
*/ - if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_DECODER_PMEM) + if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM) return 0444; return a->mode; } @@ -603,8 +603,16 @@ static ssize_t mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cxl_region *cxlr = to_cxl_region(dev); + const char *desc; - return sysfs_emit(buf, "%s\n", cxl_decoder_mode_name(cxlr->mode)); + if (cxlr->mode == CXL_PARTMODE_RAM) + desc = "ram"; + else if (cxlr->mode == CXL_PARTMODE_PMEM) + desc = "pmem"; + else + desc = ""; + + return sysfs_emit(buf, "%s\n", desc); } static DEVICE_ATTR_RO(mode); @@ -630,7 +638,7 @@ static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size) /* ways, granularity and uuid (if PMEM) need to be set before HPA */ if (!p->interleave_ways || !p->interleave_granularity || - (cxlr->mode == CXL_DECODER_PMEM && uuid_is_null(&p->uuid))) + (cxlr->mode == CXL_PARTMODE_PMEM && uuid_is_null(&p->uuid))) return -ENXIO; div64_u64_rem(size, (u64)SZ_256M * p->interleave_ways, &remainder); @@ -1888,6 +1896,7 @@ static int cxl_region_attach(struct cxl_region *cxlr, { struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_region_params *p = &cxlr->params; struct cxl_port *ep_port, *root_port; struct cxl_dport *dport; @@ -1902,17 +1911,17 @@ static int cxl_region_attach(struct cxl_region *cxlr, return rc; } - if (cxled->mode != cxlr->mode) { - dev_dbg(&cxlr->dev, "%s region mode: %d mismatch: %d\n", - dev_name(&cxled->cxld.dev), cxlr->mode, cxled->mode); - return -EINVAL; - } - - if (cxled->mode == CXL_DECODER_DEAD) { + if (cxled->part < 0) { dev_dbg(&cxlr->dev, "%s dead\n", dev_name(&cxled->cxld.dev)); return -ENODEV; } + if (cxlds->part[cxled->part].mode != cxlr->mode) { + dev_dbg(&cxlr->dev, "%s region mode: %d mismatch\n", + dev_name(&cxled->cxld.dev), cxlr->mode); + return -EINVAL; + } + /* all full of members, or interleave config not established? 
*/ if (p->state > CXL_CONFIG_INTERLEAVE_ACTIVE) { dev_dbg(&cxlr->dev, "region already active\n"); @@ -2115,7 +2124,7 @@ out: void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled) { down_write(&cxl_region_rwsem); - cxled->mode = CXL_DECODER_DEAD; + cxled->part = -1; cxl_region_detach(cxled); up_write(&cxl_region_rwsem); } @@ -2471,7 +2480,7 @@ static int cxl_region_calculate_adistance(struct notifier_block *nb, */ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, int id, - enum cxl_decoder_mode mode, + enum cxl_partition_mode mode, enum cxl_decoder_type type) { struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent); @@ -2525,13 +2534,13 @@ static ssize_t create_ram_region_show(struct device *dev, } static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, - enum cxl_decoder_mode mode, int id) + enum cxl_partition_mode mode, int id) { int rc; switch (mode) { - case CXL_DECODER_RAM: - case CXL_DECODER_PMEM: + case CXL_PARTMODE_RAM: + case CXL_PARTMODE_PMEM: break; default: dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode); @@ -2551,7 +2560,7 @@ static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, } static ssize_t create_region_store(struct device *dev, const char *buf, - size_t len, enum cxl_decoder_mode mode) + size_t len, enum cxl_partition_mode mode) { struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); struct cxl_region *cxlr; @@ -2572,7 +2581,7 @@ static ssize_t create_pmem_region_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - return create_region_store(dev, buf, len, CXL_DECODER_PMEM); + return create_region_store(dev, buf, len, CXL_PARTMODE_PMEM); } DEVICE_ATTR_RW(create_pmem_region); @@ -2580,7 +2589,7 @@ static ssize_t create_ram_region_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - return create_region_store(dev, buf, len, CXL_DECODER_RAM); + return create_region_store(dev, buf, len, CXL_PARTMODE_RAM); } DEVICE_ATTR_RW(create_ram_region); @@ -2678,7 +2687,7 @@ EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, "CXL"); struct cxl_poison_context { struct cxl_port *port; - enum cxl_decoder_mode mode; + int part; u64 offset; }; @@ -2686,49 +2695,45 @@ static int cxl_get_poison_unmapped(struct cxl_memdev *cxlmd, struct cxl_poison_context *ctx) { struct cxl_dev_state *cxlds = cxlmd->cxlds; + const struct resource *res; + struct resource *p, *last; u64 offset, length; int rc = 0; - /* - * Collect poison for the remaining unmapped resources - * after poison is collected by committed endpoints. - * - * Knowing that PMEM must always follow RAM, get poison - * for unmapped resources based on the last decoder's mode: - * ram: scan remains of ram range, then any pmem range - * pmem: scan remains of pmem range - */ - - if (ctx->mode == CXL_DECODER_RAM) { - offset = ctx->offset; - length = cxl_ram_size(cxlds) - offset; - rc = cxl_mem_get_poison(cxlmd, offset, length, NULL); - if (rc == -EFAULT) - rc = 0; - if (rc) - return rc; - } - if (ctx->mode == CXL_DECODER_PMEM) { - offset = ctx->offset; - length = resource_size(&cxlds->dpa_res) - offset; - if (!length) - return 0; - } else if (cxl_pmem_size(cxlds)) { - const struct resource *res = to_pmem_res(cxlds); - - offset = res->start; - length = resource_size(res); - } else { + if (ctx->part < 0) return 0; + + /* + * Collect poison for the remaining unmapped resources after + * poison is collected by committed endpoints decoders. 
+ */ + for (int i = ctx->part; i < cxlds->nr_partitions; i++) { + res = &cxlds->part[i].res; + for (p = res->child, last = NULL; p; p = p->sibling) + last = p; + if (last) + offset = last->end + 1; + else + offset = res->start; + length = res->end - offset + 1; + if (!length) + break; + rc = cxl_mem_get_poison(cxlmd, offset, length, NULL); + if (rc == -EFAULT && cxlds->part[i].mode == CXL_PARTMODE_RAM) + continue; + if (rc) + break; } - return cxl_mem_get_poison(cxlmd, offset, length, NULL); + return rc; } static int poison_by_decoder(struct device *dev, void *arg) { struct cxl_poison_context *ctx = arg; struct cxl_endpoint_decoder *cxled; + enum cxl_partition_mode mode; + struct cxl_dev_state *cxlds; struct cxl_memdev *cxlmd; u64 offset, length; int rc = 0; @@ -2737,15 +2742,18 @@ static int poison_by_decoder(struct device *dev, void *arg) return rc; cxled = to_cxl_endpoint_decoder(dev); - if (!cxled->dpa_res || !resource_size(cxled->dpa_res)) + if (!cxled->dpa_res) return rc; cxlmd = cxled_to_memdev(cxled); + cxlds = cxlmd->cxlds; + mode = cxlds->part[cxled->part].mode; + if (cxled->skip) { offset = cxled->dpa_res->start - cxled->skip; length = cxled->skip; rc = cxl_mem_get_poison(cxlmd, offset, length, NULL); - if (rc == -EFAULT && cxled->mode == CXL_DECODER_RAM) + if (rc == -EFAULT && mode == CXL_PARTMODE_RAM) rc = 0; if (rc) return rc; @@ -2754,7 +2762,7 @@ static int poison_by_decoder(struct device *dev, void *arg) offset = cxled->dpa_res->start; length = cxled->dpa_res->end - offset + 1; rc = cxl_mem_get_poison(cxlmd, offset, length, cxled->cxld.region); - if (rc == -EFAULT && cxled->mode == CXL_DECODER_RAM) + if (rc == -EFAULT && mode == CXL_PARTMODE_RAM) rc = 0; if (rc) return rc; @@ -2762,7 +2770,7 @@ static int poison_by_decoder(struct device *dev, void *arg) /* Iterate until commit_end is reached */ if (cxled->cxld.id == ctx->port->commit_end) { ctx->offset = cxled->dpa_res->end + 1; - ctx->mode = cxled->mode; + ctx->part = cxled->part; return 1; } @@ -2775,7 +2783,8 @@ int cxl_get_poison_by_endpoint(struct cxl_port *port) int rc = 0; ctx = (struct cxl_poison_context) { - .port = port + .port = port, + .part = -1, }; rc = device_for_each_child(&port->dev, &ctx, poison_by_decoder); @@ -3220,14 +3229,18 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct cxl_port *port = cxlrd_to_port(cxlrd); + struct cxl_dev_state *cxlds = cxlmd->cxlds; struct range *hpa = &cxled->cxld.hpa_range; + int rc, part = READ_ONCE(cxled->part); struct cxl_region_params *p; struct cxl_region *cxlr; struct resource *res; - int rc; + + if (part < 0) + return ERR_PTR(-EBUSY); do { - cxlr = __create_region(cxlrd, cxled->mode, + cxlr = __create_region(cxlrd, cxlds->part[part].mode, atomic_read(&cxlrd->region_id)); } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY); @@ -3430,9 +3443,9 @@ out: return rc; switch (cxlr->mode) { - case CXL_DECODER_PMEM: + case CXL_PARTMODE_PMEM: return devm_cxl_add_pmem_region(cxlr); - case CXL_DECODER_RAM: + case CXL_PARTMODE_RAM: /* * The region can not be manged by CXL if any portion of * it is already online as 'System RAM' diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 9fd1524ed150..94a34833eedd 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -372,30 +372,6 @@ struct cxl_decoder { void (*reset)(struct cxl_decoder *cxld); }; -/* - * CXL_DECODER_DEAD prevents endpoints from being reattached to regions - * while cxld_unregister() is running - */ -enum cxl_decoder_mode { - 
CXL_DECODER_NONE, - CXL_DECODER_RAM, - CXL_DECODER_PMEM, - CXL_DECODER_DEAD, -}; - -static inline const char *cxl_decoder_mode_name(enum cxl_decoder_mode mode) -{ - static const char * const names[] = { - [CXL_DECODER_NONE] = "none", - [CXL_DECODER_RAM] = "ram", - [CXL_DECODER_PMEM] = "pmem", - }; - - if (mode >= CXL_DECODER_NONE && mode < CXL_DECODER_DEAD) - return names[mode]; - return "mixed"; -} - /* * Track whether this decoder is reserved for region autodiscovery, or * free for userspace provisioning. @@ -410,16 +386,16 @@ enum cxl_decoder_state { * @cxld: base cxl_decoder_object * @dpa_res: actively claimed DPA span of this decoder * @skip: offset into @dpa_res where @cxld.hpa_range maps - * @mode: which memory type / access-mode-partition this decoder targets * @state: autodiscovery state + * @part: partition index this decoder maps * @pos: interleave position in @cxld.region */ struct cxl_endpoint_decoder { struct cxl_decoder cxld; struct resource *dpa_res; resource_size_t skip; - enum cxl_decoder_mode mode; enum cxl_decoder_state state; + int part; int pos; }; @@ -504,6 +480,11 @@ struct cxl_region_params { int nr_targets; }; +enum cxl_partition_mode { + CXL_PARTMODE_RAM, + CXL_PARTMODE_PMEM, +}; + /* * Indicate whether this region has been assembled by autodetection or * userspace assembly. Prevent endpoint decoders outside of automatic @@ -523,7 +504,7 @@ struct cxl_region_params { * struct cxl_region - CXL region * @dev: This region's device * @id: This region's id. Id is globally unique across all regions - * @mode: Endpoint decoder allocation / access mode + * @mode: Operational mode of the mapped capacity * @type: Endpoint decoder target type * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown * @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge @@ -536,7 +517,7 @@ struct cxl_region_params { struct cxl_region { struct device dev; int id; - enum cxl_decoder_mode mode; + enum cxl_partition_mode mode; enum cxl_decoder_type type; struct cxl_nvdimm_bridge *cxl_nvb; struct cxl_pmem_region *cxlr_pmem; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 9fe615d96573..f218d43dec9f 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -97,11 +97,6 @@ int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, resource_size_t base, resource_size_t len, resource_size_t skipped); -enum cxl_partition_mode { - CXL_PARTMODE_RAM, - CXL_PARTMODE_PMEM, -}; - #define CXL_NR_PARTITIONS_MAX 2 struct cxl_dpa_info { @@ -529,20 +524,6 @@ static inline resource_size_t cxl_pmem_size(struct cxl_dev_state *cxlds) return resource_size(res); } -/* - * Translate the operational mode of memory capacity with the - * operational mode of a decoder - * TODO: kill 'enum cxl_decoder_mode' to obviate this helper - */ -static inline enum cxl_decoder_mode cxl_part_mode(enum cxl_partition_mode mode) -{ - if (mode == CXL_PARTMODE_RAM) - return CXL_DECODER_RAM; - if (mode == CXL_PARTMODE_PMEM) - return CXL_DECODER_PMEM; - return CXL_DECODER_NONE; -} - static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox) { return dev_get_drvdata(cxl_mbox->host); From 58d60bbe0a99539afb1f29d03c28f06747f94531 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 3 Feb 2025 20:24:35 -0800 Subject: [PATCH 06/39] cxl: Cleanup partition size and perf helpers Now that the 'struct cxl_dpa_partition' array contains both size and performance information, all paths that iterate over that information can use a loop rather than hard-code 'ram' and 'pmem' lookups. 
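For illustration only (a sketch, not code from this series; the helper
name is hypothetical), the hard-coded lookups reduce to a loop of the
form:

    static resource_size_t part_size(struct cxl_dev_state *cxlds,
                                     enum cxl_partition_mode mode)
    {
            /* one loop replaces per-mode 'ram'/'pmem' lookups */
            for (int i = 0; i < cxlds->nr_partitions; i++)
                    if (cxlds->part[i].mode == mode)
                            return resource_size(&cxlds->part[i].res);
            return 0;
    }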
Remove, or reduce the scope of the temporary helpers that bridged the pre-'struct cxl_dpa_partition' state of the code to the post-'struct cxl_dpa_partition' state. - to_{ram,pmem}_perf(): scope reduced to just sysfs_emit + is_visible() helpers - to_{ram,pmem}_res(): fold into their only users cxl_{ram,pmem}_size() - cxl_ram_size(): scope reduced to ram_size_show() (Note, cxl_pmem_size() also used to gate nvdimm registration) In short, memdev sysfs ABI already made the promise that 0-sized partitions will show for memdevs, but that can be avoided for future partitions by using dynamic sysfs group visibility (new relative to when the partition ABI first shipped upstream). Cc: Dave Jiang Cc: Alejandro Lucero Cc: Ira Weiny Signed-off-by: Dan Williams Reviewed-by: Jonathan Cameron Tested-by: Alejandro Lucero Link: https://patch.msgid.link/173864307519.668823.10800104022426067621.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/cdat.c | 49 +++++++++++++++++---------------- drivers/cxl/core/memdev.c | 23 ++++++++++++++++ drivers/cxl/cxlmem.h | 58 ++++++--------------------------------- 3 files changed, 57 insertions(+), 73 deletions(-) diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index f96ae28022b0..bfba7f5019cf 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -306,9 +306,6 @@ static int match_cxlrd_qos_class(struct device *dev, void *data) static void reset_dpa_perf(struct cxl_dpa_perf *dpa_perf) { - if (!dpa_perf) - return; - *dpa_perf = (struct cxl_dpa_perf) { .qos_class = CXL_QOS_CLASS_INVALID, }; @@ -317,9 +314,6 @@ static void reset_dpa_perf(struct cxl_dpa_perf *dpa_perf) static bool cxl_qos_match(struct cxl_port *root_port, struct cxl_dpa_perf *dpa_perf) { - if (!dpa_perf) - return false; - if (dpa_perf->qos_class == CXL_QOS_CLASS_INVALID) return false; @@ -351,37 +345,46 @@ static int match_cxlrd_hb(struct device *dev, void *data) return 0; } -static int cxl_qos_class_verify(struct cxl_memdev *cxlmd) +static void cxl_qos_class_verify(struct cxl_memdev *cxlmd) { struct cxl_dev_state *cxlds = cxlmd->cxlds; - struct cxl_dpa_perf *ram_perf = to_ram_perf(cxlds), - *pmem_perf = to_pmem_perf(cxlds); struct cxl_port *root_port; - int rc; struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(cxlmd->endpoint); + /* + * No need to reset_dpa_perf() here as find_cxl_root() is guaranteed to + * succeed when called in the cxl_endpoint_port_probe() path. + */ if (!cxl_root) - return -ENODEV; + return; root_port = &cxl_root->port; - /* Check that the QTG IDs are all sane between end device and root decoders */ - if (!cxl_qos_match(root_port, ram_perf)) - reset_dpa_perf(ram_perf); - if (!cxl_qos_match(root_port, pmem_perf)) - reset_dpa_perf(pmem_perf); + /* + * Save userspace from needing to check if a qos class has any matches + * by hiding qos class info if the memdev is not mapped by a root + * decoder, or the partition class does not match any root decoder + * class. 
+ */ + if (!device_for_each_child(&root_port->dev, + cxlmd->endpoint->host_bridge, + match_cxlrd_hb)) { + for (int i = 0; i < cxlds->nr_partitions; i++) { + struct cxl_dpa_perf *perf = &cxlds->part[i].perf; - /* Check to make sure that the device's host bridge is under a root decoder */ - rc = device_for_each_child(&root_port->dev, - cxlmd->endpoint->host_bridge, match_cxlrd_hb); - if (!rc) { - reset_dpa_perf(ram_perf); - reset_dpa_perf(pmem_perf); + reset_dpa_perf(perf); + } + return; } - return rc; + for (int i = 0; i < cxlds->nr_partitions; i++) { + struct cxl_dpa_perf *perf = &cxlds->part[i].perf; + + if (!cxl_qos_match(root_port, perf)) + reset_dpa_perf(perf); + } } static void discard_dsmas(struct xarray *xa) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 615cbd861f66..63c6c681125d 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -75,6 +75,14 @@ static ssize_t label_storage_size_show(struct device *dev, } static DEVICE_ATTR_RO(label_storage_size); +static resource_size_t cxl_ram_size(struct cxl_dev_state *cxlds) +{ + /* Static RAM is only expected at partition 0. */ + if (cxlds->part[0].mode != CXL_PARTMODE_RAM) + return 0; + return resource_size(&cxlds->part[0].res); +} + static ssize_t ram_size_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -399,6 +407,14 @@ static struct attribute *cxl_memdev_attributes[] = { NULL, }; +static struct cxl_dpa_perf *to_pmem_perf(struct cxl_dev_state *cxlds) +{ + for (int i = 0; i < cxlds->nr_partitions; i++) + if (cxlds->part[i].mode == CXL_PARTMODE_PMEM) + return &cxlds->part[i].perf; + return NULL; +} + static ssize_t pmem_qos_class_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -417,6 +433,13 @@ static struct attribute *cxl_memdev_pmem_attributes[] = { NULL, }; +static struct cxl_dpa_perf *to_ram_perf(struct cxl_dev_state *cxlds) +{ + if (cxlds->part[0].mode != CXL_PARTMODE_RAM) + return NULL; + return &cxlds->part[0].perf; +} + static ssize_t ram_qos_class_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index f218d43dec9f..36a091597066 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -470,58 +470,16 @@ struct cxl_dev_state { struct cxl_mailbox cxl_mbox; }; - -/* Static RAM is only expected at partition 0. */ -static inline const struct resource *to_ram_res(struct cxl_dev_state *cxlds) -{ - if (cxlds->part[0].mode != CXL_PARTMODE_RAM) - return NULL; - return &cxlds->part[0].res; -} - -/* - * Static PMEM may be at partition index 0 when there is no static RAM - * capacity. 
- */ -static inline const struct resource *to_pmem_res(struct cxl_dev_state *cxlds) -{ - for (int i = 0; i < cxlds->nr_partitions; i++) - if (cxlds->part[i].mode == CXL_PARTMODE_PMEM) - return &cxlds->part[i].res; - return NULL; -} - -static inline struct cxl_dpa_perf *to_ram_perf(struct cxl_dev_state *cxlds) -{ - if (cxlds->part[0].mode != CXL_PARTMODE_RAM) - return NULL; - return &cxlds->part[0].perf; -} - -static inline struct cxl_dpa_perf *to_pmem_perf(struct cxl_dev_state *cxlds) -{ - for (int i = 0; i < cxlds->nr_partitions; i++) - if (cxlds->part[i].mode == CXL_PARTMODE_PMEM) - return &cxlds->part[i].perf; - return NULL; -} - -static inline resource_size_t cxl_ram_size(struct cxl_dev_state *cxlds) -{ - const struct resource *res = to_ram_res(cxlds); - - if (!res) - return 0; - return resource_size(res); -} - static inline resource_size_t cxl_pmem_size(struct cxl_dev_state *cxlds) { - const struct resource *res = to_pmem_res(cxlds); - - if (!res) - return 0; - return resource_size(res); + /* + * Static PMEM may be at partition index 0 when there is no static RAM + * capacity. + */ + for (int i = 0; i < cxlds->nr_partitions; i++) + if (cxlds->part[i].mode == CXL_PARTMODE_PMEM) + return resource_size(&cxlds->part[i].res); + return 0; } static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox) From 84973331442a57bac31b40eaef6f264496c6f3fd Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Thu, 23 Jan 2025 08:44:16 +0000 Subject: [PATCH 07/39] efi/cper, cxl: Prefix protocol error struct and function names with cxl_ Rename the protocol error struct from struct cper_sec_prot_err to struct cxl_cper_sec_prot_err and cper_print_prot_err() to cxl_cper_print_prot_err() to maintain naming consistency. No functional changes. Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Fan Ni Reviewed-by: Gregory Price Reviewed-by: Dan Williams Link: https://patch.msgid.link/20250123084421.127697-2-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang --- drivers/firmware/efi/cper.c | 4 ++-- drivers/firmware/efi/cper_cxl.c | 3 ++- drivers/firmware/efi/cper_cxl.h | 5 +++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index b69e68ef3f02..8e5762f7ef2e 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -624,11 +624,11 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata else goto err_section_too_small; } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) { - struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata); + struct cxl_cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata); printk("%ssection_type: CXL Protocol Error\n", newpfx); if (gdata->error_data_length >= sizeof(*prot_err)) - cper_print_prot_err(newpfx, prot_err); + cxl_cper_print_prot_err(newpfx, prot_err); else goto err_section_too_small; } else { diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c index a55771b99a97..cbaabcb7382d 100644 --- a/drivers/firmware/efi/cper_cxl.c +++ b/drivers/firmware/efi/cper_cxl.c @@ -55,7 +55,8 @@ enum { USP, /* CXL Upstream Switch Port */ }; -void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err) +void cxl_cper_print_prot_err(const char *pfx, + const struct cxl_cper_sec_prot_err *prot_err) { if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE) pr_info("%s agent_type: %d, %s\n", pfx, 
prot_err->agent_type, diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h index 86bfcf7909ec..0e3ab0ba17c3 100644 --- a/drivers/firmware/efi/cper_cxl.h +++ b/drivers/firmware/efi/cper_cxl.h @@ -18,7 +18,7 @@ #pragma pack(1) /* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */ -struct cper_sec_prot_err { +struct cxl_cper_sec_prot_err { u64 valid_bits; u8 agent_type; u8 reserved[7]; @@ -61,6 +61,7 @@ struct cper_sec_prot_err { #pragma pack() -void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err); +void cxl_cper_print_prot_err(const char *pfx, + const struct cxl_cper_sec_prot_err *prot_err); #endif //__CPER_CXL_ From 958c3a6706863f9f77b21c90d2428473441cd8a1 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Thu, 23 Jan 2025 08:44:17 +0000 Subject: [PATCH 08/39] efi/cper, cxl: Make definitions and structures global In preparation to add tracepoint support, move protocol error UUID definition to a common location, Also, make struct CXL RAS capability, cxl_cper_sec_prot_err and CPER validation flags global for use across different modules. Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Fan Ni Reviewed-by: Gregory Price Reviewed-by: Dan Williams Link: https://patch.msgid.link/20250123084421.127697-3-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang --- drivers/firmware/efi/cper.c | 1 + drivers/firmware/efi/cper_cxl.c | 35 +-------------- drivers/firmware/efi/cper_cxl.h | 51 --------------------- include/cxl/event.h | 80 +++++++++++++++++++++++++++++++++ include/linux/cper.h | 4 ++ 5 files changed, 86 insertions(+), 85 deletions(-) diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 8e5762f7ef2e..ae1953e2b214 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "cper_cxl.h" /* diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c index cbaabcb7382d..64c0dd27be6e 100644 --- a/drivers/firmware/efi/cper_cxl.c +++ b/drivers/firmware/efi/cper_cxl.c @@ -8,27 +8,9 @@ */ #include +#include #include "cper_cxl.h" -#define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0) -#define PROT_ERR_VALID_AGENT_ADDRESS BIT_ULL(1) -#define PROT_ERR_VALID_DEVICE_ID BIT_ULL(2) -#define PROT_ERR_VALID_SERIAL_NUMBER BIT_ULL(3) -#define PROT_ERR_VALID_CAPABILITY BIT_ULL(4) -#define PROT_ERR_VALID_DVSEC BIT_ULL(5) -#define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6) - -/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */ -struct cxl_ras_capability_regs { - u32 uncor_status; - u32 uncor_mask; - u32 uncor_severity; - u32 cor_status; - u32 cor_mask; - u32 cap_control; - u32 header_log[16]; -}; - static const char * const prot_err_agent_type_strs[] = { "Restricted CXL Device", "Restricted CXL Host Downstream Port", @@ -40,21 +22,6 @@ static const char * const prot_err_agent_type_strs[] = { "CXL Upstream Switch Port", }; -/* - * The layout of the enumeration and the values matches CXL Agent Type - * field in the UEFI 2.10 Section N.2.13, - */ -enum { - RCD, /* Restricted CXL Device */ - RCH_DP, /* Restricted CXL Host Downstream Port */ - DEVICE, /* CXL Device */ - LD, /* CXL Logical Device */ - FMLD, /* CXL Fabric Manager managed Logical Device */ - RP, /* CXL Root Port */ - DSP, /* CXL Downstream Switch Port */ - USP, /* CXL Upstream Switch Port */ -}; - void cxl_cper_print_prot_err(const char *pfx, const struct cxl_cper_sec_prot_err 
*prot_err) { diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h index 0e3ab0ba17c3..5ce1401ee17a 100644 --- a/drivers/firmware/efi/cper_cxl.h +++ b/drivers/firmware/efi/cper_cxl.h @@ -10,57 +10,6 @@ #ifndef LINUX_CPER_CXL_H #define LINUX_CPER_CXL_H -/* CXL Protocol Error Section */ -#define CPER_SEC_CXL_PROT_ERR \ - GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \ - 0x4B, 0x77, 0x10, 0x48) - -#pragma pack(1) - -/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */ -struct cxl_cper_sec_prot_err { - u64 valid_bits; - u8 agent_type; - u8 reserved[7]; - - /* - * Except for RCH Downstream Port, all the remaining CXL Agent - * types are uniquely identified by the PCIe compatible SBDF number. - */ - union { - u64 rcrb_base_addr; - struct { - u8 function; - u8 device; - u8 bus; - u16 segment; - u8 reserved_1[3]; - }; - } agent_addr; - - struct { - u16 vendor_id; - u16 device_id; - u16 subsystem_vendor_id; - u16 subsystem_id; - u8 class_code[2]; - u16 slot; - u8 reserved_1[4]; - } device_id; - - struct { - u32 lower_dw; - u32 upper_dw; - } dev_serial_num; - - u8 capability[60]; - u16 dvsec_len; - u16 err_len; - u8 reserved_2[4]; -}; - -#pragma pack() - void cxl_cper_print_prot_err(const char *pfx, const struct cxl_cper_sec_prot_err *prot_err); diff --git a/include/cxl/event.h b/include/cxl/event.h index 04edd44bd26f..7a6c14c05215 100644 --- a/include/cxl/event.h +++ b/include/cxl/event.h @@ -164,6 +164,86 @@ struct cxl_cper_work_data { struct cxl_cper_event_rec rec; }; +#define PROT_ERR_VALID_AGENT_TYPE BIT_ULL(0) +#define PROT_ERR_VALID_AGENT_ADDRESS BIT_ULL(1) +#define PROT_ERR_VALID_DEVICE_ID BIT_ULL(2) +#define PROT_ERR_VALID_SERIAL_NUMBER BIT_ULL(3) +#define PROT_ERR_VALID_CAPABILITY BIT_ULL(4) +#define PROT_ERR_VALID_DVSEC BIT_ULL(5) +#define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6) + +/* + * The layout of the enumeration and the values matches CXL Agent Type + * field in the UEFI 2.10 Section N.2.13, + */ +enum { + RCD, /* Restricted CXL Device */ + RCH_DP, /* Restricted CXL Host Downstream Port */ + DEVICE, /* CXL Device */ + LD, /* CXL Logical Device */ + FMLD, /* CXL Fabric Manager managed Logical Device */ + RP, /* CXL Root Port */ + DSP, /* CXL Downstream Switch Port */ + USP, /* CXL Upstream Switch Port */ +}; + +#pragma pack(1) + +/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */ +struct cxl_cper_sec_prot_err { + u64 valid_bits; + u8 agent_type; + u8 reserved[7]; + + /* + * Except for RCH Downstream Port, all the remaining CXL Agent + * types are uniquely identified by the PCIe compatible SBDF number. 
+ */ + union { + u64 rcrb_base_addr; + struct { + u8 function; + u8 device; + u8 bus; + u16 segment; + u8 reserved_1[3]; + }; + } agent_addr; + + struct { + u16 vendor_id; + u16 device_id; + u16 subsystem_vendor_id; + u16 subsystem_id; + u8 class_code[2]; + u16 slot; + u8 reserved_1[4]; + } device_id; + + struct { + u32 lower_dw; + u32 upper_dw; + } dev_serial_num; + + u8 capability[60]; + u16 dvsec_len; + u16 err_len; + u8 reserved_2[4]; +}; + +#pragma pack() + +/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */ +struct cxl_ras_capability_regs { + u32 uncor_status; + u32 uncor_mask; + u32 uncor_severity; + u32 cor_status; + u32 cor_mask; + u32 cap_control; + u32 header_log[16]; +}; + #ifdef CONFIG_ACPI_APEI_GHES int cxl_cper_register_work(struct work_struct *work); int cxl_cper_unregister_work(struct work_struct *work); diff --git a/include/linux/cper.h b/include/linux/cper.h index 265b0f8fc0b3..5c6d4d5b9975 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -89,6 +89,10 @@ enum { #define CPER_NOTIFY_DMAR \ GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \ 0x72, 0x2D, 0xEB, 0x41) +/* CXL Protocol Error Section */ +#define CPER_SEC_CXL_PROT_ERR \ + GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \ + 0x4B, 0x77, 0x10, 0x48) /* CXL Event record UUIDs are formatted as GUIDs and reported in section type */ /* From 61eac5f7f6439e8fe99b5fb29406acb0fd7b83c6 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Thu, 23 Jan 2025 08:44:18 +0000 Subject: [PATCH 09/39] efi/cper, cxl: Remove cper_cxl.h Move the declaration of cxl_cper_print_prot_err() to include/linux/cper.h to avoid maintaining a separate header file just for this function declaration. Remove drivers/firmware/efi/cper_cxl.h as its contents have been reorganized. No functional changes. Signed-off-by: Smita Koralahalli Reviewed-by: Ira Weiny Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Fan Ni Reviewed-by: Gregory Price Reviewed-by: Dan Williams Link: https://patch.msgid.link/20250123084421.127697-4-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang --- drivers/firmware/efi/cper.c | 1 - drivers/firmware/efi/cper_cxl.c | 1 - drivers/firmware/efi/cper_cxl.h | 16 ---------------- include/linux/cper.h | 4 ++++ 4 files changed, 4 insertions(+), 18 deletions(-) delete mode 100644 drivers/firmware/efi/cper_cxl.h diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index ae1953e2b214..928409199a1a 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -25,7 +25,6 @@ #include #include #include -#include "cper_cxl.h" /* * CPER record ID need to be unique even after reboot, because record diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c index 64c0dd27be6e..8a7667faf953 100644 --- a/drivers/firmware/efi/cper_cxl.c +++ b/drivers/firmware/efi/cper_cxl.c @@ -9,7 +9,6 @@ #include #include -#include "cper_cxl.h" static const char * const prot_err_agent_type_strs[] = { "Restricted CXL Device", diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h deleted file mode 100644 index 5ce1401ee17a..000000000000 --- a/drivers/firmware/efi/cper_cxl.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * UEFI Common Platform Error Record (CPER) support for CXL Section. - * - * Copyright (C) 2022 Advanced Micro Devices, Inc. 
- * - * Author: Smita Koralahalli - */ - -#ifndef LINUX_CPER_CXL_H -#define LINUX_CPER_CXL_H - -void cxl_cper_print_prot_err(const char *pfx, - const struct cxl_cper_sec_prot_err *prot_err); - -#endif //__CPER_CXL_ diff --git a/include/linux/cper.h b/include/linux/cper.h index 5c6d4d5b9975..0ed60a91eca9 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -605,4 +605,8 @@ void cper_estatus_print(const char *pfx, int cper_estatus_check_header(const struct acpi_hest_generic_status *estatus); int cper_estatus_check(const struct acpi_hest_generic_status *estatus); +struct cxl_cper_sec_prot_err; +void cxl_cper_print_prot_err(const char *pfx, + const struct cxl_cper_sec_prot_err *prot_err); + #endif From 315c2f0b53ba2645062627443a12cea73f3dad9c Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Thu, 23 Jan 2025 08:44:19 +0000 Subject: [PATCH 10/39] acpi/ghes, cper: Recognize and cache CXL Protocol errors Add support in GHES to detect and process CXL CPER Protocol errors, as defined in UEFI v2.10, section N.2.13. Define struct cxl_cper_prot_err_work_data to cache CXL protocol error information, including RAS capabilities and severity, for further handling. These cached CXL CPER records will later be processed by workqueues within the CXL subsystem. Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Ira Weiny Reviewed-by: Tony Luck Reviewed-by: Gregory Price Reviewed-by: Dan Williams Link: https://patch.msgid.link/20250123084421.127697-5-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang --- drivers/acpi/apei/ghes.c | 54 ++++++++++++++++++++++++++++++++++++++++ include/cxl/event.h | 6 +++++ 2 files changed, 60 insertions(+) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index b72772494655..4d725d988c43 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -674,6 +674,56 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata, schedule_work(&entry->work); } +static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err, + int severity) +{ +#ifdef CONFIG_ACPI_APEI_PCIEAER + struct cxl_cper_prot_err_work_data wd; + u8 *dvsec_start, *cap_start; + + if (!(prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS)) { + pr_err_ratelimited("CXL CPER invalid agent type\n"); + return; + } + + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) { + pr_err_ratelimited("CXL CPER invalid protocol error log\n"); + return; + } + + if (prot_err->err_len != sizeof(struct cxl_ras_capability_regs)) { + pr_err_ratelimited("CXL CPER invalid RAS Cap size (%u)\n", + prot_err->err_len); + return; + } + + if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER)) + pr_warn(FW_WARN "CXL CPER no device serial number\n"); + + switch (prot_err->agent_type) { + case RCD: + case DEVICE: + case LD: + case FMLD: + case RP: + case DSP: + case USP: + memcpy(&wd.prot_err, prot_err, sizeof(wd.prot_err)); + + dvsec_start = (u8 *)(prot_err + 1); + cap_start = dvsec_start + prot_err->dvsec_len; + + memcpy(&wd.ras_cap, cap_start, sizeof(wd.ras_cap)); + wd.severity = cper_severity_to_aer(severity); + break; + default: + pr_err_ratelimited("CXL CPER invalid agent type: %d\n", + prot_err->agent_type); + return; + } +#endif +} + /* Room for 8 entries for each of the 4 event log queues */ #define CXL_CPER_FIFO_DEPTH 32 DEFINE_KFIFO(cxl_cper_fifo, struct cxl_cper_work_data, CXL_CPER_FIFO_DEPTH); @@ -777,6 +827,10 @@ static bool ghes_do_proc(struct ghes *ghes, } else if (guid_equal(sec_type, 
&CPER_SEC_PROC_ARM)) {
 queued = ghes_handle_arm_hw_error(gdata, sev, sync);
+ } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
+ struct cxl_cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
+
+ cxl_cper_post_prot_err(prot_err, gdata->error_severity);
 } else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) {
 struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
diff --git a/include/cxl/event.h b/include/cxl/event.h
index 7a6c14c05215..8381a07052d0 100644
--- a/include/cxl/event.h
+++ b/include/cxl/event.h
@@ -244,6 +244,12 @@ struct cxl_ras_capability_regs {
 u32 header_log[16];
 };

+struct cxl_cper_prot_err_work_data {
+ struct cxl_cper_sec_prot_err prot_err;
+ struct cxl_ras_capability_regs ras_cap;
+ int severity;
+};
+
 #ifdef CONFIG_ACPI_APEI_GHES
 int cxl_cper_register_work(struct work_struct *work);
 int cxl_cper_unregister_work(struct work_struct *work);

From 84b25926fa7abb5b634d4af9544f47ab0aabc399 Mon Sep 17 00:00:00 2001
From: Dave Jiang
Date: Wed, 26 Feb 2025 09:21:18 -0700
Subject: [PATCH 11/39] acpi: numa: Add support to enumerate and store
 extended linear address mode

Store the address mode as part of the cache attributes. Export the mode
attribute to sysfs like all other cache attributes.

Link: https://lore.kernel.org/linux-cxl/668333b17e4b2_5639294fd@dwillia2-xfh.jf.intel.com.notmuch/
Reviewed-by: Jonathan Cameron
Reviewed-by: Li Ming
Reviewed-by: Alison Schofield
Link: https://patch.msgid.link/20250226162224.3633792-2-dave.jiang@intel.com
Signed-off-by: Dave Jiang
---
 Documentation/ABI/stable/sysfs-devices-node | 6 ++++++
 drivers/acpi/numa/hmat.c | 5 +++++
 drivers/base/node.c | 2 ++
 include/linux/node.h | 7 +++++++
 4 files changed, 20 insertions(+)

diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index 402af4b2b905..a02707cb7cbc 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -177,6 +177,12 @@ Description:
 The cache write policy: 0 for write-back, 1 for write-through,
 other or unknown.

+What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/address_mode
+Date: March 2025
+Contact: Dave Jiang
+Description:
+ The address mode: 0 for reserved, 1 for extended-linear.
+
 What: /sys/devices/system/node/nodeX/x86/sgx_total_bytes
 Date: November 2021
 Contact: Jarkko Sakkinen
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index bfbb08b1e6af..2630511937f5 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -506,6 +506,11 @@ static __init int hmat_parse_cache(union acpi_subtable_headers *header,
 switch ((attrs & ACPI_HMAT_CACHE_ASSOCIATIVITY) >> 8) {
 case ACPI_HMAT_CA_DIRECT_MAPPED:
 tcache->cache_attrs.indexing = NODE_CACHE_DIRECT_MAP;
+ /* Extended Linear mode is only valid if cache is direct mapped */
+ if (cache->address_mode == ACPI_HMAT_CACHE_MODE_EXTENDED_LINEAR) {
+ tcache->cache_attrs.address_mode =
+ NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR;
+ }
 break;
 case ACPI_HMAT_CA_COMPLEX_CACHE_INDEXING:
 tcache->cache_attrs.indexing = NODE_CACHE_INDEXED;
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 0ea653fa3433..cd13ef287011 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -244,12 +244,14 @@ CACHE_ATTR(size, "%llu")
 CACHE_ATTR(line_size, "%u")
 CACHE_ATTR(indexing, "%u")
 CACHE_ATTR(write_policy, "%u")
+CACHE_ATTR(address_mode, "%#x")

 static struct attribute *cache_attrs[] = {
 &dev_attr_indexing.attr,
 &dev_attr_size.attr,
 &dev_attr_line_size.attr,
 &dev_attr_write_policy.attr,
+ &dev_attr_address_mode.attr,
 NULL,
 };
 ATTRIBUTE_GROUPS(cache);
diff --git a/include/linux/node.h b/include/linux/node.h
index 9a881c2208b3..2b7517892230 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -57,6 +57,11 @@ enum cache_write_policy {
 NODE_CACHE_WRITE_OTHER,
 };

+enum cache_mode {
+ NODE_CACHE_ADDR_MODE_RESERVED,
+ NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR,
+};
+
 /**
 * struct node_cache_attrs - system memory caching attributes
 *
@@ -65,6 +70,7 @@ enum cache_write_policy {
 * @size: Total size of cache in bytes
 * @line_size: Number of bytes fetched on a cache miss
 * @level: The cache hierarchy level
+ * @address_mode: The address mode
 */
 struct node_cache_attrs {
 enum cache_indexing indexing;
@@ -72,6 +78,7 @@ enum cache_write_policy {
 u64 size;
 u16 line_size;
 u8 level;
+ u16 address_mode;
 };

 #ifdef CONFIG_HMEM_REPORTING

From 0ec9849b63338da7883440ed3f52757cd8c847b1 Mon Sep 17 00:00:00 2001
From: Dave Jiang
Date: Wed, 26 Feb 2025 09:21:19 -0700
Subject: [PATCH 12/39] acpi/hmat / cxl: Add extended linear cache support for
 CXL

The current cxl region size only indicates the size of the CXL memory
region without accounting for the extended linear cache size. Retrieve the
cache size from HMAT and append it to the cxl region size for the cxl
region range that matches the SRAT range that has extended linear cache
enabled.

The SRAT defines the whole memory range that includes the extended linear
cache and the CXL memory region. The new HMAT ECN/ECR to the Memory Side
Cache Information Structure defines the size of the extended linear cache
and matches the SRAT Memory Affinity Structure by memory proximity domain.

Add a helper to match the cxl range to the SRAT memory range in order to
retrieve the cache size.

There are several places that check the cxl region range against the
decoder range. Use the new helper to check between the two ranges and
account for the new cache size.
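As a worked illustration of the layout assumption (sizes are arbitrary;
the cache is presumed to sit in front of the CXL range):

    p->res (region):        [0 .. 256G)
    extended linear cache:  [0 .. 128G)      p->cache_size = 128G
    CXL decoder HPA range:  [128G .. 256G)

so a decoder range matches the region when:

    p->res->start + p->cache_size == range->start &&
    p->res->end == range->end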
Reviewed-by: Jonathan Cameron Reviewed-by: Li Ming Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20250226162224.3633792-3-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/acpi/numa/hmat.c | 39 ++++++++++++++++++ drivers/cxl/core/Makefile | 1 + drivers/cxl/core/acpi.c | 11 +++++ drivers/cxl/core/core.h | 3 ++ drivers/cxl/core/region.c | 86 +++++++++++++++++++++++++++++++++++---- drivers/cxl/cxl.h | 2 + include/linux/acpi.h | 11 +++++ tools/testing/cxl/Kbuild | 1 + 8 files changed, 147 insertions(+), 7 deletions(-) create mode 100644 drivers/cxl/core/acpi.c diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 2630511937f5..9d9052258e92 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -108,6 +108,45 @@ static struct memory_target *find_mem_target(unsigned int mem_pxm) return NULL; } +/** + * hmat_get_extended_linear_cache_size - Retrieve the extended linear cache size + * @backing_res: resource from the backing media + * @nid: node id for the memory region + * @cache_size: (Output) size of extended linear cache. + * + * Return: 0 on success. Errno on failure. + * + */ +int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid, + resource_size_t *cache_size) +{ + unsigned int pxm = node_to_pxm(nid); + struct memory_target *target; + struct target_cache *tcache; + struct resource *res; + + target = find_mem_target(pxm); + if (!target) + return -ENOENT; + + list_for_each_entry(tcache, &target->caches, node) { + if (tcache->cache_attrs.address_mode != + NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR) + continue; + + res = &target->memregions; + if (!resource_contains(res, backing_res)) + continue; + + *cache_size = tcache->cache_attrs.size; + return 0; + } + + *cache_size = 0; + return 0; +} +EXPORT_SYMBOL_NS_GPL(hmat_get_extended_linear_cache_size, "CXL"); + static struct memory_target *acpi_find_genport_target(u32 uid) { struct memory_target *target; diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index 9259bcc6773c..1a0c9c6ca818 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -14,5 +14,6 @@ cxl_core-y += pci.o cxl_core-y += hdm.o cxl_core-y += pmu.o cxl_core-y += cdat.o +cxl_core-y += acpi.o cxl_core-$(CONFIG_TRACING) += trace.o cxl_core-$(CONFIG_CXL_REGION) += region.o diff --git a/drivers/cxl/core/acpi.c b/drivers/cxl/core/acpi.c new file mode 100644 index 000000000000..f13b4dae6ac5 --- /dev/null +++ b/drivers/cxl/core/acpi.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2024 Intel Corporation. All rights reserved. 
*/ +#include +#include "cxl.h" +#include "core.h" + +int cxl_acpi_get_extended_linear_cache_size(struct resource *backing_res, + int nid, resource_size_t *size) +{ + return hmat_get_extended_linear_cache_size(backing_res, nid, size); +} diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 800466f96a68..0fb779b612d1 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -115,4 +115,7 @@ bool cxl_need_node_perf_attrs_update(int nid); int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, struct access_coordinate *c); +int cxl_acpi_get_extended_linear_cache_size(struct resource *backing_res, + int nid, resource_size_t *size); + #endif /* __CXL_CORE_H__ */ diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index e8d11a988fd9..69af651a8f46 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -824,6 +824,21 @@ static int match_free_decoder(struct device *dev, const void *data) return 1; } +static bool region_res_match_cxl_range(const struct cxl_region_params *p, + struct range *range) +{ + if (!p->res) + return false; + + /* + * If an extended linear cache region then the CXL range is assumed + * to be fronted by the DRAM range in current known implementation. + * This assumption will be made until a variant implementation exists. + */ + return p->res->start + p->cache_size == range->start && + p->res->end == range->end; +} + static int match_auto_decoder(struct device *dev, const void *data) { const struct cxl_region_params *p = data; @@ -836,7 +851,7 @@ static int match_auto_decoder(struct device *dev, const void *data) cxld = to_cxl_decoder(dev); r = &cxld->hpa_range; - if (p->res && p->res->start == r->start && p->res->end == r->end) + if (region_res_match_cxl_range(p, r)) return 1; return 0; @@ -1424,8 +1439,7 @@ static int cxl_port_setup_targets(struct cxl_port *port, if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) { if (cxld->interleave_ways != iw || cxld->interleave_granularity != ig || - cxld->hpa_range.start != p->res->start || - cxld->hpa_range.end != p->res->end || + !region_res_match_cxl_range(p, &cxld->hpa_range) || ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) { dev_err(&cxlr->dev, "%s:%s %s expected iw: %d ig: %d %pr\n", @@ -1951,13 +1965,13 @@ static int cxl_region_attach(struct cxl_region *cxlr, return -ENXIO; } - if (resource_size(cxled->dpa_res) * p->interleave_ways != + if (resource_size(cxled->dpa_res) * p->interleave_ways + p->cache_size != resource_size(p->res)) { dev_dbg(&cxlr->dev, - "%s:%s: decoder-size-%#llx * ways-%d != region-size-%#llx\n", + "%s:%s-size-%#llx * ways-%d + cache-%#llx != region-size-%#llx\n", dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), (u64)resource_size(cxled->dpa_res), p->interleave_ways, - (u64)resource_size(p->res)); + (u64)p->cache_size, (u64)resource_size(p->res)); return -EINVAL; } @@ -2921,7 +2935,7 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, hpa_offset |= dpa_offset & GENMASK_ULL(eig + 7, 0); /* Apply the hpa_offset to the region base address */ - hpa = hpa_offset + p->res->start; + hpa = hpa_offset + p->res->start + p->cache_size; /* Root decoder translation overrides typical modulo decode */ if (cxlrd->hpa_to_spa) @@ -3224,6 +3238,52 @@ static int match_region_by_range(struct device *dev, const void *data) return rc; } +static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr, + struct resource *res) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_region_params *p = 
&cxlr->params; + int nid = phys_to_target_node(res->start); + resource_size_t size, cache_size, start; + int rc; + + size = resource_size(res); + if (!size) + return -EINVAL; + + rc = cxl_acpi_get_extended_linear_cache_size(res, nid, &cache_size); + if (rc) + return rc; + + if (!cache_size) + return 0; + + if (size != cache_size) { + dev_warn(&cxlr->dev, + "Extended Linear Cache size %#lld != CXL size %#lld. No Support!", + cache_size, size); + return -EOPNOTSUPP; + } + + /* + * Move the start of the range to where the cache range starts. The + * implementation assumes that the cache range is in front of the + * CXL range. This is not dictated by the HMAT spec but is how the + * current known implementation is configured. + * + * The cache range is expected to be within the CFMWS. The adjusted + * res->start should not be less than cxlrd->res->start. + */ + start = res->start - cache_size; + if (start < cxlrd->res->start) + return -ENXIO; + + res->start = start; + p->cache_size = cache_size; + + return 0; +} + /* Establish an empty region covering the given HPA range */ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, struct cxl_endpoint_decoder *cxled) @@ -3270,6 +3330,18 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, *res = DEFINE_RES_MEM_NAMED(hpa->start, range_len(hpa), dev_name(&cxlr->dev)); + + rc = cxl_extended_linear_cache_resize(cxlr, res); + if (rc) { + /* + * Failing to support extended linear cache region resize does not + * prevent the region from functioning. Only causes cxl list showing + * incorrect region size. + */ + dev_warn(cxlmd->dev.parent, + "Extended linear cache calculation failed.\n"); + } + rc = insert_resource(cxlrd->res, res); if (rc) { /* diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index bbbaa0d0a670..7ee96867ac73 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -493,6 +493,7 @@ enum cxl_config_state { * @res: allocated iomem capacity for this region * @targets: active ordered targets in current decoder configuration * @nr_targets: number of targets + * @cache_size: extended linear cache size if exists, otherwise zero. 
* * State transitions are protected by the cxl_region_rwsem */ @@ -504,6 +505,7 @@ struct cxl_region_params { struct resource *res; struct cxl_endpoint_decoder *targets[CXL_DECODER_MAX_INTERLEAVE]; int nr_targets; + resource_size_t cache_size; }; /* diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4e495b29c640..cbd933504dbf 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1095,6 +1095,17 @@ static inline acpi_handle acpi_get_processor_handle(int cpu) #endif /* !CONFIG_ACPI */ +#ifdef CONFIG_ACPI_HMAT +int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid, + resource_size_t *size); +#else +static inline int hmat_get_extended_linear_cache_size(struct resource *backing_res, + int nid, resource_size_t *size) +{ + return -EOPNOTSUPP; +} +#endif + extern void arch_post_acpi_subsys_init(void); #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index b1256fee3567..1ae13987a8a2 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -61,6 +61,7 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o cxl_core-y += $(CXL_CORE_SRC)/hdm.o cxl_core-y += $(CXL_CORE_SRC)/pmu.o cxl_core-y += $(CXL_CORE_SRC)/cdat.o +cxl_core-y += $(CXL_CORE_SRC)/acpi.o cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o cxl_core-y += config_check.o From 8c520c5f1e767ed6b47feefca1ed32a097e9b707 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 26 Feb 2025 09:21:20 -0700 Subject: [PATCH 13/39] cxl: Add extended linear cache address alias emission for cxl events Add the aliased address of extended linear cache when emitting event trace for poison, DRAM and general media of CXL events. Reviewed-by: Jonathan Cameron Reviewed-by: Li Ming Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20250226162224.3633792-4-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/mbox.c | 14 ++++++++++---- drivers/cxl/core/region.c | 2 +- drivers/cxl/core/trace.h | 34 ++++++++++++++++++++++++---------- 3 files changed, 35 insertions(+), 15 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 548564c770c0..f26b96dd7410 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -871,7 +871,7 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd, } if (trace_cxl_general_media_enabled() || trace_cxl_dram_enabled()) { - u64 dpa, hpa = ULLONG_MAX; + u64 dpa, hpa = ULLONG_MAX, hpa_alias = ULLONG_MAX; struct cxl_region *cxlr; /* @@ -884,14 +884,20 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd, dpa = le64_to_cpu(evt->media_hdr.phys_addr) & CXL_DPA_MASK; cxlr = cxl_dpa_to_region(cxlmd, dpa); - if (cxlr) + if (cxlr) { + u64 cache_size = cxlr->params.cache_size; + hpa = cxl_dpa_to_hpa(cxlr, cxlmd, dpa); + if (cache_size) + hpa_alias = hpa - cache_size; + } if (event_type == CXL_CPER_EVENT_GEN_MEDIA) trace_cxl_general_media(cxlmd, type, cxlr, hpa, - &evt->gen_media); + hpa_alias, &evt->gen_media); else if (event_type == CXL_CPER_EVENT_DRAM) - trace_cxl_dram(cxlmd, type, cxlr, hpa, &evt->dram); + trace_cxl_dram(cxlmd, type, cxlr, hpa, hpa_alias, + &evt->dram); } } EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, "CXL"); diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 69af651a8f46..a20ef3f10fef 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3260,7 +3260,7 @@ static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr, if (size != cache_size) { 
dev_warn(&cxlr->dev, - "Extended Linear Cache size %#lld != CXL size %#lld. No Support!", + "Extended Linear Cache size %lld != CXL size %lld. No Support!", cache_size, size); return -EOPNOTSUPP; } diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index cea706b683b5..e3f842dcdf1d 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -392,9 +392,10 @@ TRACE_EVENT(cxl_generic_event, TRACE_EVENT(cxl_general_media, TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, - struct cxl_region *cxlr, u64 hpa, struct cxl_event_gen_media *rec), + struct cxl_region *cxlr, u64 hpa, u64 hpa_alias0, + struct cxl_event_gen_media *rec), - TP_ARGS(cxlmd, log, cxlr, hpa, rec), + TP_ARGS(cxlmd, log, cxlr, hpa, hpa_alias0, rec), TP_STRUCT__entry( CXL_EVT_TP_entry @@ -408,6 +409,7 @@ TRACE_EVENT(cxl_general_media, __array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE) /* Following are out of order to pack trace record */ __field(u64, hpa) + __field(u64, hpa_alias0) __field_struct(uuid_t, region_uuid) __field(u16, validity_flags) __field(u8, rank) @@ -438,6 +440,7 @@ TRACE_EVENT(cxl_general_media, CXL_EVENT_GEN_MED_COMP_ID_SIZE); __entry->validity_flags = get_unaligned_le16(&rec->media_hdr.validity_flags); __entry->hpa = hpa; + __entry->hpa_alias0 = hpa_alias0; if (cxlr) { __assign_str(region_name); uuid_copy(&__entry->region_uuid, &cxlr->params.uuid); @@ -455,7 +458,7 @@ TRACE_EVENT(cxl_general_media, "device=%x validity_flags='%s' " \ "comp_id=%s comp_id_pldm_valid_flags='%s' " \ "pldm_entity_id=%s pldm_resource_id=%s " \ - "hpa=%llx region=%s region_uuid=%pUb " \ + "hpa=%llx hpa_alias0=%llx region=%s region_uuid=%pUb " \ "cme_threshold_ev_flags='%s' cme_count=%u", __entry->dpa, show_dpa_flags(__entry->dpa_flags), show_event_desc_flags(__entry->descriptor), @@ -470,7 +473,7 @@ TRACE_EVENT(cxl_general_media, CXL_GMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id), show_pldm_resource_id(__entry->validity_flags, CXL_GMER_VALID_COMPONENT, CXL_GMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id), - __entry->hpa, __get_str(region_name), &__entry->region_uuid, + __entry->hpa, __entry->hpa_alias0, __get_str(region_name), &__entry->region_uuid, show_cme_threshold_ev_flags(__entry->cme_threshold_ev_flags), __entry->cme_count ) ); @@ -529,9 +532,10 @@ TRACE_EVENT(cxl_general_media, TRACE_EVENT(cxl_dram, TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, - struct cxl_region *cxlr, u64 hpa, struct cxl_event_dram *rec), + struct cxl_region *cxlr, u64 hpa, u64 hpa_alias0, + struct cxl_event_dram *rec), - TP_ARGS(cxlmd, log, cxlr, hpa, rec), + TP_ARGS(cxlmd, log, cxlr, hpa, hpa_alias0, rec), TP_STRUCT__entry( CXL_EVT_TP_entry @@ -547,6 +551,7 @@ TRACE_EVENT(cxl_dram, __field(u32, row) __array(u8, cor_mask, CXL_EVENT_DER_CORRECTION_MASK_SIZE) __field(u64, hpa) + __field(u64, hpa_alias0) __field_struct(uuid_t, region_uuid) __field(u8, rank) /* Out of order to pack trace record */ __field(u8, bank_group) /* Out of order to pack trace record */ @@ -584,6 +589,7 @@ TRACE_EVENT(cxl_dram, memcpy(__entry->cor_mask, &rec->correction_mask, CXL_EVENT_DER_CORRECTION_MASK_SIZE); __entry->hpa = hpa; + __entry->hpa_alias0 = hpa_alias0; if (cxlr) { __assign_str(region_name); uuid_copy(&__entry->region_uuid, &cxlr->params.uuid); @@ -604,7 +610,7 @@ TRACE_EVENT(cxl_dram, "validity_flags='%s' " \ "comp_id=%s comp_id_pldm_valid_flags='%s' " \ "pldm_entity_id=%s pldm_resource_id=%s " \ - "hpa=%llx region=%s region_uuid=%pUb " \ + "hpa=%llx hpa_alias0=%llx region=%s region_uuid=%pUb " \ 
"sub_channel=%u cme_threshold_ev_flags='%s' cvme_count=%u", __entry->dpa, show_dpa_flags(__entry->dpa_flags), show_event_desc_flags(__entry->descriptor), @@ -622,7 +628,7 @@ TRACE_EVENT(cxl_dram, CXL_DER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id), show_pldm_resource_id(__entry->validity_flags, CXL_DER_VALID_COMPONENT, CXL_DER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id), - __entry->hpa, __get_str(region_name), &__entry->region_uuid, + __entry->hpa, __entry->hpa_alias0, __get_str(region_name), &__entry->region_uuid, __entry->sub_channel, show_cme_threshold_ev_flags(__entry->cme_threshold_ev_flags), __entry->cvme_count ) @@ -870,6 +876,7 @@ TRACE_EVENT(cxl_poison, __string(region, cxlr ? dev_name(&cxlr->dev) : "") __field(u64, overflow_ts) __field(u64, hpa) + __field(u64, hpa_alias0) __field(u64, dpa) __field(u32, dpa_length) __array(char, uuid, 16) @@ -892,16 +899,22 @@ TRACE_EVENT(cxl_poison, memcpy(__entry->uuid, &cxlr->params.uuid, 16); __entry->hpa = cxl_dpa_to_hpa(cxlr, cxlmd, __entry->dpa); + if (__entry->hpa != ULLONG_MAX && cxlr->params.cache_size) + __entry->hpa_alias0 = __entry->hpa + + cxlr->params.cache_size; + else + __entry->hpa_alias0 = ULLONG_MAX; } else { __assign_str(region); memset(__entry->uuid, 0, 16); __entry->hpa = ULLONG_MAX; + __entry->hpa_alias0 = ULLONG_MAX; } ), TP_printk("memdev=%s host=%s serial=%lld trace_type=%s region=%s " \ - "region_uuid=%pU hpa=0x%llx dpa=0x%llx dpa_length=0x%x " \ - "source=%s flags=%s overflow_time=%llu", + "region_uuid=%pU hpa=0x%llx hpa_alias0=0x%llx dpa=0x%llx " \ + "dpa_length=0x%x source=%s flags=%s overflow_time=%llu", __get_str(memdev), __get_str(host), __entry->serial, @@ -909,6 +922,7 @@ TRACE_EVENT(cxl_poison, __get_str(region), __entry->uuid, __entry->hpa, + __entry->hpa_alias0, __entry->dpa, __entry->dpa_length, show_poison_source(__entry->source), From 516e5bd0b6bf4ae1ad072df637b428a737c3c870 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 26 Feb 2025 09:21:21 -0700 Subject: [PATCH 14/39] cxl: Add mce notifier to emit aliased address for extended linear cache Below is a setup with extended linear cache configuration with an example layout of memory region shown below presented as a single memory region consists of 256G memory where there's 128G of DRAM and 128G of CXL memory. The kernel sees a region of total 256G of system memory. 128G DRAM 128G CXL memory |-----------------------------------|-------------------------------------| Data resides in either DRAM or far memory (FM) with no replication. Hot data is swapped into DRAM by the hardware behind the scenes. When error is detected in one location, it is possible that error also resides in the aliased location. Therefore when a memory location that is flagged by MCE is part of the special region, the aliased memory location needs to be offlined as well. Add an mce notify callback to identify if the MCE address location is part of an extended linear cache region and handle accordingly. Added symbol export to set_mce_nospec() in x86 code in order to call set_mce_nospec() from the CXL MCE notify callback. 
Link: https://lore.kernel.org/linux-cxl/668333b17e4b2_5639294fd@dwillia2-xfh.jf.intel.com.notmuch/ Reviewed-by: Jonathan Cameron Reviewed-by: Li Ming Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20250226162224.3633792-5-dave.jiang@intel.com Signed-off-by: Dave Jiang --- arch/x86/mm/pat/set_memory.c | 1 + drivers/cxl/Kconfig | 4 +++ drivers/cxl/core/Makefile | 1 + drivers/cxl/core/mbox.c | 6 ++++ drivers/cxl/core/mce.c | 65 ++++++++++++++++++++++++++++++++++++ drivers/cxl/core/mce.h | 20 +++++++++++ drivers/cxl/core/region.c | 28 ++++++++++++++++ drivers/cxl/cxl.h | 6 ++++ drivers/cxl/cxlmem.h | 2 ++ tools/testing/cxl/Kbuild | 1 + 10 files changed, 134 insertions(+) create mode 100644 drivers/cxl/core/mce.c create mode 100644 drivers/cxl/core/mce.h diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index ef4514d64c05..255a3d176956 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2081,6 +2081,7 @@ int set_mce_nospec(unsigned long pfn) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); return rc; } +EXPORT_SYMBOL_GPL(set_mce_nospec); /* Restore full speculative operation to the pfn. */ int clear_mce_nospec(unsigned long pfn) diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index 876469e23f7a..d1c91dacae56 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -146,4 +146,8 @@ config CXL_REGION_INVALIDATION_TEST If unsure, or if this kernel is meant for production environments, say N. +config CXL_MCE + def_bool y + depends on X86_MCE && MEMORY_FAILURE + endif diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index 1a0c9c6ca818..61c9332b3582 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -17,3 +17,4 @@ cxl_core-y += cdat.o cxl_core-y += acpi.o cxl_core-$(CONFIG_TRACING) += trace.o cxl_core-$(CONFIG_CXL_REGION) += region.o +cxl_core-$(CONFIG_CXL_MCE) += mce.o diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index f26b96dd7410..c06f19a729e8 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -11,6 +11,7 @@ #include "core.h" #include "trace.h" +#include "mce.h" static bool cxl_raw_allow_all; @@ -1444,6 +1445,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_mailbox_init, "CXL"); struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev) { struct cxl_memdev_state *mds; + int rc; mds = devm_kzalloc(dev, sizeof(*mds), GFP_KERNEL); if (!mds) { @@ -1459,6 +1461,10 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev) mds->ram_perf.qos_class = CXL_QOS_CLASS_INVALID; mds->pmem_perf.qos_class = CXL_QOS_CLASS_INVALID; + rc = devm_cxl_register_mce_notifier(dev, &mds->mce_notifier); + if (rc) + return ERR_PTR(rc); + return mds; } EXPORT_SYMBOL_NS_GPL(cxl_memdev_state_create, "CXL"); diff --git a/drivers/cxl/core/mce.c b/drivers/cxl/core/mce.c new file mode 100644 index 000000000000..ff8d078c6ca1 --- /dev/null +++ b/drivers/cxl/core/mce.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2024 Intel Corporation. All rights reserved. 
*/
+#include <asm/mce.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+#include <linux/set_memory.h>
+#include <cxlmem.h>
+#include "mce.h"
+
+static int cxl_handle_mce(struct notifier_block *nb, unsigned long val,
+			  void *data)
+{
+	struct cxl_memdev_state *mds = container_of(nb, struct cxl_memdev_state,
+						    mce_notifier);
+	struct cxl_memdev *cxlmd = mds->cxlds.cxlmd;
+	struct cxl_port *endpoint = cxlmd->endpoint;
+	struct mce *mce = data;
+	u64 spa, spa_alias;
+	unsigned long pfn;
+
+	if (!mce || !mce_usable_address(mce))
+		return NOTIFY_DONE;
+
+	if (!endpoint)
+		return NOTIFY_DONE;
+
+	spa = mce->addr & MCI_ADDR_PHYSADDR;
+
+	pfn = spa >> PAGE_SHIFT;
+	if (!pfn_valid(pfn))
+		return NOTIFY_DONE;
+
+	spa_alias = cxl_port_get_spa_cache_alias(endpoint, spa);
+	if (spa_alias == ~0ULL)
+		return NOTIFY_DONE;
+
+	pfn = spa_alias >> PAGE_SHIFT;
+
+	/*
+	 * Take down the aliased memory page. The original memory page flagged
+	 * by the MCE will be taken care of by the standard MCE handler.
+	 */
+	dev_emerg(mds->cxlds.dev, "Offlining aliased SPA address0: %#llx\n",
+		  spa_alias);
+	if (!memory_failure(pfn, 0))
+		set_mce_nospec(pfn);
+
+	return NOTIFY_OK;
+}
+
+static void cxl_unregister_mce_notifier(void *mce_notifier)
+{
+	mce_unregister_decode_chain(mce_notifier);
+}
+
+int devm_cxl_register_mce_notifier(struct device *dev,
+				   struct notifier_block *mce_notifier)
+{
+	mce_notifier->notifier_call = cxl_handle_mce;
+	mce_notifier->priority = MCE_PRIO_UC;
+	mce_register_decode_chain(mce_notifier);
+
+	return devm_add_action_or_reset(dev, cxl_unregister_mce_notifier,
+					mce_notifier);
+}
diff --git a/drivers/cxl/core/mce.h b/drivers/cxl/core/mce.h
new file mode 100644
index 000000000000..ace73424eeb6
--- /dev/null
+++ b/drivers/cxl/core/mce.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
+#ifndef _CXL_CORE_MCE_H_
+#define _CXL_CORE_MCE_H_
+
+#include <linux/notifier.h>
+
+#ifdef CONFIG_CXL_MCE
+int devm_cxl_register_mce_notifier(struct device *dev,
+				   struct notifier_block *mce_notifier);
+#else
+static inline int
+devm_cxl_register_mce_notifier(struct device *dev,
+			       struct notifier_block *mce_notifier)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
+#endif
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index a20ef3f10fef..c2b4162aee42 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -3447,6 +3447,34 @@ out:
 }
 EXPORT_SYMBOL_NS_GPL(cxl_add_to_region, "CXL");
 
+u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa)
+{
+	struct cxl_region_ref *iter;
+	unsigned long index;
+
+	if (!endpoint)
+		return ~0ULL;
+
+	guard(rwsem_write)(&cxl_region_rwsem);
+
+	xa_for_each(&endpoint->regions, index, iter) {
+		struct cxl_region_params *p = &iter->region->params;
+
+		if (p->res->start <= spa && spa <= p->res->end) {
+			if (!p->cache_size)
+				return ~0ULL;
+
+			if (spa > p->res->start + p->cache_size)
+				return spa - p->cache_size;
+
+			return spa + p->cache_size;
+		}
+	}
+
+	return ~0ULL;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_port_get_spa_cache_alias, "CXL");
+
 static int is_system_ram(struct resource *res, void *arg)
 {
 	struct cxl_region *cxlr = arg;
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 7ee96867ac73..4785cff5209f 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -877,6 +877,7 @@ struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev);
 int cxl_add_to_region(struct cxl_port *root,
 		      struct cxl_endpoint_decoder *cxled);
 struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
+u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
 #else
 static inline bool is_cxl_pmem_region(struct device *dev)
 {
@@ -895,6 +896,11 @@ static inline struct cxl_dax_region *to_cxl_dax_region(struct device *dev)
 {
 	return NULL;
 }
+static inline u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint,
+					       u64 spa)
+{
+	return 0;
+}
 #endif
 void cxl_endpoint_parse_cdat(struct cxl_port *port);
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 2a25d1957ddb..55752cbf408c 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -477,6 +477,7 @@ static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
  * @poison: poison driver state info
  * @security: security driver state info
  * @fw: firmware upload / activation state
+ * @mce_notifier: MCE notifier
  *
  * See CXL 3.0 8.2.9.8.2 Capacity Configuration and Label Storage for
  * details on capacity parameters.
@@ -503,6 +504,7 @@ struct cxl_memdev_state { struct cxl_poison_state poison; struct cxl_security_state security; struct cxl_fw_state fw; + struct notifier_block mce_notifier; }; static inline struct cxl_memdev_state * diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 1ae13987a8a2..f625eb2d2dc5 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -64,6 +64,7 @@ cxl_core-y += $(CXL_CORE_SRC)/cdat.o cxl_core-y += $(CXL_CORE_SRC)/acpi.o cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o +cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o cxl_core-y += config_check.o cxl_core-y += cxl_core_test.o cxl_core-y += cxl_core_exports.o From 36f257e3b0ba904f5a4e7fa8dafaa60e88cdd28c Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Mon, 10 Mar 2025 22:38:38 +0000 Subject: [PATCH 15/39] acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors When PCIe AER is in FW-First, OS should process CXL Protocol errors from CPER records. Introduce support for handling and logging CXL Protocol errors. The defined trace events cxl_aer_uncorrectable_error and cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse them to trace FW-First Protocol errors. Since the CXL code is required to be called from process context and GHES is in interrupt context, use workqueues for processing. Similar to CXL CPER event handling, use kfifo to handle errors as it simplifies queue processing by providing lock free fifo operations. Add the ability for the CXL sub-system to register a workqueue to process CXL CPER protocol errors. [DJ: return cxl_cper_register_prot_err_work() directly in cxl_ras_init()] Signed-off-by: Smita Koralahalli Reviewed-by: Li Ming Reviewed-by: Alison Schofield Reviewed-by: Ira Weiny Reviewed-by: Tony Luck Link: https://patch.msgid.link/20250310223839.31342-2-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang --- drivers/acpi/apei/ghes.c | 49 +++++++++++++++++++++++ drivers/cxl/core/Makefile | 1 + drivers/cxl/core/core.h | 3 ++ drivers/cxl/core/port.c | 7 ++++ drivers/cxl/core/ras.c | 82 +++++++++++++++++++++++++++++++++++++++ include/cxl/event.h | 15 +++++++ tools/testing/cxl/Kbuild | 1 + 7 files changed, 158 insertions(+) create mode 100644 drivers/cxl/core/ras.c diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 4d725d988c43..289e365f84b2 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -674,6 +674,15 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata, schedule_work(&entry->work); } +/* Room for 8 entries */ +#define CXL_CPER_PROT_ERR_FIFO_DEPTH 8 +static DEFINE_KFIFO(cxl_cper_prot_err_fifo, struct cxl_cper_prot_err_work_data, + CXL_CPER_PROT_ERR_FIFO_DEPTH); + +/* Synchronize schedule_work() with cxl_cper_prot_err_work changes */ +static DEFINE_SPINLOCK(cxl_cper_prot_err_work_lock); +struct work_struct *cxl_cper_prot_err_work; + static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err, int severity) { @@ -700,6 +709,11 @@ static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err, if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER)) pr_warn(FW_WARN "CXL CPER no device serial number\n"); + guard(spinlock_irqsave)(&cxl_cper_prot_err_work_lock); + + if (!cxl_cper_prot_err_work) + return; + switch (prot_err->agent_type) { case RCD: case DEVICE: @@ -721,9 +735,44 @@ static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err, prot_err->agent_type); return; 
} + + if (!kfifo_put(&cxl_cper_prot_err_fifo, wd)) { + pr_err_ratelimited("CXL CPER kfifo overflow\n"); + return; + } + + schedule_work(cxl_cper_prot_err_work); #endif } +int cxl_cper_register_prot_err_work(struct work_struct *work) +{ + if (cxl_cper_prot_err_work) + return -EINVAL; + + guard(spinlock)(&cxl_cper_prot_err_work_lock); + cxl_cper_prot_err_work = work; + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_cper_register_prot_err_work, "CXL"); + +int cxl_cper_unregister_prot_err_work(struct work_struct *work) +{ + if (cxl_cper_prot_err_work != work) + return -EINVAL; + + guard(spinlock)(&cxl_cper_prot_err_work_lock); + cxl_cper_prot_err_work = NULL; + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_prot_err_work, "CXL"); + +int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd) +{ + return kfifo_get(&cxl_cper_prot_err_fifo, wd); +} +EXPORT_SYMBOL_NS_GPL(cxl_cper_prot_err_kfifo_get, "CXL"); + /* Room for 8 entries for each of the 4 event log queues */ #define CXL_CPER_FIFO_DEPTH 32 DEFINE_KFIFO(cxl_cper_fifo, struct cxl_cper_work_data, CXL_CPER_FIFO_DEPTH); diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index 9259bcc6773c..ba5f0916d379 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -14,5 +14,6 @@ cxl_core-y += pci.o cxl_core-y += hdm.o cxl_core-y += pmu.o cxl_core-y += cdat.o +cxl_core-y += ras.o cxl_core-$(CONFIG_TRACING) += trace.o cxl_core-$(CONFIG_CXL_REGION) += region.o diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 800466f96a68..c87dbfcc9403 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -115,4 +115,7 @@ bool cxl_need_node_perf_attrs_update(int nid); int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, struct access_coordinate *c); +int cxl_ras_init(void); +void cxl_ras_exit(void); + #endif /* __CXL_CORE_H__ */ diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 78a5c2c25982..7ed6e8f374af 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2339,8 +2339,14 @@ static __init int cxl_core_init(void) if (rc) goto err_region; + rc = cxl_ras_init(); + if (rc) + goto err_ras; + return 0; +err_ras: + cxl_region_exit(); err_region: bus_unregister(&cxl_bus_type); err_bus: @@ -2352,6 +2358,7 @@ err_wq: static void cxl_core_exit(void) { + cxl_ras_exit(); cxl_region_exit(); bus_unregister(&cxl_bus_type); destroy_workqueue(cxl_bus_wq); diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c new file mode 100644 index 000000000000..91273af6ccbc --- /dev/null +++ b/drivers/cxl/core/ras.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2025 AMD Corporation. All rights reserved. 
*/
+
+#include <linux/pci.h>
+#include <linux/aer.h>
+#include <cxl/event.h>
+#include <cxlmem.h>
+#include "trace.h"
+
+static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
+					 struct cxl_ras_capability_regs ras_cap)
+{
+	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
+	struct cxl_dev_state *cxlds;
+
+	cxlds = pci_get_drvdata(pdev);
+	if (!cxlds)
+		return;
+
+	trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+}
+
+static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
+					   struct cxl_ras_capability_regs ras_cap)
+{
+	u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
+	struct cxl_dev_state *cxlds;
+	u32 fe;
+
+	cxlds = pci_get_drvdata(pdev);
+	if (!cxlds)
+		return;
+
+	if (hweight32(status) > 1)
+		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+				   ras_cap.cap_control));
+	else
+		fe = status;
+
+	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
+					  ras_cap.header_log);
+}
+
+static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
+{
+	unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
+				       data->prot_err.agent_addr.function);
+	struct pci_dev *pdev __free(pci_dev_put) =
+		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
+					    data->prot_err.agent_addr.bus,
+					    devfn);
+
+	if (!pdev)
+		return;
+
+	guard(device)(&pdev->dev);
+
+	if (data->severity == AER_CORRECTABLE)
+		cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
+	else
+		cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
+}
+
+static void cxl_cper_prot_err_work_fn(struct work_struct *work)
+{
+	struct cxl_cper_prot_err_work_data wd;
+
+	while (cxl_cper_prot_err_kfifo_get(&wd))
+		cxl_cper_handle_prot_err(&wd);
+}
+static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
+
+int cxl_ras_init(void)
+{
+	return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
+}
+
+void cxl_ras_exit(void)
+{
+	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
+	cancel_work_sync(&cxl_cper_prot_err_work);
+}
diff --git a/include/cxl/event.h b/include/cxl/event.h
index 8381a07052d0..f9ae1796da85 100644
--- a/include/cxl/event.h
+++ b/include/cxl/event.h
@@ -254,6 +254,9 @@ struct cxl_cper_prot_err_work_data {
 int cxl_cper_register_work(struct work_struct *work);
 int cxl_cper_unregister_work(struct work_struct *work);
 int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd);
+int cxl_cper_register_prot_err_work(struct work_struct *work);
+int cxl_cper_unregister_prot_err_work(struct work_struct *work);
+int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd);
 #else
 static inline int cxl_cper_register_work(struct work_struct *work)
 {
@@ -268,6 +271,18 @@ static inline int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
 {
 	return 0;
 }
+static inline int cxl_cper_register_prot_err_work(struct work_struct *work)
+{
+	return 0;
+}
+static inline int cxl_cper_unregister_prot_err_work(struct work_struct *work)
+{
+	return 0;
+}
+static inline int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
+{
+	return 0;
+}
 #endif
 
 #endif /* _LINUX_CXL_EVENT_H */
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index b1256fee3567..3d71447c0bd8 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -61,6 +61,7 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o
 cxl_core-y += $(CXL_CORE_SRC)/hdm.o
 cxl_core-y += $(CXL_CORE_SRC)/pmu.o
 cxl_core-y += $(CXL_CORE_SRC)/cdat.o
+cxl_core-y += $(CXL_CORE_SRC)/ras.o
 cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
 cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
 cxl_core-y += config_check.o

From 02f4f0177d8e7647016fc29f11c1a7bb75bc2182 Mon Sep 17 00:00:00 2001
From: Smita Koralahalli
Date: Mon, 10 Mar 2025 22:38:39 +0000
Subject: [PATCH 16/39] cxl/pci: Add trace logging for CXL PCIe Port RAS
 errors

The CXL drivers use kernel trace functions for logging endpoint and
Restricted CXL host (RCH) Downstream Port RAS errors. Similar
functionality is required for CXL Root Ports, CXL Downstream Switch
Ports, and CXL Upstream Switch Ports.

Introduce trace logging functions for both RAS correctable and
uncorrectable errors specific to CXL PCIe Ports. Use them to trace
FW-First Protocol errors.

Co-developed-by: Terry Bowman
Signed-off-by: Terry Bowman
Signed-off-by: Smita Koralahalli
Reviewed-by: Ira Weiny
Reviewed-by: Li Ming
Reviewed-by: Alison Schofield
Reviewed-by: Tony Luck
Link: https://patch.msgid.link/20250310223839.31342-3-Smita.KoralahalliChannabasappa@amd.com
Signed-off-by: Dave Jiang
---
 drivers/cxl/core/ras.c   | 37 +++++++++++++++++++++++++++
 drivers/cxl/core/trace.h | 47 ++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)

diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 91273af6ccbc..485a831695c7 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -7,6 +7,30 @@
 #include <cxlmem.h>
 #include "trace.h"
 
+static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
+					      struct cxl_ras_capability_regs ras_cap)
+{
+	u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
+
+	trace_cxl_port_aer_correctable_error(&pdev->dev, status);
+}
+
+static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
+						struct cxl_ras_capability_regs ras_cap)
+{
+	u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
+	u32 fe;
+
+	if (hweight32(status) > 1)
+		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+				   ras_cap.cap_control));
+	else
+		fe = status;
+
+	trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
+					       ras_cap.header_log);
+}
+
 static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
 					 struct cxl_ras_capability_regs ras_cap)
 {
@@ -49,12 +73,25 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
 		pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
 					    data->prot_err.agent_addr.bus,
 					    devfn);
+	int port_type;
 
 	if (!pdev)
 		return;
 
 	guard(device)(&pdev->dev);
 
+	port_type = pci_pcie_type(pdev);
+	if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
+	    port_type == PCI_EXP_TYPE_DOWNSTREAM ||
+	    port_type == PCI_EXP_TYPE_UPSTREAM) {
+		if (data->severity == AER_CORRECTABLE)
+			cxl_cper_trace_corr_port_prot_err(pdev, data->ras_cap);
+		else
+			cxl_cper_trace_uncorr_port_prot_err(pdev, data->ras_cap);
+
+		return;
+	}
+
 	if (data->severity == AER_CORRECTABLE)
 		cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
 	else
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index cea706b683b5..342ae8d50f32 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -48,6 +48,34 @@
 	{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" }				\
 )
 
+TRACE_EVENT(cxl_port_aer_uncorrectable_error,
+	TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
+	TP_ARGS(dev, status, fe, hl),
+	TP_STRUCT__entry(
+		__string(device, dev_name(dev))
+		__string(host, dev_name(dev->parent))
+		__field(u32, status)
+		__field(u32, first_error)
+		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
+	),
+	TP_fast_assign(
+		__assign_str(device);
+		__assign_str(host);
+		__entry->status = status;
+		__entry->first_error = fe;
+		/*
+		 * Embed the 512B headerlog data for user app retrieval and
+		 * parsing, but no need to print this in the trace buffer.
+		 */
+		memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
+	),
+	TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
+		  __get_str(device), __get_str(host),
+		  show_uc_errs(__entry->status),
+		  show_uc_errs(__entry->first_error)
+	)
+);
+
 TRACE_EVENT(cxl_aer_uncorrectable_error,
 	TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl),
 	TP_ARGS(cxlmd, status, fe, hl),
@@ -96,6 +124,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
 	{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" }	\
 )
 
+TRACE_EVENT(cxl_port_aer_correctable_error,
+	TP_PROTO(struct device *dev, u32 status),
+	TP_ARGS(dev, status),
+	TP_STRUCT__entry(
+		__string(device, dev_name(dev))
+		__string(host, dev_name(dev->parent))
+		__field(u32, status)
+	),
+	TP_fast_assign(
+		__assign_str(device);
+		__assign_str(host);
+		__entry->status = status;
+	),
+	TP_printk("device=%s host=%s status='%s'",
+		  __get_str(device), __get_str(host),
+		  show_ce_errs(__entry->status)
+	)
+);
+
 TRACE_EVENT(cxl_aer_correctable_error,
 	TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
 	TP_ARGS(cxlmd, status),

From eeba74747a6634c59887750efcf534b335899993 Mon Sep 17 00:00:00 2001
From: Li Ming
Date: Fri, 21 Feb 2025 09:24:47 +0800
Subject: [PATCH 17/39] cxl/core: Use guard() to replace open-coded
 down_read/write()

Some down/up_read() and down/up_write() cases can be replaced by a
guard() in order to drop the explicitly invoked unlock calls. It also
helps to align the coding style with the rest of the current CXL
subsystem.

Reviewed-by: Jonathan Cameron
Reviewed-by: Dave Jiang
Reviewed-by: Alison Schofield
Reviewed-by: Ira Weiny
Acked-by: Davidlohr Bueso
Signed-off-by: Li Ming
Link: https://patch.msgid.link/20250221012453.126366-2-ming.li@zohomail.com
Signed-off-by: Dave Jiang
---
 drivers/cxl/core/hdm.c    | 15 +++++----------
 drivers/cxl/core/memdev.c |  9 +++------
 drivers/cxl/core/port.c   |  8 ++------
 drivers/cxl/core/region.c |  8 +++-----
 4 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index 50e6a45b30ba..4a578092377e 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -213,13 +213,12 @@ void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds)
 {
 	struct resource *p1, *p2;
 
-	down_read(&cxl_dpa_rwsem);
+	guard(rwsem_read)(&cxl_dpa_rwsem);
 	for (p1 = cxlds->dpa_res.child; p1; p1 = p1->sibling) {
 		__cxl_dpa_debug(file, p1, 0);
 		for (p2 = p1->child; p2; p2 = p2->sibling)
 			__cxl_dpa_debug(file, p2, 1);
 	}
-	up_read(&cxl_dpa_rwsem);
 }
 EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, "CXL");
 
@@ -250,9 +249,8 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
 
 static void cxl_dpa_release(void *cxled)
 {
-	down_write(&cxl_dpa_rwsem);
+	guard(rwsem_write)(&cxl_dpa_rwsem);
 	__cxl_dpa_release(cxled);
-	up_write(&cxl_dpa_rwsem);
 }
 
 /*
@@ -362,14 +360,11 @@ EXPORT_SYMBOL_NS_GPL(devm_cxl_dpa_reserve, "CXL");
 
 resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled)
 {
-	resource_size_t size = 0;
-
-	down_read(&cxl_dpa_rwsem);
+	guard(rwsem_read)(&cxl_dpa_rwsem);
 	if (cxled->dpa_res)
-		size = resource_size(cxled->dpa_res);
-	up_read(&cxl_dpa_rwsem);
+		return resource_size(cxled->dpa_res);
 
-	return size;
+	return 0;
 }
 
 resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled)
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index ae3dfcbe8938..98c05426aa4a 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -564,10 +564,9 @@ EXPORT_SYMBOL_NS_GPL(is_cxl_memdev, "CXL");
 void set_exclusive_cxl_commands(struct cxl_memdev_state
*mds, unsigned long *cmds) { - down_write(&cxl_memdev_rwsem); + guard(rwsem_write)(&cxl_memdev_rwsem); bitmap_or(mds->exclusive_cmds, mds->exclusive_cmds, cmds, CXL_MEM_COMMAND_ID_MAX); - up_write(&cxl_memdev_rwsem); } EXPORT_SYMBOL_NS_GPL(set_exclusive_cxl_commands, "CXL"); @@ -579,10 +578,9 @@ EXPORT_SYMBOL_NS_GPL(set_exclusive_cxl_commands, "CXL"); void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds, unsigned long *cmds) { - down_write(&cxl_memdev_rwsem); + guard(rwsem_write)(&cxl_memdev_rwsem); bitmap_andnot(mds->exclusive_cmds, mds->exclusive_cmds, cmds, CXL_MEM_COMMAND_ID_MAX); - up_write(&cxl_memdev_rwsem); } EXPORT_SYMBOL_NS_GPL(clear_exclusive_cxl_commands, "CXL"); @@ -590,9 +588,8 @@ static void cxl_memdev_shutdown(struct device *dev) { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); - down_write(&cxl_memdev_rwsem); + guard(rwsem_write)(&cxl_memdev_rwsem); cxlmd->cxlds = NULL; - up_write(&cxl_memdev_rwsem); } static void cxl_memdev_unregister(void *_cxlmd) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 78a5c2c25982..2c59d65bc18b 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -549,13 +549,9 @@ static ssize_t decoders_committed_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cxl_port *port = to_cxl_port(dev); - int rc; - down_read(&cxl_region_rwsem); - rc = sysfs_emit(buf, "%d\n", cxl_num_decoders_committed(port)); - up_read(&cxl_region_rwsem); - - return rc; + guard(rwsem_read)(&cxl_region_rwsem); + return sysfs_emit(buf, "%d\n", cxl_num_decoders_committed(port)); } static DEVICE_ATTR_RO(decoders_committed); diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index e8d11a988fd9..d8a71f9f9fa5 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3208,7 +3208,6 @@ static int match_region_by_range(struct device *dev, const void *data) struct cxl_region_params *p; struct cxl_region *cxlr; const struct range *r = data; - int rc = 0; if (!is_cxl_region(dev)) return 0; @@ -3216,12 +3215,11 @@ static int match_region_by_range(struct device *dev, const void *data) cxlr = to_cxl_region(dev); p = &cxlr->params; - down_read(&cxl_region_rwsem); + guard(rwsem_read)(&cxl_region_rwsem); if (p->res && p->res->start == r->start && p->res->end == r->end) - rc = 1; - up_read(&cxl_region_rwsem); + return 1; - return rc; + return 0; } /* Establish an empty region covering the given HPA range */ From 3ad4f59f38965071e429ace93c75a94a2ede5456 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Fri, 21 Feb 2025 09:24:48 +0800 Subject: [PATCH 18/39] cxl/core: cxl_mem_sanitize() cleanup In cxl_mem_sanitize(), the down_read() and up_read() for cxl_region_rwsem can be simply replaced by a guard(rwsem_read), and the local variable 'rc' can be removed. 
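For reference, the scope-based locking that makes this conversion
possible comes from <linux/cleanup.h>; a minimal sketch of the pattern,
using a hypothetical function that is not part of this patch:

	static int example_read_state(struct rw_semaphore *sem, int *state)
	{
		guard(rwsem_read)(sem);	/* up_read() runs at scope exit */

		if (!state)
			return -EINVAL;	/* early return still unlocks */

		return *state;
	}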
Reviewed-by: Jonathan Cameron
Reviewed-by: Dave Jiang
Reviewed-by: Alison Schofield
Reviewed-by: Ira Weiny
Acked-by: Davidlohr Bueso
Signed-off-by: Li Ming
Link: https://patch.msgid.link/20250221012453.126366-3-ming.li@zohomail.com
Signed-off-by: Dave Jiang
---
 drivers/cxl/core/mbox.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 548564c770c0..0601297af0c9 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -1222,23 +1222,19 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd)
 {
 	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
 	struct cxl_port *endpoint;
-	int rc;
 
 	/* synchronize with cxl_mem_probe() and decoder write operations */
 	guard(device)(&cxlmd->dev);
 	endpoint = cxlmd->endpoint;
-	down_read(&cxl_region_rwsem);
+	guard(rwsem_read)(&cxl_region_rwsem);
 	/*
 	 * Require an endpoint to be safe otherwise the driver can not
 	 * be sure that the device is unmapped.
 	 */
 	if (endpoint && cxl_num_decoders_committed(endpoint) == 0)
-		rc = __cxl_mem_sanitize(mds, cmd);
-	else
-		rc = -EBUSY;
-	up_read(&cxl_region_rwsem);
+		return __cxl_mem_sanitize(mds, cmd);
 
-	return rc;
+	return -EBUSY;
 }
 
 static int add_dpa_res(struct device *dev, struct resource *parent,

From a58afda8bfd4a113295f0e12c0697c35a69614b2 Mon Sep 17 00:00:00 2001
From: Li Ming
Date: Fri, 21 Feb 2025 09:24:49 +0800
Subject: [PATCH 19/39] cxl/memdev: cxl_memdev_ioctl() cleanup

In cxl_memdev_ioctl(), the down_read(&cxl_memdev_rwsem) and
up_read(&cxl_memdev_rwsem) can be replaced by a
guard(rwsem_read)(&cxl_memdev_rwsem), which removes the open-coded
up_read(&cxl_memdev_rwsem). Besides, the local variable 'rc' can also
be removed to make the code cleaner.

Reviewed-by: Jonathan Cameron
Reviewed-by: Dave Jiang
Reviewed-by: Alison Schofield
Reviewed-by: Ira Weiny
Acked-by: Davidlohr Bueso
Signed-off-by: Li Ming
Link: https://patch.msgid.link/20250221012453.126366-4-ming.li@zohomail.com
Signed-off-by: Dave Jiang
---
 drivers/cxl/core/memdev.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index 98c05426aa4a..6f90dcd626f9 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -668,15 +668,13 @@ static long cxl_memdev_ioctl(struct file *file, unsigned int cmd,
 {
 	struct cxl_memdev *cxlmd = file->private_data;
 	struct cxl_dev_state *cxlds;
-	int rc = -ENXIO;
 
-	down_read(&cxl_memdev_rwsem);
+	guard(rwsem_read)(&cxl_memdev_rwsem);
 	cxlds = cxlmd->cxlds;
 	if (cxlds && cxlds->type == CXL_DEVTYPE_CLASSMEM)
-		rc = __cxl_memdev_ioctl(cxlmd, cmd, arg);
-	up_read(&cxl_memdev_rwsem);
+		return __cxl_memdev_ioctl(cxlmd, cmd, arg);
 
-	return rc;
+	return -ENXIO;
 }
 
 static int cxl_memdev_open(struct inode *inode, struct file *file)

From 16fe6ec4ac3d828b3976bd36e4d99af73f8e43d2 Mon Sep 17 00:00:00 2001
From: Li Ming
Date: Fri, 21 Feb 2025 09:24:50 +0800
Subject: [PATCH 20/39] cxl/core: Use guard() to drop the goto pattern of
 cxl_dpa_free()

cxl_dpa_free() has a goto pattern whose only purpose is to call
up_write() on cxl_dpa_rwsem; it can be removed by using a guard() in
place of the explicit down_write() and up_write().
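The shape of the conversion, as a hypothetical before/after sketch with
invented names (the real change is in the diff below):

	/* Before: every error path funnels through the unlock label */
	static int old_style(struct rw_semaphore *sem, bool busy)
	{
		int rc = 0;

		down_write(sem);
		if (busy)
			rc = -EBUSY;
		up_write(sem);
		return rc;
	}

	/* After: guard() releases the lock on any return path */
	static int new_style(struct rw_semaphore *sem, bool busy)
	{
		guard(rwsem_write)(sem);
		if (busy)
			return -EBUSY;
		return 0;
	}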
Reviewed-by: Jonathan Cameron
Reviewed-by: Dave Jiang
Reviewed-by: Alison Schofield
Reviewed-by: Ira Weiny
Acked-by: Davidlohr Bueso
Signed-off-by: Li Ming
Link: https://patch.msgid.link/20250221012453.126366-5-ming.li@zohomail.com
Signed-off-by: Dave Jiang
---
 drivers/cxl/core/hdm.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index 4a578092377e..874a791f8292 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -382,35 +382,27 @@ int cxl_dpa_free(struct cxl_endpoint_decoder *cxled)
 {
 	struct cxl_port *port = cxled_to_port(cxled);
 	struct device *dev = &cxled->cxld.dev;
-	int rc;
 
-	down_write(&cxl_dpa_rwsem);
-	if (!cxled->dpa_res) {
-		rc = 0;
-		goto out;
-	}
+	guard(rwsem_write)(&cxl_dpa_rwsem);
+	if (!cxled->dpa_res)
+		return 0;
 	if (cxled->cxld.region) {
 		dev_dbg(dev, "decoder assigned to: %s\n",
 			dev_name(&cxled->cxld.region->dev));
-		rc = -EBUSY;
-		goto out;
+		return -EBUSY;
 	}
 	if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) {
 		dev_dbg(dev, "decoder enabled\n");
-		rc = -EBUSY;
-		goto out;
+		return -EBUSY;
 	}
 	if (cxled->cxld.id != port->hdm_end) {
 		dev_dbg(dev, "expected decoder%d.%d\n", port->id,
 			port->hdm_end);
-		rc = -EBUSY;
-		goto out;
+		return -EBUSY;
 	}
+
 	devm_cxl_dpa_release(cxled);
-	rc = 0;
-out:
-	up_write(&cxl_dpa_rwsem);
-	return rc;
+	return 0;
 }
 
 int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,

From a81ebe7d19b6fdc6de0159878fbed9945120813e Mon Sep 17 00:00:00 2001
From: Li Ming
Date: Fri, 21 Feb 2025 09:24:51 +0800
Subject: [PATCH 21/39] cxl/core: Use guard() to drop goto pattern of
 cxl_dpa_alloc()

In cxl_dpa_alloc(), some checks and operations need to be protected by
the rwsem cxl_dpa_rwsem, so the function uses a goto pattern to release
the rwsem on every exit path. That goto pattern can be removed by using
guard() to hold the rwsem.

Create a new function called __cxl_dpa_alloc() containing all the checks
and operations that must be protected by cxl_dpa_rwsem, and use
guard(rwsem_write) to hold cxl_dpa_rwsem for the whole body of the new
function.
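A hypothetical sketch of this split pattern with invented names (the
actual hdm.c change is in the diff below): the locked body moves into a
__helper that owns the rwsem via guard() for its whole scope, while the
wrapper runs whatever must happen without the lock held:

	static int __do_alloc(struct rw_semaphore *sem, unsigned long size)
	{
		guard(rwsem_write)(sem);	/* held for the whole body */

		if (!size)
			return -EINVAL;
		/* ... checks and reservation under the lock ... */
		return 0;
	}

	static int do_alloc(struct rw_semaphore *sem, unsigned long size)
	{
		int rc = __do_alloc(sem, size);

		if (rc)
			return rc;
		/* ... follow-on work that must run unlocked ... */
		return 0;
	}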
Reviewed-by: Jonathan Cameron
Reviewed-by: Dave Jiang
Reviewed-by: Alison Schofield
Reviewed-by: Dan Williams
Reviewed-by: Ira Weiny
Acked-by: Davidlohr Bueso
Signed-off-by: Li Ming
Link: https://patch.msgid.link/20250221012453.126366-6-ming.li@zohomail.com
Signed-off-by: Dave Jiang
---
 drivers/cxl/core/hdm.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index 874a791f8292..1edbf7873471 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -442,29 +442,25 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
 	return 0;
 }
 
-int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
+static int __cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
 {
 	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
 	resource_size_t free_ram_start, free_pmem_start;
-	struct cxl_port *port = cxled_to_port(cxled);
 	struct cxl_dev_state *cxlds = cxlmd->cxlds;
 	struct device *dev = &cxled->cxld.dev;
 	resource_size_t start, avail, skip;
 	struct resource *p, *last;
-	int rc;
 
-	down_write(&cxl_dpa_rwsem);
+	guard(rwsem_write)(&cxl_dpa_rwsem);
 	if (cxled->cxld.region) {
 		dev_dbg(dev, "decoder attached to %s\n",
 			dev_name(&cxled->cxld.region->dev));
-		rc = -EBUSY;
-		goto out;
+		return -EBUSY;
 	}
 
 	if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) {
 		dev_dbg(dev, "decoder enabled\n");
-		rc = -EBUSY;
-		goto out;
+		return -EBUSY;
 	}
 
 	for (p = cxlds->ram_res.child, last = NULL; p; p = p->sibling)
@@ -504,21 +500,24 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
 		skip = skip_end - skip_start + 1;
 	} else {
 		dev_dbg(dev, "mode not set\n");
-		rc = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 
 	if (size > avail) {
 		dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size,
 			cxl_decoder_mode_name(cxled->mode), &avail);
-		rc = -ENOSPC;
-		goto out;
+		return -ENOSPC;
 	}
 
-	rc = __cxl_dpa_reserve(cxled, start, size, skip);
-out:
-	up_write(&cxl_dpa_rwsem);
+	return __cxl_dpa_reserve(cxled, start, size, skip);
+}
 
+int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
+{
+	struct cxl_port *port = cxled_to_port(cxled);
+	int rc;
+
+	rc = __cxl_dpa_alloc(cxled, size);
 	if (rc)
 		return rc;

From 9e7b7ab5af69aaa5cd027656529663407da61e6f Mon Sep 17 00:00:00 2001
From: Li Ming
Date: Fri, 21 Feb 2025 09:24:52 +0800
Subject: [PATCH 22/39] cxl/region: Drop goto pattern in cxl_dax_region_alloc()

In cxl_dax_region_alloc(), there is a goto pattern to release the rwsem
cxl_region_rwsem when the function returns. The down_read() and
up_read() can be replaced by a guard(rwsem_read), after which the goto
pattern can be removed.
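guard() also composes well with ERR_PTR()-style allocation paths; a
hypothetical sketch, with struct foo and the function invented for
illustration:

	struct foo { int id; };

	static struct foo *foo_alloc(struct rw_semaphore *sem, bool committed)
	{
		struct foo *f;

		guard(rwsem_read)(sem);
		if (!committed)
			return ERR_PTR(-ENXIO);	/* no unlock label needed */

		f = kzalloc(sizeof(*f), GFP_KERNEL);
		if (!f)
			return ERR_PTR(-ENOMEM);

		return f;
	}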
Reviewed-by: Jonathan Cameron
Reviewed-by: Dave Jiang
Reviewed-by: Alison Schofield
Reviewed-by: Ira Weiny
Acked-by: Davidlohr Bueso
Signed-off-by: Li Ming
Link: https://patch.msgid.link/20250221012453.126366-7-ming.li@zohomail.com
Signed-off-by: Dave Jiang
---
 drivers/cxl/core/region.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index d8a71f9f9fa5..320a3f218131 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -3038,17 +3038,13 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
 	struct cxl_dax_region *cxlr_dax;
 	struct device *dev;
 
-	down_read(&cxl_region_rwsem);
-	if (p->state != CXL_CONFIG_COMMIT) {
-		cxlr_dax = ERR_PTR(-ENXIO);
-		goto out;
-	}
+	guard(rwsem_read)(&cxl_region_rwsem);
+	if (p->state != CXL_CONFIG_COMMIT)
+		return ERR_PTR(-ENXIO);
 
 	cxlr_dax = kzalloc(sizeof(*cxlr_dax), GFP_KERNEL);
-	if (!cxlr_dax) {
-		cxlr_dax = ERR_PTR(-ENOMEM);
-		goto out;
-	}
+	if (!cxlr_dax)
+		return ERR_PTR(-ENOMEM);
 
 	cxlr_dax->hpa_range.start = p->res->start;
 	cxlr_dax->hpa_range.end = p->res->end;
@@ -3061,8 +3057,6 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
 	dev->parent = &cxlr->dev;
 	dev->bus = &cxl_bus_type;
 	dev->type = &cxl_dax_region_type;
-out:
-	up_read(&cxl_region_rwsem);
 
 	return cxlr_dax;
 }

From 5ec67596e368cdddddd6770fc2dd2e577e82fbe8 Mon Sep 17 00:00:00 2001
From: Li Ming
Date: Fri, 21 Feb 2025 09:32:05 +0800
Subject: [PATCH 23/39] cxl/region: Drop goto pattern of construct_region()

Some operations need to be protected by the cxl_region_rwsem in
construct_region(). Currently, construct_region() uses down_write() and
up_write() for the cxl_region_rwsem locking, so there is a goto pattern
after down_write() is invoked to release the cxl_region_rwsem.
construct_region() can be optimized to remove the goto pattern.

The changes are: create a new function called __construct_region()
containing all the checks and operations protected by the
cxl_region_rwsem, and use guard(rwsem_write) to replace down_write()
and up_write() in __construct_region().
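Where only a middle section of a function needs the lock, the same
header also offers scoped_guard(); a hypothetical sketch with an
invented function, not part of this series:

	static int setup_thing(struct rw_semaphore *sem, bool ready)
	{
		/* ... unlocked preparation ... */

		scoped_guard(rwsem_write, sem) {
			if (!ready)
				return -EBUSY;	/* unlocks on the way out */
			/* ... the only section that needs the lock ... */
		}

		/* ... unlocked tail work ... */
		return 0;
	}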
Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Alison Schofield Reviewed-by: Dan Williams Reviewed-by: Ira Weiny Acked-by: Davidlohr Bueso Signed-off-by: Li Ming Link: https://patch.msgid.link/20250221013205.126419-1-ming.li@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 71 +++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 320a3f218131..a750107a3bff 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3216,49 +3216,31 @@ static int match_region_by_range(struct device *dev, const void *data) return 0; } -/* Establish an empty region covering the given HPA range */ -static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, - struct cxl_endpoint_decoder *cxled) +static int __construct_region(struct cxl_region *cxlr, + struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder *cxled) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - struct cxl_port *port = cxlrd_to_port(cxlrd); struct range *hpa = &cxled->cxld.hpa_range; struct cxl_region_params *p; - struct cxl_region *cxlr; struct resource *res; int rc; - do { - cxlr = __create_region(cxlrd, cxled->mode, - atomic_read(&cxlrd->region_id)); - } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY); - - if (IS_ERR(cxlr)) { - dev_err(cxlmd->dev.parent, - "%s:%s: %s failed assign region: %ld\n", - dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), - __func__, PTR_ERR(cxlr)); - return cxlr; - } - - down_write(&cxl_region_rwsem); + guard(rwsem_write)(&cxl_region_rwsem); p = &cxlr->params; if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) { dev_err(cxlmd->dev.parent, "%s:%s: %s autodiscovery interrupted\n", dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), __func__); - rc = -EBUSY; - goto err; + return -EBUSY; } set_bit(CXL_REGION_F_AUTO, &cxlr->flags); res = kmalloc(sizeof(*res), GFP_KERNEL); - if (!res) { - rc = -ENOMEM; - goto err; - } + if (!res) + return -ENOMEM; *res = DEFINE_RES_MEM_NAMED(hpa->start, range_len(hpa), dev_name(&cxlr->dev)); @@ -3281,7 +3263,7 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group()); if (rc) - goto err; + return rc; dev_dbg(cxlmd->dev.parent, "%s:%s: %s %s res: %pr iw: %d ig: %d\n", dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), __func__, @@ -3290,14 +3272,39 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, /* ...to match put_device() in cxl_add_to_region() */ get_device(&cxlr->dev); - up_write(&cxl_region_rwsem); + + return 0; +} + +/* Establish an empty region covering the given HPA range */ +static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder *cxled) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *port = cxlrd_to_port(cxlrd); + struct cxl_region *cxlr; + int rc; + + do { + cxlr = __create_region(cxlrd, cxled->mode, + atomic_read(&cxlrd->region_id)); + } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY); + + if (IS_ERR(cxlr)) { + dev_err(cxlmd->dev.parent, + "%s:%s: %s failed assign region: %ld\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + __func__, PTR_ERR(cxlr)); + return cxlr; + } + + rc = __construct_region(cxlr, cxlrd, cxled); + if (rc) { + devm_release_action(port->uport_dev, unregister_region, cxlr); + return ERR_PTR(rc); + } return cxlr; - -err: - up_write(&cxl_region_rwsem); - 
devm_release_action(port->uport_dev, unregister_region, cxlr);
-	return ERR_PTR(rc);
 }
 
 int cxl_add_to_region(struct cxl_port *root, struct cxl_endpoint_decoder *cxled)

From 16ca2f5431ee7c003ae3ba3b0c4d4ddc57670b44 Mon Sep 17 00:00:00 2001
From: Ira Weiny
Date: Thu, 6 Feb 2025 13:34:00 -0600
Subject: [PATCH 24/39] cxl/memdev: Remove unused partition values

The next volatile and next persistent values are unused and are
cluttering the cxl_memdev_state. Remove these values.

Reviewed-by: Davidlohr Bueso
Reviewed-by: Dave Jiang
Reviewed-by: Jonathan Cameron
Reviewed-by: Dan Williams
Reviewed-by: Fan Ni
Signed-off-by: Ira Weiny
Link: https://patch.msgid.link/20250206-cxl-cleanup-v1-1-9ddf26dd8433@intel.com
Signed-off-by: Dave Jiang
---
 drivers/cxl/core/mbox.c | 4 ----
 drivers/cxl/cxlmem.h    | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 62bb3653362f..998e1df36db6 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -1097,10 +1097,6 @@ static int cxl_mem_get_partition_info(struct cxl_memdev_state *mds)
 		le64_to_cpu(pi.active_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
 	mds->active_persistent_bytes =
 		le64_to_cpu(pi.active_persistent_cap) * CXL_CAPACITY_MULTIPLIER;
-	mds->next_volatile_bytes =
-		le64_to_cpu(pi.next_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
-	mds->next_persistent_bytes =
-		le64_to_cpu(pi.next_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
 
 	return 0;
 }
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 36a091597066..2baf29a0afce 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -506,8 +506,6 @@
  * @partition_align_bytes: alignment size for partition-able capacity
  * @active_volatile_bytes: sum of hard + soft volatile
  * @active_persistent_bytes: sum of hard + soft persistent
- * @next_volatile_bytes: volatile capacity change pending device reset
- * @next_persistent_bytes: persistent capacity change pending device reset
  * @event: event log driver state
  * @poison: poison driver state info
  * @security: security driver state info
@@ -528,8 +526,6 @@ struct cxl_memdev_state {
 	u64 partition_align_bytes;
 	u64 active_volatile_bytes;
 	u64 active_persistent_bytes;
-	u64 next_volatile_bytes;
-	u64 next_persistent_bytes;
 
 	struct cxl_event_state event;
 	struct cxl_poison_state poison;

From e0feac20d150949dc8b74c1c5998dea70d19bf35 Mon Sep 17 00:00:00 2001
From: Li Ming
Date: Tue, 11 Feb 2025 14:20:54 +0800
Subject: [PATCH 25/39] cxl/cdat: Remove redundant gp_port initialization

gp_port already points at the grandparent port from its definition;
remove the redundant code that points gp_port at the grandparent port
again.
Signed-off-by: Li Ming
Reviewed-by: Jonathan Cameron
Reviewed-by: Davidlohr Bueso
Link: https://patch.msgid.link/20250211062054.300108-1-ming.li@zohomail.com
Signed-off-by: Dave Jiang
---
 drivers/cxl/core/cdat.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c
index bfba7f5019cf..edb4f41eeacc 100644
--- a/drivers/cxl/core/cdat.c
+++ b/drivers/cxl/core/cdat.c
@@ -658,7 +658,6 @@ static int cxl_endpoint_gather_bandwidth(struct cxl_region *cxlr,
 	if (IS_ERR(perf))
 		return PTR_ERR(perf);
 
-	gp_port = to_cxl_port(parent_port->dev.parent);
 	*gp_is_root = is_cxl_root(gp_port);
 
 	/*

From 2da9ad027e8094c0944b7dfc28c9e3db368d61cc Mon Sep 17 00:00:00 2001
From: Yuquan Wang
Date: Wed, 19 Feb 2025 12:00:29 +0800
Subject: [PATCH 26/39] cxl/pmem: debug invalid serial number data

In an nvdimm interleave-set, each device with an invalid or zero serial
number may cause pmem region initialization to fail, but in the CXL case
such a device could still set the cookies of nd_interleave_set and
create an nvdimm pmem region.

Add validation of the serial number to cxl pmem region creation. A
device with no serial number now fails to set the cookie and, in turn,
the pmem region.

For cxl-test to work properly, always add 1 to a mock device's serial
number.

Signed-off-by: Yuquan Wang
Reviewed-by: Jonathan Cameron
Reviewed-by: Alison Schofield
Reviewed-by: Ira Weiny
Link: https://patch.msgid.link/20250219040029.515451-2-wangyuquan1236@phytium.com.cn
Signed-off-by: Dave Jiang
---
 drivers/cxl/pmem.c           | 12 ++++++++++--
 tools/testing/cxl/test/mem.c |  2 +-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c
index f9c95996e937..11c5a65acacf 100644
--- a/drivers/cxl/pmem.c
+++ b/drivers/cxl/pmem.c
@@ -375,6 +375,16 @@ static int cxl_pmem_region_probe(struct device *dev)
 			goto out_nvd;
 		}
 
+		if (cxlds->serial == 0) {
+			/* include missing alongside invalid in this error message. */
+			dev_err(dev, "%s: invalid or missing serial number\n",
+				dev_name(&cxlmd->dev));
+			rc = -ENXIO;
+			goto out_nvd;
+		}
+		info[i].serial = cxlds->serial;
+		info[i].offset = m->start;
+
 		m->cxl_nvd = cxl_nvd;
 		mappings[i] = (struct nd_mapping_desc) {
 			.nvdimm = nvdimm,
@@ -382,8 +392,6 @@ static int cxl_pmem_region_probe(struct device *dev)
 			.size = m->size,
 			.position = i,
 		};
-		info[i].offset = m->start;
-		info[i].serial = cxlds->serial;
 	}
 	ndr_desc.num_mappings = cxlr_pmem->nr_mappings;
 	ndr_desc.mapping = mappings;
diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index 495199238335..4cbfafdf5371 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -1534,7 +1534,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev)
 	mds->event.buf = (struct cxl_get_event_payload *) mdata->event_buf;
 	INIT_DELAYED_WORK(&mds->security.poll_dwork, cxl_mockmem_sanitize_work);
 
-	cxlds->serial = pdev->id;
+	cxlds->serial = pdev->id + 1;
 	if (is_rcd(pdev))
 		cxlds->rcd = true;

From eb8081bcc53fdf951676ee199ed2a8f60d16f0ce Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Tue, 18 Feb 2025 14:48:52 -0800
Subject: [PATCH 27/39] cxl: Plug typos in ABI doc

Trivially update where necessary.
Reviewed-by: Jonathan Cameron Signed-off-by: Davidlohr Bueso Reviewed-by: Li Ming Reviewed-by: Dave Jiang Reviewed-by: Ira Weiny Link: https://patch.msgid.link/20250218224853.67457-2-dave@stgolabs.net Signed-off-by: Dave Jiang --- Documentation/ABI/testing/sysfs-bus-cxl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 3f5627a1210a..7bdf0eb79d7c 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -1,5 +1,5 @@ What: /sys/bus/cxl/flush -Date: Januarry, 2022 +Date: January, 2022 KernelVersion: v5.18 Contact: linux-cxl@vger.kernel.org Description: @@ -33,7 +33,7 @@ Date: May, 2023 KernelVersion: v6.8 Contact: linux-cxl@vger.kernel.org Description: - (RO) For CXL host platforms that support "QoS Telemmetry" + (RO) For CXL host platforms that support "QoS Telemetry" this attribute conveys a comma delimited list of platform specific cookies that identifies a QoS performance class for the volatile partition of the CXL mem device. These @@ -60,7 +60,7 @@ Date: May, 2023 KernelVersion: v6.8 Contact: linux-cxl@vger.kernel.org Description: - (RO) For CXL host platforms that support "QoS Telemmetry" + (RO) For CXL host platforms that support "QoS Telemetry" this attribute conveys a comma delimited list of platform specific cookies that identifies a QoS performance class for the persistent partition of the CXL mem device. These @@ -423,7 +423,7 @@ Date: May, 2023 KernelVersion: v6.5 Contact: linux-cxl@vger.kernel.org Description: - (RO) For CXL host platforms that support "QoS Telemmetry" this + (RO) For CXL host platforms that support "QoS Telemetry" this root-decoder-only attribute conveys a platform specific cookie that identifies a QoS performance class for the CXL Window. This class-id can be compared against a similar "qos_class" From 17218b02283a8b0de199cfbad079417c8e9de5c9 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 18 Feb 2025 14:48:53 -0800 Subject: [PATCH 28/39] cxl: Document missing sysfs files Add to the ABI documentation the payload_max and label_storage_size read-only files, which have been there since the early days. Signed-off-by: Davidlohr Bueso Reviewed-by: Li Ming Reviewed-by: Dave Jiang Reviewed-by: Ira Weiny Link: https://patch.msgid.link/20250218224853.67457-3-dave@stgolabs.net Signed-off-by: Dave Jiang --- Documentation/ABI/testing/sysfs-bus-cxl | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 7bdf0eb79d7c..04a880bd1dde 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -18,6 +18,24 @@ Description: specification. +What: /sys/bus/cxl/devices/memX/payload_max +Date: December, 2020 +KernelVersion: v5.12 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) Maximum size (in bytes) of the mailbox command payload + registers. Linux caps this at 1MB if the device reports a + larger size. + + +What: /sys/bus/cxl/devices/memX/label_storage_size +Date: May, 2021 +KernelVersion: v5.13 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) Size (in bytes) of the Label Storage Area (LSA). 
+
+
 What:		/sys/bus/cxl/devices/memX/ram/size
 Date:		December, 2020
 KernelVersion:	v5.12
 Contact:	linux-cxl@vger.kernel.org
 Description:

From a52b6a2c1c997b5047a724ccde955910f6150a97 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Fri, 24 Jan 2025 15:35:33 -0800
Subject: [PATCH 29/39] cxl/pci: Support Global Persistent Flush (GPF)

Add support for GPF flows. The CXL specification around this turns out
to be a bit too involved from the driver side, and while this should
really all be handled by the hardware, this patch takes things with a
grain of salt.

Upon respective port enumeration, both phase timeouts are set to a max
of 20 seconds, which is the NMI watchdog default for lockup detection.
The premise is that the kernel does not have enough information to set
anything better than a max across the board and hope devices finish
their GPF flows within the platform energy budget.

Timeout detection is based on dirty Shutdown semantics. The driver will
mark it as dirty, expecting that the device clear it upon a successful
GPF event. The admin may consult the device Health and check the dirty
shutdown counter to see if there was a problem with data integrity.

[ davej: Explicitly set return to 0 in update_gpf_port_dvsec() ]
[ davej: Add spec reference for 'struct cxl_mbox_set_shutdown_state_in' ]
[ davej: Fix 0-day reported issue ]

Signed-off-by: Davidlohr Bueso
Reviewed-by: Jonathan Cameron
Reviewed-by: Dan Williams
Link: https://patch.msgid.link/20250124233533.910535-1-dave@stgolabs.net
Signed-off-by: Dave Jiang
---
 Documentation/driver-api/cxl/maturity-map.rst |  2 +-
 drivers/cxl/core/core.h                       |  2 +
 drivers/cxl/core/mbox.c                       | 18 ++++
 drivers/cxl/core/pci.c                        | 87 +++++++++++++++++++
 drivers/cxl/core/port.c                       |  2 +
 drivers/cxl/cxl.h                             |  2 +
 drivers/cxl/cxlmem.h                          |  6 ++
 drivers/cxl/cxlpci.h                          |  6 ++
 drivers/cxl/pmem.c                            |  8 ++
 9 files changed, 132 insertions(+), 1 deletion(-)

diff --git a/Documentation/driver-api/cxl/maturity-map.rst b/Documentation/driver-api/cxl/maturity-map.rst
index df8e2ac2a320..99dd2c841e69 100644
--- a/Documentation/driver-api/cxl/maturity-map.rst
+++ b/Documentation/driver-api/cxl/maturity-map.rst
@@ -130,7 +130,7 @@ Mailbox commands
 * [0] Switch CCI
 * [3] Timestamp
 * [1] PMEM labels
-* [0] PMEM GPF / Dirty Shutdown
+* [1] PMEM GPF / Dirty Shutdown
 * [0] Scan Media
 
 PMU
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 800466f96a68..8f2eb76a3c8c 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -115,4 +115,6 @@ bool cxl_need_node_perf_attrs_update(int nid);
 int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
 					struct access_coordinate *c);
 
+int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port);
+
 #endif /* __CXL_CORE_H__ */
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 548564c770c0..5b89ae5c5e28 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -1308,6 +1308,24 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, "CXL");
 
+int cxl_dirty_shutdown_state(struct cxl_memdev_state *mds)
+{
+	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+	struct cxl_mbox_cmd mbox_cmd;
+	struct cxl_mbox_set_shutdown_state_in in = {
+		.state = 1
+	};
+
+	mbox_cmd = (struct cxl_mbox_cmd) {
+		.opcode = CXL_MBOX_OP_SET_SHUTDOWN_STATE,
+		.size_in = sizeof(in),
+		.payload_in = &in,
+	};
+
+	return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_dirty_shutdown_state, "CXL");
+
 int cxl_set_timestamp(struct cxl_memdev_state *mds)
 {
 	struct cxl_mailbox *cxl_mbox =
&mds->cxlds.cxl_mbox; diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 013b869b66cb..a5c65f79db18 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -1054,3 +1054,90 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c) return 0; } + +/* + * Set max timeout such that platforms will optimize GPF flow to avoid + * the implied worst-case scenario delays. On a sane platform, all + * devices should always complete GPF within the energy budget of + * the GPF flow. The kernel does not have enough information to pick + * anything better than "maximize timeouts and hope it works". + * + * A misbehaving device could block forward progress of GPF for all + * the other devices, exhausting the energy budget of the platform. + * However, the spec seems to assume that moving on from slow-to-respond + * devices is a virtue. It is not possible to know that, in actuality, + * the slow-to-respond device is *the* most critical device in the + * system to wait for. + */ +#define GPF_TIMEOUT_BASE_MAX 2 +#define GPF_TIMEOUT_SCALE_MAX 7 /* 10 seconds */ + +static int update_gpf_port_dvsec(struct pci_dev *pdev, int dvsec, int phase) +{ + u64 base, scale; + int rc, offset; + u16 ctrl; + + switch (phase) { + case 1: + offset = CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET; + base = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK; + scale = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK; + break; + case 2: + offset = CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET; + base = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK; + scale = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK; + break; + default: + return -EINVAL; + } + + rc = pci_read_config_word(pdev, dvsec + offset, &ctrl); + if (rc) + return rc; + + if (FIELD_GET(base, ctrl) == GPF_TIMEOUT_BASE_MAX && + FIELD_GET(scale, ctrl) == GPF_TIMEOUT_SCALE_MAX) + return 0; + + ctrl = FIELD_PREP(base, GPF_TIMEOUT_BASE_MAX); + ctrl |= FIELD_PREP(scale, GPF_TIMEOUT_SCALE_MAX); + + rc = pci_write_config_word(pdev, dvsec + offset, ctrl); + if (!rc) + pci_dbg(pdev, "Port GPF phase %d timeout: %d0 secs\n", + phase, GPF_TIMEOUT_BASE_MAX); + + return rc; +} + +int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port) +{ + struct pci_dev *pdev; + + if (!dev_is_pci(dport_dev)) + return 0; + + pdev = to_pci_dev(dport_dev); + if (!pdev || !port) + return -EINVAL; + + if (!port->gpf_dvsec) { + int dvsec; + + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + CXL_DVSEC_PORT_GPF); + if (!dvsec) { + pci_warn(pdev, "Port GPF DVSEC not present\n"); + return -EINVAL; + } + + port->gpf_dvsec = dvsec; + } + + update_gpf_port_dvsec(pdev, port->gpf_dvsec, 1); + update_gpf_port_dvsec(pdev, port->gpf_dvsec, 2); + + return 0; +} diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 78a5c2c25982..95cd6f11bbfa 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1672,6 +1672,8 @@ retry: if (rc && rc != -EBUSY) return rc; + cxl_gpf_port_setup(dport_dev, port); + /* Any more ports to add between this one and the root?
*/ if (!dev_is_cxl_root_child(&port->dev)) continue; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index bbbaa0d0a670..55af041df7b2 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -610,6 +610,7 @@ struct cxl_dax_region { * @cdat: Cached CDAT data * @cdat_available: Should a CDAT attribute be available in sysfs * @pci_latency: Upstream latency in picoseconds + * @gpf_dvsec: Cached GPF port DVSEC */ struct cxl_port { struct device dev; @@ -633,6 +634,7 @@ struct cxl_port { } cdat; bool cdat_available; long pci_latency; + int gpf_dvsec; }; /** diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 2a25d1957ddb..5d49e0a93426 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -693,6 +693,11 @@ struct cxl_mbox_set_partition_info { #define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0) +/* Set Shutdown State Input Payload CXL 3.2 Spec 8.2.10.9.3.5 Table 8-152 */ +struct cxl_mbox_set_shutdown_state_in { + u8 state; +} __packed; + /* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */ struct cxl_mbox_set_timestamp_in { __le64 timestamp; @@ -829,6 +834,7 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd, enum cxl_event_log_type type, enum cxl_event_type event_type, const uuid_t *uuid, union cxl_event *evt); +int cxl_dirty_shutdown_state(struct cxl_memdev_state *mds); int cxl_set_timestamp(struct cxl_memdev_state *mds); int cxl_poison_state_init(struct cxl_memdev_state *mds); int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len, diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 4da07727ab9c..54e219b0049e 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -40,6 +40,12 @@ /* CXL 2.0 8.1.6: GPF DVSEC for CXL Port */ #define CXL_DVSEC_PORT_GPF 4 +#define CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET 0x0C +#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK GENMASK(3, 0) +#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK GENMASK(11, 8) +#define CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET 0xE +#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK GENMASK(3, 0) +#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK GENMASK(11, 8) /* CXL 2.0 8.1.7: GPF DVSEC for CXL Device */ #define CXL_DVSEC_DEVICE_GPF 5 diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c index f9c95996e937..a39e2c52d7ab 100644 --- a/drivers/cxl/pmem.c +++ b/drivers/cxl/pmem.c @@ -85,6 +85,14 @@ static int cxl_nvdimm_probe(struct device *dev) if (!nvdimm) return -ENOMEM; + /* + * Set dirty shutdown now, with the expectation that the device + * clear it upon a successful GPF flow. The exception to this + * is upon Viral detection, per CXL 3.2 section 12.4.2. + */ + if (cxl_dirty_shutdown_state(mds)) + dev_warn(dev, "GPF: could not dirty shutdown state\n"); + dev_set_drvdata(dev, nvdimm); return devm_add_action_or_reset(dev, unregister_nvdimm, nvdimm); } From 021b7e42fa7bc2c30a4bf676355f1079aa0fe6be Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 20 Feb 2025 14:02:32 -0800 Subject: [PATCH 30/39] cxl/pci: Introduce cxl_gpf_get_dvsec() Add a helper to fetch the port/device GPF DVSECs. This is currently only used for ports, but a later patch that exports the dirty shutdown count to users will make use of the device one.
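For illustration only, a hedged sketch of the eventual device-side caller (the wrapper name here is made up; only cxl_gpf_get_dvsec() itself is from this patch):

	/*
	 * Hypothetical helper: a zero return from cxl_gpf_get_dvsec()
	 * means "no GPF DVSEC", which device-side callers treat as
	 * "feature absent" rather than an error.
	 */
	static bool cxl_gpf_device_supported(struct cxl_dev_state *cxlds)
	{
		return cxl_gpf_get_dvsec(cxlds->dev, false) != 0;
	}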
Signed-off-by: Davidlohr Bueso Reviewed-by: Li Ming Reviewed-by: Dave Jiang Reviewed-by: Ira Weiny Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250220220235.276831-2-dave@stgolabs.net Signed-off-by: Dave Jiang --- drivers/cxl/core/pci.c | 30 ++++++++++++++++++++---------- drivers/cxl/cxl.h | 2 ++ 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index a5c65f79db18..96fecb799cbc 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -1072,6 +1072,22 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c) #define GPF_TIMEOUT_BASE_MAX 2 #define GPF_TIMEOUT_SCALE_MAX 7 /* 10 seconds */ +u16 cxl_gpf_get_dvsec(struct device *dev, bool is_port) +{ + u16 dvsec; + + if (!dev_is_pci(dev)) + return 0; + + dvsec = pci_find_dvsec_capability(to_pci_dev(dev), PCI_VENDOR_ID_CXL, + is_port ? CXL_DVSEC_PORT_GPF : CXL_DVSEC_DEVICE_GPF); + if (!dvsec) + dev_warn(dev, "%s GPF DVSEC not present\n", + is_port ? "Port" : "Device"); + return dvsec; +} +EXPORT_SYMBOL_NS_GPL(cxl_gpf_get_dvsec, "CXL"); + static int update_gpf_port_dvsec(struct pci_dev *pdev, int dvsec, int phase) { u64 base, scale; @@ -1116,26 +1132,20 @@ int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port) { struct pci_dev *pdev; - if (!dev_is_pci(dport_dev)) - return 0; - - pdev = to_pci_dev(dport_dev); - if (!pdev || !port) + if (!port) return -EINVAL; if (!port->gpf_dvsec) { int dvsec; - dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, - CXL_DVSEC_PORT_GPF); - if (!dvsec) { - pci_warn(pdev, "Port GPF DVSEC not present\n"); + dvsec = cxl_gpf_get_dvsec(dport_dev, true); + if (!dvsec) return -EINVAL; - } port->gpf_dvsec = dvsec; } + pdev = to_pci_dev(dport_dev); update_gpf_port_dvsec(pdev, port->gpf_dvsec, 1); update_gpf_port_dvsec(pdev, port->gpf_dvsec, 2); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 55af041df7b2..fc4edd5536b9 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -922,4 +922,6 @@ bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port); #define __mock static #endif +u16 cxl_gpf_get_dvsec(struct device *dev, bool is_port); + #endif /* __CXL_H__ */ From 86349aaaeacd6855914ee1b5a76ef0952fa134eb Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 20 Feb 2025 14:02:33 -0800 Subject: [PATCH 31/39] cxl/pmem: Rename cxl_dirty_shutdown_state() ... to a better suited 'cxl_arm_dirty_shutdown()'. 
Signed-off-by: Davidlohr Bueso Reviewed-by: Dave Jiang Reviewed-by: Li Ming Reviewed-by: Ira Weiny Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250220220235.276831-3-dave@stgolabs.net Signed-off-by: Dave Jiang --- drivers/cxl/core/mbox.c | 4 ++-- drivers/cxl/cxlmem.h | 2 +- drivers/cxl/pmem.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 5b89ae5c5e28..3687c8da1927 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1308,7 +1308,7 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds) } EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, "CXL"); -int cxl_dirty_shutdown_state(struct cxl_memdev_state *mds) +int cxl_arm_dirty_shutdown(struct cxl_memdev_state *mds) { struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; struct cxl_mbox_cmd mbox_cmd; @@ -1324,7 +1324,7 @@ int cxl_dirty_shutdown_state(struct cxl_memdev_state *mds) return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); } -EXPORT_SYMBOL_NS_GPL(cxl_dirty_shutdown_state, "CXL"); +EXPORT_SYMBOL_NS_GPL(cxl_arm_dirty_shutdown, "CXL"); int cxl_set_timestamp(struct cxl_memdev_state *mds) { diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 5d49e0a93426..0e2df6904454 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -834,7 +834,7 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd, enum cxl_event_log_type type, enum cxl_event_type event_type, const uuid_t *uuid, union cxl_event *evt); -int cxl_dirty_shutdown_state(struct cxl_memdev_state *mds); +int cxl_arm_dirty_shutdown(struct cxl_memdev_state *mds); int cxl_set_timestamp(struct cxl_memdev_state *mds); int cxl_poison_state_init(struct cxl_memdev_state *mds); int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len, diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c index a39e2c52d7ab..6b284962592f 100644 --- a/drivers/cxl/pmem.c +++ b/drivers/cxl/pmem.c @@ -90,7 +90,7 @@ static int cxl_nvdimm_probe(struct device *dev) * clear it upon a successful GPF flow. The exception to this * is upon Viral detection, per CXL 3.2 section 12.4.2. */ - if (cxl_dirty_shutdown_state(mds)) + if (cxl_arm_dirty_shutdown(mds)) dev_warn(dev, "GPF: could not dirty shutdown state\n"); dev_set_drvdata(dev, nvdimm); From 7d0ecc0bd83dc2b2f46087f955c9572073e45aca Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 20 Feb 2025 14:02:34 -0800 Subject: [PATCH 32/39] cxl/pmem: Export dirty shutdown count via sysfs Similar to how the acpi_nfit driver exports the Optane dirty shutdown count, introduce: /sys/bus/cxl/devices/nvdimm-bridge0/ndbusX/nmemY/cxl/dirty_shutdown The attribute is exposed only under the conditions that 1) dirty shutdown can be set, 2) the Device GPF DVSEC exists, and 3) the count itself can be retrieved.
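As a usage illustration, a minimal userspace sketch for consuming the new attribute (the bridge/bus/dimm instance numbers are hypothetical; absence of the file means the device lacks dirty tracking support):

	#include <stdio.h>

	int main(void)
	{
		/* hypothetical instances: nvdimm-bridge0, ndbus0, nmem0 */
		const char *path = "/sys/bus/cxl/devices/nvdimm-bridge0/ndbus0/nmem0/cxl/dirty_shutdown";
		unsigned long long count;
		FILE *f = fopen(path, "r");

		if (!f)
			return 1; /* attribute absent: no dirty tracking */
		if (fscanf(f, "%llu", &count) == 1)
			printf("dirty shutdowns: %llu\n", count);
		fclose(f);
		return 0;
	}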
Suggested-by: Dan Williams Signed-off-by: Davidlohr Bueso Reviewed-by: Dave Jiang Reviewed-by: Ira Weiny Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250220220235.276831-4-dave@stgolabs.net Signed-off-by: Dave Jiang --- Documentation/ABI/testing/sysfs-bus-cxl | 12 +++ Documentation/driver-api/cxl/maturity-map.rst | 2 +- drivers/cxl/core/mbox.c | 21 +++++ drivers/cxl/cxl.h | 1 + drivers/cxl/cxlmem.h | 13 ++++ drivers/cxl/pmem.c | 77 +++++++++++++++++-- 6 files changed, 117 insertions(+), 9 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 3f5627a1210a..a7491d214098 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -586,3 +586,15 @@ Description: See Documentation/ABI/stable/sysfs-devices-node. access0 provides the number to the closest initiator and access1 provides the number to the closest CPU. + + +What: /sys/bus/cxl/devices/nvdimm-bridge0/ndbusX/nmemY/cxl/dirty_shutdown +Date: February, 2025 +KernelVersion: v6.15 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) The device dirty shutdown count value, which is the number + of times the device could have incurred potential data loss. + The count is persistent across power loss and wraps back to 0 + upon overflow. If this file is not present, the device does not + have the necessary support for dirty tracking. diff --git a/Documentation/driver-api/cxl/maturity-map.rst b/Documentation/driver-api/cxl/maturity-map.rst index 99dd2c841e69..a2288f9df658 100644 --- a/Documentation/driver-api/cxl/maturity-map.rst +++ b/Documentation/driver-api/cxl/maturity-map.rst @@ -130,7 +130,7 @@ Mailbox commands * [0] Switch CCI * [3] Timestamp * [1] PMEM labels -* [1] PMEM GPF / Dirty Shutdown +* [3] PMEM GPF / Dirty Shutdown * [0] Scan Media PMU diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 3687c8da1927..ec7b70ae5c06 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1308,6 +1308,27 @@ int cxl_mem_create_range_info(struct cxl_memdev_state *mds) } EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, "CXL"); +int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + struct cxl_mbox_get_health_info_out hi; + struct cxl_mbox_cmd mbox_cmd; + int rc; + + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_GET_HEALTH_INFO, + .size_out = sizeof(hi), + .payload_out = &hi, + }; + + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (!rc) + *count = le32_to_cpu(hi.dirty_shutdown_cnt); + + return rc; +} +EXPORT_SYMBOL_NS_GPL(cxl_get_dirty_count, "CXL"); + int cxl_arm_dirty_shutdown(struct cxl_memdev_state *mds) { struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index fc4edd5536b9..b265347cedce 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -563,6 +563,7 @@ struct cxl_nvdimm { struct device dev; struct cxl_memdev *cxlmd; u8 dev_id[CXL_DEV_ID_LEN]; /* for nvdimm, string of 'serial' */ + u64 dirty_shutdowns; }; struct cxl_pmem_region_mapping { diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 0e2df6904454..236e418d26f4 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -693,6 +693,18 @@ struct cxl_mbox_set_partition_info { #define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0) +/* Get Health Info Output Payload CXL 3.2 Spec 8.2.10.9.3.1 Table 8-148 */ +struct cxl_mbox_get_health_info_out { + u8 health_status; + u8 media_status; + u8
additional_status; + u8 life_used; + __le16 device_temperature; + __le32 dirty_shutdown_cnt; + __le32 corrected_volatile_error_cnt; + __le32 corrected_persistent_error_cnt; +} __packed; + /* Set Shutdown State Input Payload CXL 3.2 Spec 8.2.10.9.3.5 Table 8-152 */ struct cxl_mbox_set_shutdown_state_in { u8 state; @@ -834,6 +846,7 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd, enum cxl_event_log_type type, enum cxl_event_type event_type, const uuid_t *uuid, union cxl_event *evt); +int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count); int cxl_arm_dirty_shutdown(struct cxl_memdev_state *mds); int cxl_set_timestamp(struct cxl_memdev_state *mds); int cxl_poison_state_init(struct cxl_memdev_state *mds); diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c index 6b284962592f..81db5d629049 100644 --- a/drivers/cxl/pmem.c +++ b/drivers/cxl/pmem.c @@ -42,15 +42,44 @@ static ssize_t id_show(struct device *dev, struct device_attribute *attr, char * } static DEVICE_ATTR_RO(id); +static ssize_t dirty_shutdown_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm); + + return sysfs_emit(buf, "%llu\n", cxl_nvd->dirty_shutdowns); +} +static DEVICE_ATTR_RO(dirty_shutdown); + static struct attribute *cxl_dimm_attributes[] = { &dev_attr_id.attr, &dev_attr_provider.attr, + &dev_attr_dirty_shutdown.attr, NULL }; +#define CXL_INVALID_DIRTY_SHUTDOWN_COUNT ULLONG_MAX +static umode_t cxl_dimm_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + if (a == &dev_attr_dirty_shutdown.attr) { + struct device *dev = kobj_to_dev(kobj); + struct nvdimm *nvdimm = to_nvdimm(dev); + struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm); + + if (cxl_nvd->dirty_shutdowns == + CXL_INVALID_DIRTY_SHUTDOWN_COUNT) + return 0; + } + + return a->mode; +} + static const struct attribute_group cxl_dimm_attribute_group = { .name = "cxl", .attrs = cxl_dimm_attributes, + .is_visible = cxl_dimm_visible }; static const struct attribute_group *cxl_dimm_attribute_groups[] = { @@ -58,6 +87,38 @@ static const struct attribute_group *cxl_dimm_attribute_groups[] = { NULL }; +static void cxl_nvdimm_arm_dirty_shutdown_tracking(struct cxl_nvdimm *cxl_nvd) +{ + struct cxl_memdev *cxlmd = cxl_nvd->cxlmd; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + struct device *dev = &cxl_nvd->dev; + u32 count; + + /* + * Dirty tracking is enabled and exposed to the user, only when: + * - dirty shutdown on the device can be set, and, + * - the device has a Device GPF DVSEC (albeit unused), and, + * - the Get Health Info cmd can retrieve the device's dirty count. 
+ */ + cxl_nvd->dirty_shutdowns = CXL_INVALID_DIRTY_SHUTDOWN_COUNT; + + if (cxl_arm_dirty_shutdown(mds)) { + dev_warn(dev, "GPF: could not set dirty shutdown state\n"); + return; + } + + if (!cxl_gpf_get_dvsec(cxlds->dev, false)) + return; + + if (cxl_get_dirty_count(mds, &count)) { + dev_warn(dev, "GPF: could not retrieve dirty count\n"); + return; + } + + cxl_nvd->dirty_shutdowns = count; +} + static int cxl_nvdimm_probe(struct device *dev) { struct cxl_nvdimm *cxl_nvd = to_cxl_nvdimm(dev); @@ -78,20 +139,20 @@ static int cxl_nvdimm_probe(struct device *dev) set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask); set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask); set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask); - nvdimm = __nvdimm_create(cxl_nvb->nvdimm_bus, cxl_nvd, - cxl_dimm_attribute_groups, flags, - cmd_mask, 0, NULL, cxl_nvd->dev_id, - cxl_security_ops, NULL); - if (!nvdimm) - return -ENOMEM; /* * Set dirty shutdown now, with the expectation that the device * clear it upon a successful GPF flow. The exception to this * is upon Viral detection, per CXL 3.2 section 12.4.2. */ - if (cxl_arm_dirty_shutdown(mds)) - dev_warn(dev, "GPF: could not dirty shutdown state\n"); + cxl_nvdimm_arm_dirty_shutdown_tracking(cxl_nvd); + + nvdimm = __nvdimm_create(cxl_nvb->nvdimm_bus, cxl_nvd, + cxl_dimm_attribute_groups, flags, + cmd_mask, 0, NULL, cxl_nvd->dev_id, + cxl_security_ops, NULL); + if (!nvdimm) + return -ENOMEM; dev_set_drvdata(dev, nvdimm); return devm_add_action_or_reset(dev, unregister_nvdimm, nvdimm); From 6eb52f63ea47c6aa7f820262911be47602e12da6 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 20 Feb 2025 14:02:35 -0800 Subject: [PATCH 33/39] tools/testing/cxl: Set Shutdown State support Add support to emulate the CXL Set Shutdown State operation. Signed-off-by: Davidlohr Bueso Reviewed-by: Dave Jiang Reviewed-by: Li Ming Reviewed-by: Ira Weiny Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250220220235.276831-5-dave@stgolabs.net Signed-off-by: Dave Jiang --- tools/testing/cxl/test/mem.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 8d731bd63988..c99105dabe30 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -65,6 +65,10 @@ static struct cxl_cel_entry mock_cel[] = { .opcode = cpu_to_le16(CXL_MBOX_OP_GET_HEALTH_INFO), .effect = CXL_CMD_EFFECT_NONE, }, + { + .opcode = cpu_to_le16(CXL_MBOX_OP_SET_SHUTDOWN_STATE), + .effect = POLICY_CHANGE_IMMEDIATE, + }, { .opcode = cpu_to_le16(CXL_MBOX_OP_GET_POISON), .effect = CXL_CMD_EFFECT_NONE, @@ -161,6 +165,7 @@ struct cxl_mockmem_data { u8 event_buf[SZ_4K]; u64 timestamp; unsigned long sanitize_timeout; + u8 shutdown_state; }; static struct mock_event_log *event_find_log(struct device *dev, int log_type) @@ -1088,6 +1093,21 @@ static int mock_health_info(struct cxl_mbox_cmd *cmd) return 0; } +static int mock_set_shutdown_state(struct cxl_mockmem_data *mdata, + struct cxl_mbox_cmd *cmd) +{ + struct cxl_mbox_set_shutdown_state_in *ss = cmd->payload_in; + + if (cmd->size_in != sizeof(*ss)) + return -EINVAL; + + if (cmd->size_out != 0) + return -EINVAL; + + mdata->shutdown_state = ss->state; + return 0; +} + static struct mock_poison { struct cxl_dev_state *cxlds; u64 dpa; @@ -1421,6 +1441,9 @@ static int cxl_mock_mbox_send(struct cxl_mailbox *cxl_mbox, case CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE: rc = mock_passphrase_secure_erase(mdata, cmd); break; + case CXL_MBOX_OP_SET_SHUTDOWN_STATE: + rc = mock_set_shutdown_state(mdata, 
cmd); + break; case CXL_MBOX_OP_GET_POISON: rc = mock_get_poison(cxlds, cmd); break; From 84f8b6e242deb997f2b32b4fc0895e8703704029 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Thu, 27 Feb 2025 18:18:48 +0800 Subject: [PATCH 34/39] cxl/mem: Do not return error if CONFIG_CXL_MCE unset CONFIG_CXL_MCE depends on CONFIG_MEMORY_FAILURE. If CONFIG_CXL_MCE is not set, devm_cxl_register_mce_notifier() will return -EOPNOTSUPP, which causes cxl_memdev_state_create() to fail and, in turn, CXL PCI device probing to fail. It should not break CXL PCI device probing in this case. Add a check in cxl_memdev_state_create() for whether the value returned by devm_cxl_register_mce_notifier() is -EOPNOTSUPP; if so, just emit a warning log and do not let cxl_memdev_state_create() return an error. Signed-off-by: Li Ming Reviewed-by: Dave Jiang Reviewed-by: Ira Weiny Link: https://patch.msgid.link/20250227101848.388595-1-ming.li@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/core/mbox.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index e088c6ba1705..7450c4df522e 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -1473,7 +1473,9 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev) mds->cxlds.type = CXL_DEVTYPE_CLASSMEM; rc = devm_cxl_register_mce_notifier(dev, &mds->mce_notifier); - if (rc) + if (rc == -EOPNOTSUPP) + dev_warn(dev, "CXL MCE unsupported\n"); + else if (rc) return ERR_PTR(rc); return mds; From 114a89b433aa899cdcc867bb26806a41aae505ee Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Wed, 26 Feb 2025 14:19:27 -0800 Subject: [PATCH 35/39] cxl/test: Define a CFMWS capable of a 3-way HB interleave The CXL unit test cxl-xor-region.sh is skipping a 1+1+1 region interleave test case because the window is not defined. Additionally, the upcoming expansion of 3-way HB interleave test cases (like 2+2+2) requires the same window. Replace an unused CFMWS with a 3-way-capable CFMWS in the set of CFMWS entries loaded when interleave_arithmetic=1.
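For reference when reading the hunk below: the CFMWS fields carry ACPI-encoded values, so .interleave_ways = 8 is the encoding for 3-way and .granularity = 1 encodes 512 bytes. A hedged sketch of the ways decode (the helper name is made up for illustration):

	/* ACPI CEDT CFMWS encoded interleave ways: 0..4 => 1,2,4,8,16; 8..10 => 3,6,12 */
	static int cfmws_decode_ways(unsigned char eniw)
	{
		if (eniw <= 4)
			return 1 << eniw;
		if (eniw >= 8 && eniw <= 10)
			return 3 << (eniw - 8);
		return -1; /* reserved encoding */
	}

With 3 ways, the SZ_512M * 6 (3GB) window divides evenly across the three targets, 1GB each.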
Signed-off-by: Alison Schofield Reviewed-by: Dave Jiang Reviewed-by: Ira Weiny Tested-by: Li Zhijian Link: https://patch.msgid.link/20250226221931.2352061-1-alison.schofield@intel.com Signed-off-by: Dave Jiang --- tools/testing/cxl/test/cxl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 083a66a52731..1c3336095923 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -155,7 +155,7 @@ static struct { } cfmws7; struct { struct acpi_cedt_cfmws cfmws; - u32 target[4]; + u32 target[3]; } cfmws8; struct { struct acpi_cedt_cxims cxims; @@ -331,14 +331,14 @@ static struct { .length = sizeof(mock_cedt.cfmws8), }, .interleave_arithmetic = ACPI_CEDT_CFMWS_ARITHMETIC_XOR, - .interleave_ways = 2, - .granularity = 0, + .interleave_ways = 8, + .granularity = 1, .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = FAKE_QTG_ID, - .window_size = SZ_256M * 16UL, + .window_size = SZ_512M * 6UL, }, - .target = { 0, 1, 0, 1, }, + .target = { 0, 1, 2, }, }, .cxims0 = { .cxims = { From 3d3e3b94440631179b7b6ffb1a64b944b27519c1 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 28 Feb 2025 13:47:39 -0700 Subject: [PATCH 36/39] cxl: Fix warning from emitting resource_size_t as long long int on 32-bit systems Reported by the kernel test robot from an ARM build: drivers/cxl/core/region.c:3263:26: warning: format '%lld' expects argument of type 'long long int', but argument 3 has type 'resource_size_t' {aka 'unsigned int'} [-Wformat=] On a 32-bit system, resource_size_t is a 32-bit type, whereas on a 64-bit system it is a 64-bit type. Use the %pa format specifier, which is meant for resource_size_t, to handle both. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202503010252.mIDhZ5kY-lkp@intel.com/ Fixes: 0ec9849b6333 ("acpi/hmat / cxl: Add extended linear cache support for CXL") Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20250228204739.3849309-1-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 80ba19cf3094..a09ae21a26c6 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3251,8 +3251,8 @@ static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr, if (size != cache_size) { dev_warn(&cxlr->dev, - "Extended Linear Cache size %lld != CXL size %lld. No Support!", - cache_size, size); + "Extended Linear Cache size %pa != CXL size %pa. No Support!", + &cache_size, &size); return -EOPNOTSUPP; } From 962ac4c83e81e38b0761f31b500b398cfbc33857 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 24 Feb 2025 12:29:29 -0600 Subject: [PATCH 37/39] cxl/Documentation: Remove 'mixed' from sysfs mode doc Commit 188e9529a606 ("cxl: Remove the CXL_DECODER_MIXED mistake") removed the mixed mode. Remove it from the sysfs documentation.
Cc: Dan Williams Signed-off-by: Ira Weiny Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20250224-remove-mixed-sysfs-v1-1-a329db313dac@intel.com Signed-off-by: Dave Jiang --- Documentation/ABI/testing/sysfs-bus-cxl | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 6d911f046a78..99bb3faf7a0e 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -339,14 +339,13 @@ KernelVersion: v6.0 Contact: linux-cxl@vger.kernel.org Description: (RW) When a CXL decoder is of devtype "cxl_decoder_endpoint" it - translates from a host physical address range, to a device local - address range. Device-local address ranges are further split - into a 'ram' (volatile memory) range and 'pmem' (persistent - memory) range. The 'mode' attribute emits one of 'ram', 'pmem', - 'mixed', or 'none'. The 'mixed' indication is for error cases - when a decoder straddles the volatile/persistent partition - boundary, and 'none' indicates the decoder is not actively - decoding, or no DPA allocation policy has been set. + translates from a host physical address range, to a device + local address range. Device-local address ranges are further + split into a 'ram' (volatile memory) range and 'pmem' + (persistent memory) range. The 'mode' attribute emits one of + 'ram', 'pmem', or 'none'. The 'none' indicates the decoder is + not actively decoding, or no DPA allocation policy has been + set. 'mode' can be written, when the decoder is in the 'disabled' state, with either 'ram' or 'pmem' to set the boundaries for the From aae0594a7053c60b82621136257c8b648c67b512 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Thu, 6 Mar 2025 13:36:51 -0800 Subject: [PATCH 38/39] cxl/region: Quiet some dev_warn()s in extended linear cache setup Extended Linear Cache (ELC) setup code emits a dev_warn(), "Extended linear cache calculation failed." for issues found while setting up the ELC. For platforms without CONFIG_ACPI_HMAT, every auto region setup will emit the warning because the default !ACPI_HMAT return value is EOPNOTSUPP. Suppress it by skipping the warning for EOPNOTSUPP. Change the EOPNOTSUPP in the actual ELC failure path to ENXIO. Remove the check and ensuing dev_warn() for when the region resource size is zero. The endpoint decoder's hpa_range used to create the resource is checked in init_hdm_decoder(), so the size cannot be zero here. For good measure, add the rc value to the dev_warn(). It will either be the -ENOENT returned by HMAT if the mem target is not found, or the -ENXIO from the region driver calculation.
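For context, the !CONFIG_ACPI_HMAT default comes from a stub of roughly this shape (a sketch under that assumption, not the verbatim kernel code):

	#ifndef CONFIG_ACPI_HMAT
	static inline int cxl_acpi_get_extended_linear_cache_size(struct resource *backing_res,
								  int nid, resource_size_t *size)
	{
		return -EOPNOTSUPP; /* no HMAT, so no ELC lookup */
	}
	#endif

which is why, before this change, every auto region setup on such configs tripped the warning.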
Reviewed-by: Li Ming Signed-off-by: Alison Schofield Link: https://patch.msgid.link/20250306213700.2606304-1-alison.schofield@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index a09ae21a26c6..6d8bdb53f258 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3235,13 +3235,10 @@ static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr, struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); struct cxl_region_params *p = &cxlr->params; int nid = phys_to_target_node(res->start); - resource_size_t size, cache_size, start; + resource_size_t size = resource_size(res); + resource_size_t cache_size, start; int rc; - size = resource_size(res); - if (!size) - return -EINVAL; - rc = cxl_acpi_get_extended_linear_cache_size(res, nid, &cache_size); if (rc) return rc; @@ -3253,7 +3250,7 @@ static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr, dev_warn(&cxlr->dev, "Extended Linear Cache size %pa != CXL size %pa. No Support!", &cache_size, &size); - return -EOPNOTSUPP; + return -ENXIO; } /* @@ -3305,14 +3302,14 @@ static int __construct_region(struct cxl_region *cxlr, dev_name(&cxlr->dev)); rc = cxl_extended_linear_cache_resize(cxlr, res); - if (rc) { + if (rc && rc != -EOPNOTSUPP) { /* * Failing to support extended linear cache region resize does not * prevent the region from functioning. Only causes cxl list showing * incorrect region size. */ dev_warn(cxlmd->dev.parent, - "Extended linear cache calculation failed.\n" + "Extended linear cache calculation failed rc:%d\n", rc); } rc = insert_resource(cxlrd->res, res); From aae0594a7053c60b82621136257c8b648c67b512 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Mon, 17 Mar 2025 15:01:24 +0800 Subject: [PATCH 39/39] cxl/region: Fix the first aliased address miscalculation In the extended linear cache (ELC) case, cxl_port_get_spa_cache_alias() helps to get the aliased address of a SPA. It considers the first address in the CXL memory range to be "region start + region cache size + 1", but it should be "region start + region cache size". So if a SPA is equal to "region start + region cache size", its aliased address should be "SPA - region cache size". Signed-off-by: Li Ming Reviewed-by: Alison Schofield Reviewed-by: Ira Weiny Link: https://patch.msgid.link/20250317070124.815028-1-ming.li@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 6d8bdb53f258..c3f4dc244df7 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3460,7 +3460,7 @@ u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa) if (!p->cache_size) return ~0ULL; - if (spa > p->res->start + p->cache_size) + if (spa >= p->res->start + p->cache_size) return spa - p->cache_size; return spa + p->cache_size;
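To make the boundary condition concrete, a worked example with made-up numbers:

	/*
	 * Worked example (hypothetical values): start = 0, cache_size = 4G.
	 * The DRAM (cache) range is [0, 4G); the CXL range begins exactly at 4G.
	 * For spa = 4G, the first CXL address:
	 *   old: spa >  start + cache_size -> false, wrongly returns spa + cache_size = 8G
	 *   new: spa >= start + cache_size -> true, correctly returns spa - cache_size = 0
	 */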