mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-11-01 09:13:37 +00:00
CXL changes for v6.16
- Remove always true condition in cxl features code.
- Add verification of CHBS length for CXL 2.0
- Ignore interleave granularity when interleave ways is 1
- Add update addressing missing MODULE_DESCRIPTION for cxl_test
- A series of cleanups/refactor to prep for AMD Zen5 translate code
- Clean %pa debug printk in core/hdm.c
- Documentation updates
- Update to CXL Maturity Map
- Fixes to source linking in CXL documentation
- CXL documentation fixes, spelling corrections
- A large collection of CXL documentation for the entire CXL subsystem, including
documentation on CXL related platform and firmware notes
- Remove redundant code of cxlctl_get_supported_features()
- Series to support CXL RAS Features
- Including "Patrol Scrub Control", "Error Check Scrub", "Performance Maintenance"
and "Memory Sparing". The series connects CXL to EDAC.
-----BEGIN PGP SIGNATURE-----
iQIzBAABCAAdFiEE5DAy15EJMCV1R6v9YGjFFmlTOEoFAmg/EX0ACgkQYGjFFmlT
OErkxA/+MuvYH6PjdSwbJiJUcLCrq0gdNczX7t+CAo3v4rNs5CrOJSKuXMxGVIpf
lCjwS/J7j4XOa9DigDO1Dxl4PWG/R0K3HjbHQJLVjy3jmsVr5GgJVt4s5EqS78Or
QoM9d/2dq8Q6dqk89Z4rFY2JlAmXcibe+lz2m9k5vy8KPQvrZI1KruZMG0qN1rWC
SBa+eUWW49MP3Ab6pBDRRCI7EPcJ44QF+49SWXrkkiJjll/OTtYu3V1JymaPV4zT
/UM/CwHLnmb5odUfOx5EZJcZIZzqasBD28Xu6Y6Vjs2pgTNPr9VNGCs+8lLwQCg7
1O8cyPjPa1p5HwKo2INJfM1Xdpo1Nqar1qGcSPVJKk0+a538i07YuXR3uWqnJ+mO
uplJwvtL1Rvg9h0C0fHwfB86Tl5poFIDn0zeZQ4tqMWH6y2wged5PcS5RSVuVYdX
CHEzAvp1RreQvXd9KcSo6ITKbIvv5PplfcyTiftG0R71ewXYOpB386D7ihh8ZvvO
Y6TJN1nXUv8w3ve7rbs3T2ncX+BbhHRKSnXTp3rbkXTQLF5t+2gjzVuwSSEH71Ps
4bD+7EeE0JGSz76qLbfQPNt6l3HnG6ctycpAydsUs/YmIOnciKpT5R9OGwncKHrT
/Ccx+9uN5+CizqsKCi+rxadoOw7Pk4bKmo0a4wCZ5sDPGix0g18=
=evKj
-----END PGP SIGNATURE-----
Merge tag 'cxl-for-6.16' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl
Pull Compute Express Link (CXL) updates from Dave Jiang:
- Remove always true condition in cxl features code
- Add verification of CHBS length for CXL 2.0
- Ignore interleave granularity when interleave ways is 1
- Add update addressing missing MODULE_DESCRIPTION for cxl_test
- A series of cleanups/refactor to prep for AMD Zen5 translate code
- Clean %pa debug printk in core/hdm.c
- Documentation updates:
- Update to CXL Maturity Map
- Fixes to source linking in CXL documentation
- CXL documentation fixes, spelling corrections
- A large collection of CXL documentation for the entire CXL
subsystem, including documentation on CXL related platform and
firmware notes
- Remove redundant code of cxlctl_get_supported_features()
- Series to support CXL RAS Features
- Including "Patrol Scrub Control", "Error Check Scrub",
"Performance Maintenance" and "Memory Sparing". The series
connects CXL to EDAC.
* tag 'cxl-for-6.16' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl: (53 commits)
cxl/edac: Add CXL memory device soft PPR control feature
cxl/edac: Add CXL memory device memory sparing control feature
cxl/edac: Support for finding memory operation attributes from the current boot
cxl/edac: Add support for PERFORM_MAINTENANCE command
cxl/edac: Add CXL memory device ECS control feature
cxl/edac: Add CXL memory device patrol scrub control feature
cxl: Update prototype of function get_support_feature_info()
EDAC: Update documentation for the CXL memory patrol scrub control feature
cxl/features: Remove the inline specifier from to_cxlfs()
cxl/feature: Remove redundant code of get supported features
docs: ABI: Fix "firwmare" to "firmware"
cxl/Documentation: Fix typo in sysfs write_bandwidth attribute path
cxl: doc/linux/access-coordinates Update access coordinates calculation methods
cxl: docs/platform/acpi/srat Add generic target documentation
cxl: docs/platform/cdat reference documentation
Documentation: Update the CXL Maturity Map
cxl: Sync up the driver-api/cxl documentation
cxl: docs - add self-referencing cross-links
cxl: docs/allocation/hugepages
cxl: docs/allocation/reclaim
...
This commit is contained in:
commit
29e9359005
59 changed files with 6770 additions and 267 deletions
|
|
@ -242,7 +242,7 @@ Description:
|
|||
decoding a Host Physical Address range. Note that this number
|
||||
may be elevated without any regionX objects active or even
|
||||
enumerated, as this may be due to decoders established by
|
||||
platform firwmare or a previous kernel (kexec).
|
||||
platform firmware or a previous kernel (kexec).
|
||||
|
||||
|
||||
What: /sys/bus/cxl/devices/decoderX.Y
|
||||
|
|
@ -572,7 +572,7 @@ Description:
|
|||
|
||||
|
||||
What: /sys/bus/cxl/devices/regionZ/accessY/read_bandwidth
|
||||
/sys/bus/cxl/devices/regionZ/accessY/write_banwidth
|
||||
/sys/bus/cxl/devices/regionZ/accessY/write_bandwidth
|
||||
Date: Jan, 2024
|
||||
KernelVersion: v6.9
|
||||
Contact: linux-cxl@vger.kernel.org
|
||||
|
|
|
|||
|
|
@ -1,91 +0,0 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
.. include:: <isonum.txt>
|
||||
|
||||
==================================
|
||||
CXL Access Coordinates Computation
|
||||
==================================
|
||||
|
||||
Shared Upstream Link Calculation
|
||||
================================
|
||||
For certain CXL region construction with endpoints behind CXL switches (SW) or
|
||||
Root Ports (RP), there is the possibility of the total bandwidth for all
|
||||
the endpoints behind a switch being more than the switch upstream link.
|
||||
A similar situation can occur within the host, upstream of the root ports.
|
||||
The CXL driver performs an additional pass after all the targets have
|
||||
arrived for a region in order to recalculate the bandwidths with possible
|
||||
upstream link being a limiting factor in mind.
|
||||
|
||||
The algorithm assumes the configuration is a symmetric topology as that
|
||||
maximizes performance. When asymmetric topology is detected, the calculation
|
||||
is aborted. An asymmetric topology is detected during topology walk where the
|
||||
number of RPs detected as a grandparent is not equal to the number of devices
|
||||
iterated in the same iteration loop. The assumption is made that subtle
|
||||
asymmetry in properties does not happen and all paths to EPs are equal.
|
||||
|
||||
There can be multiple switches under an RP. There can be multiple RPs under
|
||||
a CXL Host Bridge (HB). There can be multiple HBs under a CXL Fixed Memory
|
||||
Window Structure (CFMWS).
|
||||
|
||||
An example hierarchy:
|
||||
|
||||
> CFMWS 0
|
||||
> |
|
||||
> _________|_________
|
||||
> | |
|
||||
> ACPI0017-0 ACPI0017-1
|
||||
> GP0/HB0/ACPI0016-0 GP1/HB1/ACPI0016-1
|
||||
> | | | |
|
||||
> RP0 RP1 RP2 RP3
|
||||
> | | | |
|
||||
> SW 0 SW 1 SW 2 SW 3
|
||||
> | | | | | | | |
|
||||
> EP0 EP1 EP2 EP3 EP4 EP5 EP6 EP7
|
||||
|
||||
Computation for the example hierarchy:
|
||||
|
||||
Min (GP0 to CPU BW,
|
||||
Min(SW 0 Upstream Link to RP0 BW,
|
||||
Min(SW0SSLBIS for SW0DSP0 (EP0), EP0 DSLBIS, EP0 Upstream Link) +
|
||||
Min(SW0SSLBIS for SW0DSP1 (EP1), EP1 DSLBIS, EP1 Upstream link)) +
|
||||
Min(SW 1 Upstream Link to RP1 BW,
|
||||
Min(SW1SSLBIS for SW1DSP0 (EP2), EP2 DSLBIS, EP2 Upstream Link) +
|
||||
Min(SW1SSLBIS for SW1DSP1 (EP3), EP3 DSLBIS, EP3 Upstream link))) +
|
||||
Min (GP1 to CPU BW,
|
||||
Min(SW 2 Upstream Link to RP2 BW,
|
||||
Min(SW2SSLBIS for SW2DSP0 (EP4), EP4 DSLBIS, EP4 Upstream Link) +
|
||||
Min(SW2SSLBIS for SW2DSP1 (EP5), EP5 DSLBIS, EP5 Upstream link)) +
|
||||
Min(SW 3 Upstream Link to RP3 BW,
|
||||
Min(SW3SSLBIS for SW3DSP0 (EP6), EP6 DSLBIS, EP6 Upstream Link) +
|
||||
Min(SW3SSLBIS for SW3DSP1 (EP7), EP7 DSLBIS, EP7 Upstream link))))
|
||||
|
||||
The calculation starts at cxl_region_shared_upstream_perf_update(). A xarray
|
||||
is created to collect all the endpoint bandwidths via the
|
||||
cxl_endpoint_gather_bandwidth() function. The min() of bandwidth from the
|
||||
endpoint CDAT and the upstream link bandwidth is calculated. If the endpoint
|
||||
has a CXL switch as a parent, then min() of calculated bandwidth and the
|
||||
bandwidth from the SSLBIS for the switch downstream port that is associated
|
||||
with the endpoint is calculated. The final bandwidth is stored in a
|
||||
'struct cxl_perf_ctx' in the xarray indexed by a device pointer. If the
|
||||
endpoint is direct attached to a root port (RP), the device pointer would be an
|
||||
RP device. If the endpoint is behind a switch, the device pointer would be the
|
||||
upstream device of the parent switch.
|
||||
|
||||
At the next stage, the code walks through one or more switches if they exist
|
||||
in the topology. For endpoints directly attached to RPs, this step is skipped.
|
||||
If there is another switch upstream, the code takes the min() of the current
|
||||
gathered bandwidth and the upstream link bandwidth. If there's a switch
|
||||
upstream, then the SSLBIS of the upstream switch.
|
||||
|
||||
Once the topology walk reaches the RP, whether it's direct attached endpoints
|
||||
or walking through the switch(es), cxl_rp_gather_bandwidth() is called. At
|
||||
this point all the bandwidths are aggregated per each host bridge, which is
|
||||
also the index for the resulting xarray.
|
||||
|
||||
The next step is to take the min() of the per host bridge bandwidth and the
|
||||
bandwidth from the Generic Port (GP). The bandwidths for the GP is retrieved
|
||||
via ACPI tables SRAT/HMAT. The min bandwidth are aggregated under the same
|
||||
ACPI0017 device to form a new xarray.
|
||||
|
||||
Finally, the cxl_region_update_bandwidth() is called and the aggregated
|
||||
bandwidth from all the members of the last xarray is updated for the
|
||||
access coordinates residing in the cxl region (cxlr) context.
|
||||
60
Documentation/driver-api/cxl/allocation/dax.rst
Normal file
60
Documentation/driver-api/cxl/allocation/dax.rst
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===========
|
||||
DAX Devices
|
||||
===========
|
||||
CXL capacity exposed as a DAX device can be accessed directly via mmap.
|
||||
Users may wish to use this interface mechanism to write their own userland
|
||||
CXL allocator, or to manage shared or persistent memory regions across multiple
|
||||
hosts.
|
||||
|
||||
If the capacity is shared across hosts or persistent, appropriate flushing
|
||||
mechanisms must be employed unless the region supports Snoop Back-Invalidate.
|
||||
|
||||
Note that mappings must be aligned (size and base) to the dax device's base
|
||||
alignment, which is typically 2MB - but may be configured larger.
|
||||
|
||||
::
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <sys/mman.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#define DEVICE_PATH "/dev/dax0.0" // Replace DAX device path
|
||||
#define DEVICE_SIZE (4ULL * 1024 * 1024 * 1024) // 4GB
|
||||
|
||||
int main() {
|
||||
int fd;
|
||||
void* mapped_addr;
|
||||
|
||||
/* Open the DAX device */
|
||||
fd = open(DEVICE_PATH, O_RDWR);
|
||||
if (fd < 0) {
|
||||
perror("open");
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Map the device into memory */
|
||||
mapped_addr = mmap(NULL, DEVICE_SIZE, PROT_READ | PROT_WRITE,
|
||||
MAP_SHARED, fd, 0);
|
||||
if (mapped_addr == MAP_FAILED) {
|
||||
perror("mmap");
|
||||
close(fd);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("Mapped address: %p\n", mapped_addr);
|
||||
|
||||
/* You can now access the device through the mapped address */
|
||||
uint64_t* ptr = (uint64_t*)mapped_addr;
|
||||
*ptr = 0x1234567890abcdef; // Write a value to the device
|
||||
printf("Value at address %p: 0x%016llx\n", ptr, *ptr);
|
||||
|
||||
/* Clean up */
|
||||
munmap(mapped_addr, DEVICE_SIZE);
|
||||
close(fd);
|
||||
return 0;
|
||||
}
|
||||
32
Documentation/driver-api/cxl/allocation/hugepages.rst
Normal file
32
Documentation/driver-api/cxl/allocation/hugepages.rst
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==========
|
||||
Huge Pages
|
||||
==========
|
||||
|
||||
Contiguous Memory Allocator
|
||||
===========================
|
||||
CXL Memory onlined as SystemRAM during early boot is eligible for use by CMA,
|
||||
as the NUMA node hosting that capacity will be `Online` at the time CMA
|
||||
carves out contiguous capacity.
|
||||
|
||||
CXL Memory deferred to the CXL Driver for configuration cannot have its
|
||||
capacity allocated by CMA - as the NUMA node hosting the capacity is `Offline`
|
||||
at :code:`__init` time - when CMA carves out contiguous capacity.
|
||||
|
||||
HugeTLB
|
||||
=======
|
||||
Different huge page sizes allow different memory configurations.
|
||||
|
||||
2MB Huge Pages
|
||||
--------------
|
||||
All CXL capacity regardless of configuration time or memory zone is eligible
|
||||
for use as 2MB huge pages.
|
||||
|
||||
1GB Huge Pages
|
||||
--------------
|
||||
CXL capacity onlined in :code:`ZONE_NORMAL` is eligible for 1GB Gigantic Page
|
||||
allocation.
|
||||
|
||||
CXL capacity onlined in :code:`ZONE_MOVABLE` is not eligible for 1GB Gigantic
|
||||
Page allocation.
|
||||
85
Documentation/driver-api/cxl/allocation/page-allocator.rst
Normal file
85
Documentation/driver-api/cxl/allocation/page-allocator.rst
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==================
|
||||
The Page Allocator
|
||||
==================
|
||||
|
||||
The kernel page allocator services all general page allocation requests, such
|
||||
as :code:`kmalloc`. CXL configuration steps affect the behavior of the page
|
||||
allocator based on the selected `Memory Zone` and `NUMA node` the capacity is
|
||||
placed in.
|
||||
|
||||
This section mostly focuses on how these configurations affect the page
|
||||
allocator (as of Linux v6.15) rather than the overall page allocator behavior.
|
||||
|
||||
NUMA nodes and mempolicy
|
||||
========================
|
||||
Unless a task explicitly registers a mempolicy, the default memory policy
|
||||
of the linux kernel is to allocate memory from the `local NUMA node` first,
|
||||
and fall back to other nodes only if the local node is pressured.
|
||||
|
||||
Generally, we expect to see local DRAM and CXL memory on separate NUMA nodes,
|
||||
with the CXL memory being non-local. Technically, however, it is possible
|
||||
for a compute node to have no local DRAM, and for CXL memory to be the
|
||||
`local` capacity for that compute node.
|
||||
|
||||
|
||||
Memory Zones
|
||||
============
|
||||
CXL capacity may be onlined in :code:`ZONE_NORMAL` or :code:`ZONE_MOVABLE`.
|
||||
|
||||
As of v6.15, the page allocator attempts to allocate from the highest
|
||||
available and compatible ZONE for an allocation from the local node first.
|
||||
|
||||
An example of a `zone incompatibility` is attempting to service an allocation
|
||||
marked :code:`GFP_KERNEL` from :code:`ZONE_MOVABLE`. Kernel allocations are
|
||||
typically not migratable, and as a result can only be serviced from
|
||||
:code:`ZONE_NORMAL` or lower.
|
||||
|
||||
To simplify this, the page allocator will prefer :code:`ZONE_MOVABLE` over
|
||||
:code:`ZONE_NORMAL` by default, but if :code:`ZONE_MOVABLE` is depleted, it
|
||||
will fallback to allocate from :code:`ZONE_NORMAL`.
|
||||
|
||||
|
||||
Zone and Node Quirks
|
||||
====================
|
||||
Let's consider a configuration where the local DRAM capacity is largely onlined
|
||||
into :code:`ZONE_NORMAL`, with no :code:`ZONE_MOVABLE` capacity present. The
|
||||
CXL capacity has the opposite configuration - all onlined in
|
||||
:code:`ZONE_MOVABLE`.
|
||||
|
||||
Under the default allocation policy, the page allocator will completely skip
|
||||
:code:`ZONE_MOVABLE` as a valid allocation target. This is because, as of
|
||||
Linux v6.15, the page allocator does (approximately) the following: ::
|
||||
|
||||
for (each zone in local_node):
|
||||
|
||||
for (each node in fallback_order):
|
||||
|
||||
attempt_allocation(gfp_flags);
|
||||
|
||||
Because the local node does not have :code:`ZONE_MOVABLE`, the CXL node is
|
||||
functionally unreachable for direct allocation. As a result, the only way
|
||||
for CXL capacity to be used is via `demotion` in the reclaim path.
|
||||
|
||||
This configuration also means that if the DRAM node has :code:`ZONE_MOVABLE`
|
||||
capacity - when that capacity is depleted, the page allocator will actually
|
||||
prefer CXL :code:`ZONE_MOVABLE` pages over DRAM :code:`ZONE_NORMAL` pages.
|
||||
|
||||
We may wish to invert this priority in future Linux versions.
|
||||
|
||||
If `demotion` and `swap` are disabled, Linux will begin to cause OOM crashes
|
||||
when the DRAM nodes are depleted. See the reclaim section for more details.
|
||||
|
||||
|
||||
CGroups and CPUSets
|
||||
===================
|
||||
Finally, assuming CXL memory is reachable via the page allocator (i.e. onlined
|
||||
in :code:`ZONE_NORMAL`), the :code:`cpusets.mems_allowed` may be used by
|
||||
containers to limit the accessibility of certain NUMA nodes for tasks in that
|
||||
container. Users may wish to utilize this in multi-tenant systems where some
|
||||
tasks prefer not to use slower memory.
|
||||
|
||||
In the reclaim section we'll discuss some limitations of this interface to
|
||||
prevent demotions of shared data to CXL memory (if demotions are enabled).
|
||||
|
||||
51
Documentation/driver-api/cxl/allocation/reclaim.rst
Normal file
51
Documentation/driver-api/cxl/allocation/reclaim.rst
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=======
|
||||
Reclaim
|
||||
=======
|
||||
Another way CXL memory can be utilized *indirectly* is via the reclaim system
|
||||
in :code:`mm/vmscan.c`. Reclaim is engaged when memory capacity on the system
|
||||
becomes pressured based on global and cgroup-local `watermark` settings.
|
||||
|
||||
In this section we won't discuss the `watermark` configurations, just how CXL
|
||||
memory can be consumed by various pieces of the reclaim system.
|
||||
|
||||
Demotion
|
||||
========
|
||||
By default, the reclaim system will prefer swap (or zswap) when reclaiming
|
||||
memory. Enabling :code:`kernel/mm/numa/demotion_enabled` will cause vmscan
|
||||
to opportunistically prefer distant NUMA nodes to swap or zswap, if capacity
|
||||
is available.
|
||||
|
||||
Demotion engages the :code:`mm/memory_tier.c` component to determine the
|
||||
next demotion node. The next demotion node is based on the :code:`HMAT`
|
||||
or :code:`CDAT` performance data.
|
||||
|
||||
cpusets.mems_allowed quirk
|
||||
--------------------------
|
||||
In Linux v6.15 and below, demotion does not respect :code:`cpusets.mems_allowed`
|
||||
when migrating pages. As a result, if demotion is enabled, vmscan cannot
|
||||
guarantee isolation of a container's memory from nodes not set in mems_allowed.
|
||||
|
||||
In Linux v6.XX and up, demotion does attempt to respect
|
||||
:code:`cpusets.mems_allowed`; however, certain classes of shared memory
|
||||
originally instantiated by another cgroup (such as common libraries - e.g.
|
||||
libc) may still be demoted. As a result, the mems_allowed interface still
|
||||
cannot provide perfect isolation from the remote nodes.
|
||||
|
||||
ZSwap and Node Preference
|
||||
=========================
|
||||
In Linux v6.15 and below, ZSwap allocates memory from the local node of the
|
||||
processor for the new pages being compressed. Since pages being compressed
|
||||
are typically cold, the result is a cold page becomes promoted - only to
|
||||
be later demoted as it ages off the LRU.
|
||||
|
||||
In Linux v6.XX, ZSwap tries to prefer the node of the page being compressed
|
||||
as the allocation target for the compression page. This helps prevent
|
||||
thrashing.
|
||||
|
||||
Demotion with ZSwap
|
||||
===================
|
||||
When enabling both Demotion and ZSwap, you create a situation where ZSwap
|
||||
will prefer the slowest form of CXL memory by default until that tier of
|
||||
memory is exhausted.
|
||||
165
Documentation/driver-api/cxl/devices/device-types.rst
Normal file
165
Documentation/driver-api/cxl/devices/device-types.rst
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=====================
|
||||
Devices and Protocols
|
||||
=====================
|
||||
|
||||
The type of CXL device (Memory, Accelerator, etc) dictates many configuration steps. This section
|
||||
covers some basic background on device types and on-device resources used by the platform and OS
|
||||
which impact configuration.
|
||||
|
||||
Protocols
|
||||
=========
|
||||
|
||||
There are three core protocols to CXL. For the purpose of this documentation,
|
||||
we will only discuss very high level definitions as the specific hardware
|
||||
details are largely abstracted away from Linux. See the CXL specification
|
||||
for more details.
|
||||
|
||||
CXL.io
|
||||
------
|
||||
The basic interaction protocol, similar to PCIe configuration mechanisms.
|
||||
Typically used for initialization, configuration, and I/O access for anything
|
||||
other than memory (CXL.mem) or cache (CXL.cache) operations.
|
||||
|
||||
The Linux CXL driver exposes access to .io functionality via the various sysfs
|
||||
interfaces and /dev/cxl/ devices (which exposes direct access to device
|
||||
mailboxes).
|
||||
|
||||
CXL.cache
|
||||
---------
|
||||
The mechanism by which a device may coherently access and cache host memory.
|
||||
|
||||
Largely transparent to Linux once configured.
|
||||
|
||||
CXL.mem
|
||||
---------
|
||||
The mechanism by which the CPU may coherently access and cache device memory.
|
||||
|
||||
Largely transparent to Linux once configured.
|
||||
|
||||
|
||||
Device Types
|
||||
============
|
||||
|
||||
Type-1
|
||||
------
|
||||
|
||||
A Type-1 CXL device:
|
||||
|
||||
* Supports cxl.io and cxl.cache protocols
|
||||
* Implements a fully coherent cache
|
||||
* Allows Device-to-Host coherence and Host-to-Device snoops.
|
||||
* Does NOT have host-managed device memory (HDM)
|
||||
|
||||
A typical example of a type-1 device is a Smart NIC - which may want to
|
||||
directly operate on host-memory (DMA) to store incoming packets. These
|
||||
devices largely rely on CPU-attached memory.
|
||||
|
||||
Type-2
|
||||
------
|
||||
|
||||
A Type-2 CXL Device:
|
||||
|
||||
* Supports cxl.io, cxl.cache, and cxl.mem protocols
|
||||
* Optionally implements coherent cache and Host-Managed Device Memory
|
||||
* Is typically an accelerator device w/ high bandwidth memory.
|
||||
|
||||
The primary difference between a type-1 and type-2 device is the presence
|
||||
of host-managed device memory, which allows the device to operate on a
|
||||
local memory bank - while the CPU still has coherent DMA to the same memory.
|
||||
|
||||
This allows things like GPUs to expose their memory via DAX devices or file
|
||||
descriptors, allowing drivers and programs direct access to device memory
|
||||
rather than use block-transfer semantics.
|
||||
|
||||
Type-3
|
||||
------
|
||||
|
||||
A Type-3 CXL Device
|
||||
|
||||
* Supports cxl.io and cxl.mem
|
||||
* Implements Host-Managed Device Memory
|
||||
* May provide either Volatile or Persistent memory capacity (or both).
|
||||
|
||||
A basic example of a type-3 device is a simple memory expander, whose
|
||||
local memory capacity is exposed to the CPU for access directly via
|
||||
basic coherent DMA.
|
||||
|
||||
Switch
|
||||
------
|
||||
|
||||
A CXL switch is a device capable of routing any CXL (and by extension, PCIe)
|
||||
protocol between upstream, downstream, or peer devices. Many devices, such
|
||||
as Multi-Logical Devices, imply the presence of switching in some manner.
|
||||
|
||||
Logical Devices and Heads
|
||||
-------------------------
|
||||
|
||||
A CXL device may present one or more "Logical Devices" to one or more hosts
|
||||
(via physical "Heads").
|
||||
|
||||
A Single-Logical Device (SLD) is a device which presents a single device to
|
||||
one or more heads.
|
||||
|
||||
A Multi-Logical Device (MLD) is a device which may present multiple devices
|
||||
to one or more heads.
|
||||
|
||||
A Single-Headed Device exposes only a single physical connection.
|
||||
|
||||
A Multi-Headed Device exposes multiple physical connections.
|
||||
|
||||
MHSLD
|
||||
~~~~~
|
||||
A Multi-Headed Single-Logical Device (MHSLD) exposes a single logical
|
||||
device to multiple heads which may be connected to one or more discrete
|
||||
hosts. An example of this would be a simple memory-pool which may be
|
||||
statically configured (prior to boot) to expose portions of its memory
|
||||
to Linux via :doc:`CEDT <../platform/acpi/cedt>`.
|
||||
|
||||
MHMLD
|
||||
~~~~~
|
||||
A Multi-Headed Multi-Logical Device (MHMLD) exposes multiple logical
|
||||
devices to multiple heads which may be connected to one or more discrete
|
||||
hosts. An example of this would be a Dynamic Capacity Device which
|
||||
may be configured at runtime to expose portions of its memory to Linux.
|
||||
|
||||
Example Devices
|
||||
===============
|
||||
|
||||
Memory Expander
|
||||
---------------
|
||||
The simplest form of Type-3 device is a memory expander. A memory expander
|
||||
exposes Host-Managed Device Memory (HDM) to Linux. This memory may be
|
||||
Volatile or Non-Volatile (Persistent).
|
||||
|
||||
Memory Expanders will typically be considered a form of Single-Headed,
|
||||
Single-Logical Device - as its form factor will typically be an add-in-card
|
||||
(AIC) or some other similar form-factor.
|
||||
|
||||
The Linux CXL driver provides support for static or dynamic configuration of
|
||||
basic memory expanders. The platform may program decoders prior to OS init
|
||||
(e.g. auto-decoders), or the user may program the fabric if the platform
|
||||
defers these operations to the OS.
|
||||
|
||||
Multiple Memory Expanders may be added to an external chassis and exposed to
|
||||
a host via a head attached to a CXL switch. This is a "memory pool", and
|
||||
would be considered an MHSLD or MHMLD depending on the management capabilities
|
||||
provided by the switch platform.
|
||||
|
||||
As of v6.14, Linux does not provide a formalized interface to manage non-DCD
|
||||
MHSLD or MHMLD devices.
|
||||
|
||||
Dynamic Capacity Device (DCD)
|
||||
-----------------------------
|
||||
|
||||
A Dynamic Capacity Device is a Type-3 device which provides dynamic management
|
||||
of memory capacity. The basic premise of a DCD is to provide an allocator-like
|
||||
interface for physical memory capacity to a "Fabric Manager" (an external,
|
||||
privileged host with privileges to change configurations for other hosts).
|
||||
|
||||
A DCD manages "Memory Extents", which may be volatile or persistent. Extents
|
||||
may also be exclusive to a single host or shared across multiple hosts.
|
||||
|
||||
As of v6.14, Linux does not provide a formalized interface to manage DCD
|
||||
devices, however there is active work on LKML targeting a future release.
|
||||
|
|
@ -4,12 +4,50 @@
|
|||
Compute Express Link
|
||||
====================
|
||||
|
||||
CXL device configuration has a complex handoff between platform (Hardware,
|
||||
BIOS, EFI), OS (early boot, core kernel, driver), and user policy decisions
|
||||
that have impacts on each other. The docs here break up configuration steps.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
memory-devices
|
||||
access-coordinates
|
||||
:maxdepth: 2
|
||||
:caption: Overview
|
||||
|
||||
theory-of-operation
|
||||
maturity-map
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Device Reference
|
||||
|
||||
devices/device-types
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Platform Configuration
|
||||
|
||||
platform/bios-and-efi
|
||||
platform/acpi
|
||||
platform/cdat
|
||||
platform/example-configs
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Linux Kernel Configuration
|
||||
|
||||
linux/overview
|
||||
linux/early-boot
|
||||
linux/cxl-driver
|
||||
linux/dax-driver
|
||||
linux/memory-hotplug
|
||||
linux/access-coordinates
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Memory Allocation
|
||||
|
||||
allocation/dax
|
||||
allocation/page-allocator
|
||||
allocation/reclaim
|
||||
allocation/hugepages
|
||||
|
||||
.. only:: subproject and html
|
||||
|
|
|
|||
178
Documentation/driver-api/cxl/linux/access-coordinates.rst
Normal file
178
Documentation/driver-api/cxl/linux/access-coordinates.rst
Normal file
|
|
@ -0,0 +1,178 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
.. include:: <isonum.txt>
|
||||
|
||||
==================================
|
||||
CXL Access Coordinates Computation
|
||||
==================================
|
||||
|
||||
Latency and Bandwidth Calculation
|
||||
=================================
|
||||
A memory region's performance coordinates (latency and bandwidth) are typically
|
||||
provided via ACPI tables :doc:`SRAT <../platform/acpi/srat>` and
|
||||
:doc:`HMAT <../platform/acpi/hmat>`. However, the platform firmware (BIOS) is
|
||||
not able to annotate those for CXL devices that are hot-plugged since they do
|
||||
not exist during platform firmware initialization. The CXL driver can compute
|
||||
the performance coordinates by retrieving data from several components.
|
||||
|
||||
The :doc:`SRAT <../platform/acpi/srat>` provides a Generic Port Affinity
|
||||
subtable that ties a proximity domain to a device handle, which in this case
|
||||
would be the CXL hostbridge. Using this association, the performance
|
||||
coordinates for the Generic Port can be retrieved from the
|
||||
:doc:`HMAT <../platform/acpi/hmat>` subtable. This piece represents the
|
||||
performance coordinates between a CPU and a Generic Port (CXL hostbridge).
|
||||
|
||||
The :doc:`CDAT <../platform/cdat>` provides the performance coordinates for
|
||||
the CXL device itself. That is the bandwidth and latency to access that device's
|
||||
memory region. The DSMAS subtable provides a DSMADHandle that is tied to a
|
||||
Device Physical Address (DPA) range. The DSLBIS subtable provides the
|
||||
performance coordinates that are tied to a DSMADHandle and this ties the two
|
||||
table entries together to provide the performance coordinates for each DPA
|
||||
region. For example, if a device exports a DRAM region and a PMEM region,
|
||||
then there would be different performance characteristics for each of those
|
||||
regions.
|
||||
|
||||
If there's a CXL switch in the topology, then the performance coordinates for the
|
||||
switch are provided by the SSLBIS subtable. This provides the bandwidth and latency
|
||||
for traversing the switch between the switch upstream port and the switch
|
||||
downstream port that points to the endpoint device.
|
||||
|
||||
Simple topology example::
|
||||
|
||||
GP0/HB0/ACPI0016-0
|
||||
RP0
|
||||
|
|
||||
| L0
|
||||
|
|
||||
SW 0 / USP0
|
||||
SW 0 / DSP0
|
||||
|
|
||||
| L1
|
||||
|
|
||||
EP0
|
||||
|
||||
In this example, there is a CXL switch between an endpoint and a root port.
|
||||
Latency in this example is calculated as such:
|
||||
L(EP0) - Latency from EP0 CDAT DSMAS+DSLBIS
|
||||
L(L1) - Link latency between EP0 and SW0DSP0
|
||||
L(SW0) - Latency for the switch from SW0 CDAT SSLBIS.
|
||||
L(L0) - Link latency between SW0 and RP0
|
||||
L(RP0) - Latency from root port to CPU via SRAT and HMAT (Generic Port).
|
||||
Total read and write latencies are the sum of all these parts.
|
||||
|
||||
Bandwidth in this example is calculated as such:
|
||||
B(EP0) - Bandwidth from EP0 CDAT DSMAS+DSLBIS
|
||||
B(L1) - Link bandwidth between EP0 and SW0DSP0
|
||||
B(SW0) - Bandwidth for the switch from SW0 CDAT SSLBIS.
|
||||
B(L0) - Link bandwidth between SW0 and RP0
|
||||
B(RP0) - Bandwidth from root port to CPU via SRAT and HMAT (Generic Port).
|
||||
The total read and write bandwidth is the min() of all these parts.
|
||||
|
||||
To calculate the link bandwidth:
|
||||
LinkOperatingFrequency (GT/s) is the current negotiated link speed.
|
||||
DataRatePerLink (MB/s) = LinkOperatingFrequency / 8
|
||||
Bandwidth (MB/s) = PCIeCurrentLinkWidth * DataRatePerLink
|
||||
Where PCIeCurrentLinkWidth is the number of lanes in the link.
|
||||
|
||||
To calculate the link latency:
|
||||
LinkLatency (picoseconds) = FlitSize / LinkBandwidth (MB/s)
|
||||
|
||||
See `CXL Memory Device SW Guide r1.0 <https://www.intel.com/content/www/us/en/content-details/643805/cxl-memory-device-software-guide.html>`_,
|
||||
section 2.11.3 and 2.11.4 for details.
|
||||
|
||||
In the end, the access coordinates for a constructed memory region are calculated from one
|
||||
or more memory partitions from each of the CXL device(s).
|
||||
|
||||
Shared Upstream Link Calculation
|
||||
================================
|
||||
For certain CXL region construction with endpoints behind CXL switches (SW) or
|
||||
Root Ports (RP), there is the possibility of the total bandwidth for all
|
||||
the endpoints behind a switch being more than the switch upstream link.
|
||||
A similar situation can occur within the host, upstream of the root ports.
|
||||
The CXL driver performs an additional pass after all the targets have
|
||||
arrived for a region in order to recalculate the bandwidths with possible
|
||||
upstream link being a limiting factor in mind.
|
||||
|
||||
The algorithm assumes the configuration is a symmetric topology as that
|
||||
maximizes performance. When asymmetric topology is detected, the calculation
|
||||
is aborted. An asymmetric topology is detected during topology walk where the
|
||||
number of RPs detected as a grandparent is not equal to the number of devices
|
||||
iterated in the same iteration loop. The assumption is made that subtle
|
||||
asymmetry in properties does not happen and all paths to EPs are equal.
|
||||
|
||||
There can be multiple switches under an RP. There can be multiple RPs under
|
||||
a CXL Host Bridge (HB). There can be multiple HBs under a CXL Fixed Memory
|
||||
Window Structure (CFMWS) in the :doc:`CEDT <../platform/acpi/cedt>`.
|
||||
|
||||
An example hierarchy::
|
||||
|
||||
CFMWS 0
|
||||
|
|
||||
_________|_________
|
||||
| |
|
||||
ACPI0017-0 ACPI0017-1
|
||||
GP0/HB0/ACPI0016-0 GP1/HB1/ACPI0016-1
|
||||
| | | |
|
||||
RP0 RP1 RP2 RP3
|
||||
| | | |
|
||||
SW 0 SW 1 SW 2 SW 3
|
||||
| | | | | | | |
|
||||
EP0 EP1 EP2 EP3 EP4 EP5 EP6 EP7
|
||||
|
||||
Computation for the example hierarchy:
|
||||
|
||||
Min (GP0 to CPU BW,
|
||||
Min(SW 0 Upstream Link to RP0 BW,
|
||||
Min(SW0SSLBIS for SW0DSP0 (EP0), EP0 DSLBIS, EP0 Upstream Link) +
|
||||
Min(SW0SSLBIS for SW0DSP1 (EP1), EP1 DSLBIS, EP1 Upstream link)) +
|
||||
Min(SW 1 Upstream Link to RP1 BW,
|
||||
Min(SW1SSLBIS for SW1DSP0 (EP2), EP2 DSLBIS, EP2 Upstream Link) +
|
||||
Min(SW1SSLBIS for SW1DSP1 (EP3), EP3 DSLBIS, EP3 Upstream link))) +
|
||||
Min (GP1 to CPU BW,
|
||||
Min(SW 2 Upstream Link to RP2 BW,
|
||||
Min(SW2SSLBIS for SW2DSP0 (EP4), EP4 DSLBIS, EP4 Upstream Link) +
|
||||
Min(SW2SSLBIS for SW2DSP1 (EP5), EP5 DSLBIS, EP5 Upstream link)) +
|
||||
Min(SW 3 Upstream Link to RP3 BW,
|
||||
Min(SW3SSLBIS for SW3DSP0 (EP6), EP6 DSLBIS, EP6 Upstream Link) +
|
||||
Min(SW3SSLBIS for SW3DSP1 (EP7), EP7 DSLBIS, EP7 Upstream link))))
|
||||
|
||||
The calculation starts at cxl_region_shared_upstream_perf_update(). An xarray
|
||||
is created to collect all the endpoint bandwidths via the
|
||||
cxl_endpoint_gather_bandwidth() function. The min() of bandwidth from the
|
||||
endpoint CDAT and the upstream link bandwidth is calculated. If the endpoint
|
||||
has a CXL switch as a parent, then min() of calculated bandwidth and the
|
||||
bandwidth from the SSLBIS for the switch downstream port that is associated
|
||||
with the endpoint is calculated. The final bandwidth is stored in a
|
||||
'struct cxl_perf_ctx' in the xarray indexed by a device pointer. If the
|
||||
endpoint is direct attached to a root port (RP), the device pointer would be an
|
||||
RP device. If the endpoint is behind a switch, the device pointer would be the
|
||||
upstream device of the parent switch.
|
||||
|
||||
At the next stage, the code walks through one or more switches if they exist
|
||||
in the topology. For endpoints directly attached to RPs, this step is skipped.
|
||||
If there is another switch upstream, the code takes the min() of the current
|
||||
gathered bandwidth and the upstream link bandwidth. If there's a switch
|
||||
upstream, then the SSLBIS of the upstream switch is also taken into account.
|
||||
|
||||
Once the topology walk reaches the RP, whether it's direct attached endpoints
|
||||
or walking through the switch(es), cxl_rp_gather_bandwidth() is called. At
|
||||
this point all the bandwidths are aggregated per each host bridge, which is
|
||||
also the index for the resulting xarray.
|
||||
|
||||
The next step is to take the min() of the per host bridge bandwidth and the
|
||||
bandwidth from the Generic Port (GP). The bandwidths for the GP are retrieved
|
||||
via ACPI tables (:doc:`SRAT <../platform/acpi/srat>` and
|
||||
:doc:`HMAT <../platform/acpi/hmat>`). The minimum bandwidths are aggregated
|
||||
under the same ACPI0017 device to form a new xarray.
|
||||
|
||||
Finally, the cxl_region_update_bandwidth() is called and the aggregated
|
||||
bandwidth from all the members of the last xarray is updated for the
|
||||
access coordinates residing in the cxl region (cxlr) context.
|
||||
|
||||
QTG ID
|
||||
======
|
||||
Each :doc:`CEDT <../platform/acpi/cedt>` has a QTG ID field. This field provides
|
||||
the ID that associates with a QoS Throttling Group (QTG) for the CFMWS window.
|
||||
Once the access coordinates are calculated, an ACPI Device Specific Method can
|
||||
be issued to the ACPI0016 device to retrieve the QTG ID that depends on the access
|
||||
coordinates provided. The QTG ID for the device can be used as guidance to match
|
||||
to the CFMWS to setup the best Linux root decoder for the device performance.
|
||||
630
Documentation/driver-api/cxl/linux/cxl-driver.rst
Normal file
630
Documentation/driver-api/cxl/linux/cxl-driver.rst
Normal file
|
|
@ -0,0 +1,630 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
====================
|
||||
CXL Driver Operation
|
||||
====================
|
||||
|
||||
The devices described in this section are present in ::
|
||||
|
||||
/sys/bus/cxl/devices/
|
||||
/dev/cxl/
|
||||
|
||||
The :code:`cxl-cli` library, maintained as part of the NDCTL project, may
|
||||
be used to script interactions with these devices.
|
||||
|
||||
Drivers
|
||||
=======
|
||||
The CXL driver is split into a number of drivers.
|
||||
|
||||
* cxl_core - fundamental init interface and core object creation
|
||||
* cxl_port - initializes root and provides port enumeration interface.
|
||||
* cxl_acpi - initializes root decoders and interacts with ACPI data.
|
||||
* cxl_p/mem - initializes memory devices
|
||||
* cxl_pci - uses cxl_port to enumerate the actual fabric hierarchy.
|
||||
|
||||
Driver Devices
|
||||
==============
|
||||
Here is an example from a single-socket system with 4 host bridges. Two host
|
||||
bridges have a single memory device attached, and the devices are interleaved
|
||||
into a single memory region. The memory region has been converted to dax. ::
|
||||
|
||||
# ls /sys/bus/cxl/devices/
|
||||
dax_region0 decoder3.0 decoder6.0 mem0 port3
|
||||
decoder0.0 decoder4.0 decoder6.1 mem1 port4
|
||||
decoder1.0 decoder5.0 endpoint5 port1 region0
|
||||
decoder2.0 decoder5.1 endpoint6 port2 root0
|
||||
|
||||
|
||||
.. kernel-render:: DOT
|
||||
:alt: Digraph of CXL fabric describing host-bridge interleaving
|
||||
:caption: Digraph of CXL fabric with a host-bridge interleave memory region
|
||||
|
||||
digraph foo {
|
||||
"root0" -> "port1";
|
||||
"root0" -> "port3";
|
||||
"root0" -> "decoder0.0";
|
||||
"port1" -> "endpoint5";
|
||||
"port3" -> "endpoint6";
|
||||
"port1" -> "decoder1.0";
|
||||
"port3" -> "decoder3.0";
|
||||
"endpoint5" -> "decoder5.0";
|
||||
"endpoint6" -> "decoder6.0";
|
||||
"decoder0.0" -> "region0";
|
||||
"decoder0.0" -> "decoder1.0";
|
||||
"decoder0.0" -> "decoder3.0";
|
||||
"decoder1.0" -> "decoder5.0";
|
||||
"decoder3.0" -> "decoder6.0";
|
||||
"decoder5.0" -> "region0";
|
||||
"decoder6.0" -> "region0";
|
||||
"region0" -> "dax_region0";
|
||||
"dax_region0" -> "dax0.0";
|
||||
}
|
||||
|
||||
For this section we'll explore the devices present in this configuration, but
|
||||
we'll explore more configurations in-depth in example configurations below.
|
||||
|
||||
Base Devices
|
||||
------------
|
||||
Most devices in a CXL fabric are a `port` of some kind (because each
|
||||
device mostly routes requests from one device to the next, rather than
|
||||
provide a direct service).
|
||||
|
||||
Root
|
||||
~~~~
|
||||
The `CXL Root` is a logical object created by the `cxl_acpi` driver during
|
||||
:code:`cxl_acpi_probe` - if the :code:`ACPI0017` `Compute Express Link
|
||||
Root Object` Device Class is found.
|
||||
|
||||
The Root contains links to:
|
||||
|
||||
* `Host Bridge Ports` defined by CHBS in the :doc:`CEDT<../platform/acpi/cedt>`
|
||||
|
||||
* `Downstream Ports` typically connected to `Host Bridge Ports`.
|
||||
|
||||
* `Root Decoders` defined by CFMWS in the :doc:`CEDT<../platform/acpi/cedt>`
|
||||
|
||||
::
|
||||
|
||||
# ls /sys/bus/cxl/devices/root0
|
||||
decoder0.0 dport0 dport5 port2 subsystem
|
||||
decoders_committed dport1 modalias port3 uevent
|
||||
devtype dport4 port1 port4 uport
|
||||
|
||||
# cat /sys/bus/cxl/devices/root0/devtype
|
||||
cxl_port
|
||||
|
||||
# cat port1/devtype
|
||||
cxl_port
|
||||
|
||||
# cat decoder0.0/devtype
|
||||
cxl_decoder_root
|
||||
|
||||
The root is the first `logical port` in the CXL fabric, as presented by the Linux
|
||||
CXL driver. The `CXL root` is a special type of `switch port`, in that it
|
||||
only has downstream port connections.
|
||||
|
||||
Port
|
||||
~~~~
|
||||
A `port` object is better described as a `switch port`. It may represent a
|
||||
host bridge to the root or an actual switch port on a switch. A `switch port`
|
||||
contains one or more decoders used to route memory requests to downstream ports,
|
||||
which may be connected to another `switch port` or an `endpoint port`.
|
||||
|
||||
::
|
||||
|
||||
# ls /sys/bus/cxl/devices/port1
|
||||
decoder1.0 dport0 driver parent_dport uport
|
||||
decoders_committed dport113 endpoint5 subsystem
|
||||
devtype dport2 modalias uevent
|
||||
|
||||
# cat devtype
|
||||
cxl_port
|
||||
|
||||
# cat decoder1.0/devtype
|
||||
cxl_decoder_switch
|
||||
|
||||
# cat endpoint5/devtype
|
||||
cxl_port
|
||||
|
||||
CXL `Host Bridges` in the fabric are probed during :code:`cxl_acpi_probe` at
|
||||
the time the `CXL Root` is probed. This allows for the immediate logical
|
||||
connection between the root and host bridge.
|
||||
|
||||
* The root has a downstream port connection to a host bridge
|
||||
|
||||
* The host bridge has an upstream port connection to the root.
|
||||
|
||||
* The host bridge has one or more downstream port connections to switch
|
||||
or endpoint ports.
|
||||
|
||||
A `Host Bridge` is a special type of CXL `switch port`. It is explicitly
|
||||
defined in the ACPI specification via `ACPI0016` ID. `Host Bridge` ports
|
||||
will be probed at `acpi_probe` time, while similar ports on an actual switch
|
||||
will be probed later. Otherwise, switch and host bridge ports look very
|
||||
similar - they both contain switch decoders which route accesses between
|
||||
upstream and downstream ports.
|
||||
|
||||
Endpoint
|
||||
~~~~~~~~
|
||||
An `endpoint` is a terminal port in the fabric. This is a `logical device`,
|
||||
and may be one of many `logical devices` presented by a memory device. It
|
||||
is still considered a type of `port` in the fabric.
|
||||
|
||||
An `endpoint` contains `endpoint decoders` and the device's Coherent Device
|
||||
Attribute Table (which describes the device's capabilities). ::
|
||||
|
||||
# ls /sys/bus/cxl/devices/endpoint5
|
||||
CDAT decoders_committed modalias uevent
|
||||
decoder5.0 devtype parent_dport uport
|
||||
decoder5.1 driver subsystem
|
||||
|
||||
# cat /sys/bus/cxl/devices/endpoint5/devtype
|
||||
cxl_port
|
||||
|
||||
# cat /sys/bus/cxl/devices/endpoint5/decoder5.0/devtype
|
||||
cxl_decoder_endpoint
|
||||
|
||||
|
||||
Memory Device (memdev)
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
A `memdev` is probed and added by the `cxl_pci` driver in :code:`cxl_pci_probe`
|
||||
and is managed by the `cxl_mem` driver. It primarily provides the `IOCTL`
|
||||
interface to a memory device, via :code:`/dev/cxl/memN`, and exposes various
|
||||
device configuration data. ::
|
||||
|
||||
# ls /sys/bus/cxl/devices/mem0
|
||||
dev firmware_version payload_max security uevent
|
||||
driver label_storage_size pmem serial
|
||||
firmware numa_node ram subsystem
|
||||
|
||||
A Memory Device is a discrete base object that is not a port. While the
|
||||
physical device it belongs to may also host an `endpoint`, the relationship
|
||||
between an `endpoint` and a `memdev` is not captured in sysfs.
|
||||
|
||||
Port Relationships
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
In our example described above, there are four host bridges attached to the
|
||||
root, and two of the host bridges have one endpoint attached.
|
||||
|
||||
.. kernel-render:: DOT
|
||||
:alt: Digraph of CXL fabric describing host-bridge interleaving
|
||||
:caption: Digraph of CXL fabric with a host-bridge interleave memory region
|
||||
|
||||
digraph foo {
|
||||
"root0" -> "port1";
|
||||
"root0" -> "port2";
|
||||
"root0" -> "port3";
|
||||
"root0" -> "port4";
|
||||
"port1" -> "endpoint5";
|
||||
"port3" -> "endpoint6";
|
||||
}
|
||||
|
||||
Decoders
|
||||
--------
|
||||
A `Decoder` is short for a CXL Host-Managed Device Memory (HDM) Decoder. It is
|
||||
a device that routes accesses through the CXL fabric to an endpoint, and at
|
||||
the endpoint translates a `Host Physical` to `Device Physical` Addressing.
|
||||
|
||||
The CXL 3.1 specification heavily implies that only endpoint decoders should
|
||||
engage in translation of `Host Physical Address` to `Device Physical Address`.
|
||||
::
|
||||
|
||||
8.2.4.20 CXL HDM Decoder Capability Structure
|
||||
|
||||
IMPLEMENTATION NOTE
|
||||
CXL Host Bridge and Upstream Switch Port Decode Flow
|
||||
|
||||
IMPLEMENTATION NOTE
|
||||
Device Decode Logic
|
||||
|
||||
These notes imply that there are two logical groups of decoders.
|
||||
|
||||
* Routing Decoder - a decoder which routes accesses but does not translate
|
||||
addresses from HPA to DPA.
|
||||
|
||||
* Translating Decoder - a decoder which translates accesses from HPA to DPA
|
||||
for an endpoint to service.
|
||||
|
||||
The CXL drivers distinguish 3 decoder types: root, switch, and endpoint. Only
|
||||
endpoint decoders are Translating Decoders, all others are Routing Decoders.
|
||||
|
||||
.. note:: PLATFORM VENDORS BE AWARE
|
||||
|
||||
Linux makes a strong assumption that endpoint decoders are the only decoder
|
||||
in the fabric that actively translates HPA to DPA. Linux assumes routing
|
||||
decoders pass the HPA unchanged to the next decoder in the fabric.
|
||||
|
||||
It is therefore assumed that any given decoder in the fabric will have an
|
||||
address range that is a subset of its upstream port decoder. Any deviation
|
||||
from this scheme is undefined per the specification. Linux prioritizes
|
||||
spec-defined / architectural behavior.
|
||||
|
||||
Decoders may have one or more `Downstream Targets` if configured to interleave
|
||||
memory accesses. This will be presented in sysfs via the :code:`target_list`
|
||||
parameter.
|
||||
|
||||
Root Decoder
|
||||
~~~~~~~~~~~~
|
||||
A `Root Decoder` is a logical construct of the physical address and interleave
|
||||
configurations present in the CFMWS field of the :doc:`CEDT
|
||||
<../platform/acpi/cedt>`.
|
||||
Linux presents this information as a decoder present in the `CXL Root`. We
|
||||
consider this a `Root Decoder`, though technically it exists on the boundary
|
||||
of the CXL specification and platform-specific CXL root implementations.
|
||||
|
||||
Linux considers these logical decoders a type of `Routing Decoder`, and is the
|
||||
first decoder in the CXL fabric to receive a memory access from the platform's
|
||||
memory controllers.
|
||||
|
||||
`Root Decoders` are created during :code:`cxl_acpi_probe`. One root decoder
|
||||
is created per CFMWS entry in the :doc:`CEDT <../platform/acpi/cedt>`.
|
||||
|
||||
The :code:`target_list` parameter is filled by the CFMWS target fields. Targets
|
||||
of a root decoder are `Host Bridges`, which means interleave done at the root
|
||||
decoder level is an `Inter-Host-Bridge Interleave`.
|
||||
|
||||
Only root decoders are capable of `Inter-Host-Bridge Interleave`.
|
||||
|
||||
Such interleaves must be configured by the platform and described in the ACPI
|
||||
CEDT CFMWS, as the target CXL host bridge UIDs in the CFMWS must match the CXL
|
||||
host bridge UIDs in the CHBS field of the :doc:`CEDT
|
||||
<../platform/acpi/cedt>` and the UID field of CXL Host Bridges defined in
|
||||
the :doc:`DSDT <../platform/acpi/dsdt>`.
|
||||
|
||||
Interleave settings in a root decoder describe how to interleave accesses among
|
||||
the *immediate downstream targets*, not the entire interleave set.
|
||||
|
||||
The memory range described in the root decoder is used to
|
||||
|
||||
1) Create a memory region (:code:`region0` in this example), and
|
||||
|
||||
2) Associate the region with an IO Memory Resource (:code:`kernel/resource.c`)
|
||||
|
||||
::
|
||||
|
||||
# ls /sys/bus/cxl/devices/decoder0.0/
|
||||
cap_pmem devtype region0
|
||||
cap_ram interleave_granularity size
|
||||
cap_type2 interleave_ways start
|
||||
cap_type3 locked subsystem
|
||||
create_ram_region modalias target_list
|
||||
delete_region qos_class uevent
|
||||
|
||||
# cat /sys/bus/cxl/devices/decoder0.0/region0/resource
|
||||
0xc050000000
|
||||
|
||||
The IO Memory Resource is created during early boot when the CFMWS region is
|
||||
identified in the EFI Memory Map or E820 table (on x86).
|
||||
|
||||
Root decoders are defined as a separate devtype, but are also a type
|
||||
of `Switch Decoder` due to having downstream targets. ::
|
||||
|
||||
# cat /sys/bus/cxl/devices/decoder0.0/devtype
|
||||
cxl_decoder_root
|
||||
|
||||
Switch Decoder
|
||||
~~~~~~~~~~~~~~
|
||||
Any non-root, routing decoder is considered a `Switch Decoder`, and will
|
||||
present with the type :code:`cxl_decoder_switch`. Both `Host Bridge` and `CXL
|
||||
Switch` (device) decoders are of type :code:`cxl_decoder_switch`. ::
|
||||
|
||||
# ls /sys/bus/cxl/devices/decoder1.0/
|
||||
devtype locked size target_list
|
||||
interleave_granularity modalias start target_type
|
||||
interleave_ways region subsystem uevent
|
||||
|
||||
# cat /sys/bus/cxl/devices/decoder1.0/devtype
|
||||
cxl_decoder_switch
|
||||
|
||||
# cat /sys/bus/cxl/devices/decoder1.0/region
|
||||
region0
|
||||
|
||||
A `Switch Decoder` has associations between a region defined by a root
|
||||
decoder and downstream target ports. Interleaving done within a switch decoder
|
||||
is a multi-downstream-port interleave (or `Intra-Host-Bridge Interleave` for
|
||||
host bridges).
|
||||
|
||||
Interleave settings in a switch decoder describe how to interleave accesses
|
||||
among the *immediate downstream targets*, not the entire interleave set.
|
||||
|
||||
Switch decoders are created during :code:`cxl_switch_port_probe` in the
|
||||
:code:`cxl_port` driver, and are created based on a PCI device's DVSEC
|
||||
registers.
|
||||
|
||||
Switch decoder programming is validated during probe if the platform programs
|
||||
them during boot (See `Auto Decoders` below), or on commit if programmed at
|
||||
runtime (See `Runtime Programming` below).
|
||||
|
||||
|
||||
Endpoint Decoder
|
||||
~~~~~~~~~~~~~~~~
|
||||
Any decoder attached to a *terminal* point in the CXL fabric (`An Endpoint`) is
|
||||
considered an `Endpoint Decoder`. Endpoint decoders are of type
|
||||
:code:`cxl_decoder_endpoint`. ::
|
||||
|
||||
# ls /sys/bus/cxl/devices/decoder5.0
|
||||
devtype locked start
|
||||
dpa_resource modalias subsystem
|
||||
dpa_size mode target_type
|
||||
interleave_granularity region uevent
|
||||
interleave_ways size
|
||||
|
||||
# cat /sys/bus/cxl/devices/decoder5.0/devtype
|
||||
cxl_decoder_endpoint
|
||||
|
||||
# cat /sys/bus/cxl/devices/decoder5.0/region
|
||||
region0
|
||||
|
||||
An `Endpoint Decoder` has an association with a region defined by a root
|
||||
decoder and describes the device-local resource associated with this region.
|
||||
|
||||
Unlike root and switch decoders, endpoint decoders translate `Host Physical` to
|
||||
`Device Physical` address ranges. The interleave settings on an endpoint
|
||||
therefore describe the entire *interleave set*.
|
||||
|
||||
`Device Physical Address` regions must be committed in-order. For example, the
|
||||
DPA region starting at 0x80000000 cannot be committed before the DPA region
|
||||
starting at 0x0.
|
||||
|
||||
As of Linux v6.15, Linux does not support *imbalanced* interleave setups, all
|
||||
endpoints in an interleave set are expected to have the same interleave
|
||||
settings (granularity and ways must be the same).
|
||||
|
||||
Endpoint decoders are created during :code:`cxl_endpoint_port_probe` in the
|
||||
:code:`cxl_port` driver, and are created based on a PCI device's DVSEC registers.
|
||||
|
||||
Decoder Relationships
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
In our example described above, there is one root decoder which routes memory
|
||||
accesses over two host bridges. Each host bridge has a decoder which routes
|
||||
access to their singular endpoint targets. Each endpoint has a decoder which
|
||||
translates HPA to DPA and services the memory request.
|
||||
|
||||
The driver validates relationships between ports by decoder programming, so
|
||||
we can think of decoders being related in a similarly hierarchical fashion to
|
||||
ports.
|
||||
|
||||
.. kernel-render:: DOT
|
||||
:alt: Digraph of hierarchical relationship between root, switch, and endpoint decoders.
|
||||
:caption: Digraph of CXL root, switch, and endpoint decoders.
|
||||
|
||||
digraph foo {
|
||||
"root0" -> "decoder0.0";
|
||||
"decoder0.0" -> "decoder1.0";
|
||||
"decoder0.0" -> "decoder3.0";
|
||||
"decoder1.0" -> "decoder5.0";
|
||||
"decoder3.0" -> "decoder6.0";
|
||||
}
|
||||
|
||||
Regions
|
||||
-------
|
||||
|
||||
Memory Region
|
||||
~~~~~~~~~~~~~
|
||||
A `Memory Region` is a logical construct that connects a set of CXL ports in
|
||||
the fabric to an IO Memory Resource. It is ultimately used to expose the memory
|
||||
on these devices to the DAX subsystem via a `DAX Region`.
|
||||
|
||||
An example RAM region: ::
|
||||
|
||||
# ls /sys/bus/cxl/devices/region0/
|
||||
access0 devtype modalias subsystem uuid
|
||||
access1 driver mode target0
|
||||
commit interleave_granularity resource target1
|
||||
dax_region0 interleave_ways size uevent
|
||||
|
||||
A memory region can be constructed during endpoint probe, if decoders were
|
||||
programmed by BIOS/EFI (see `Auto Decoders`), or by creating a region manually
|
||||
via a `Root Decoder`'s :code:`create_ram_region` or :code:`create_pmem_region`
|
||||
interfaces.
|
||||
|
||||
The interleave settings in a `Memory Region` describe the configuration of the
|
||||
`Interleave Set` - and are what can be expected to be seen in the endpoint
|
||||
interleave settings.
|
||||
|
||||
.. kernel-render:: DOT
|
||||
:alt: Digraph of CXL memory region relationships between root and endpoint decoders.
|
||||
:caption: Regions are created based on root decoder configurations. Endpoint decoders
|
||||
must be programmed with the same interleave settings as the region.
|
||||
|
||||
digraph foo {
|
||||
"root0" -> "decoder0.0";
|
||||
"decoder0.0" -> "region0";
|
||||
"region0" -> "decoder5.0";
|
||||
"region0" -> "decoder6.0";
|
||||
}
|
||||
|
||||
DAX Region
|
||||
~~~~~~~~~~
|
||||
A `DAX Region` is used to convert a CXL `Memory Region` to a DAX device. A
|
||||
DAX device may then be accessed directly via a file descriptor interface, or
|
||||
converted to System RAM via the DAX kmem driver. See the DAX driver section
|
||||
for more details. ::
|
||||
|
||||
# ls /sys/bus/cxl/devices/dax_region0/
|
||||
dax0.0 devtype modalias uevent
|
||||
dax_region driver subsystem
|
||||
|
||||
Mailbox Interfaces
|
||||
------------------
|
||||
A mailbox command interface for each device is exposed in ::
|
||||
|
||||
/dev/cxl/mem0
|
||||
/dev/cxl/mem1
|
||||
|
||||
These mailboxes may receive any specification-defined command. Raw commands
|
||||
(custom commands) can only be sent to these interfaces if the build config
|
||||
:code:`CXL_MEM_RAW_COMMANDS` is set. This is considered a debug and/or
|
||||
development interface, not an officially supported mechanism for creation
|
||||
of vendor-specific commands (see the `fwctl` subsystem for that).
|
||||
|
||||
Decoder Programming
|
||||
===================
|
||||
|
||||
Runtime Programming
|
||||
-------------------
|
||||
During probe, the only decoders *required* to be programmed are `Root Decoders`.
|
||||
In reality, `Root Decoders` are a logical construct to describe the memory
|
||||
region and interleave configuration at the host bridge level - as described
|
||||
in the ACPI CEDT CFMWS.
|
||||
|
||||
All other `Switch` and `Endpoint` decoders may be programmed by the user
|
||||
at runtime - if the platform supports such configurations.
|
||||
|
||||
This interaction is what creates a `Software Defined Memory` environment.
|
||||
|
||||
See the :code:`cxl-cli` documentation for more information about how to
|
||||
configure CXL decoders at runtime.
|
||||
|
||||
Auto Decoders
|
||||
-------------
|
||||
Auto Decoders are decoders programmed by BIOS/EFI at boot time, and are
|
||||
almost always locked (cannot be changed). This is done by a platform
|
||||
which may have a static configuration - or certain quirks which may prevent
|
||||
dynamic runtime changes to the decoders (such as requiring additional
|
||||
controller programming within the CPU complex outside the scope of CXL).
|
||||
|
||||
Auto Decoders are probed automatically as long as the devices and memory
|
||||
regions they are associated with probe without issue. When probing Auto
|
||||
Decoders, the driver's primary responsibility is to ensure the fabric is
|
||||
sane - as-if validating runtime programmed regions and decoders.
|
||||
|
||||
If Linux cannot validate auto-decoder configuration, the memory will not
|
||||
be surfaced as a DAX device - and therefore not be exposed to the page
|
||||
allocator - effectively stranding it.
|
||||
|
||||
Interleave
|
||||
----------
|
||||
|
||||
The Linux CXL driver supports `Cross-Link First` interleave. This dictates
|
||||
how interleave is programmed at each decoder step, as the driver validates
|
||||
the relationships between a decoder and its parent.
|
||||
|
||||
For example, in a `Cross-Link First` interleave setup with 16 endpoints
|
||||
attached to 4 host bridges, Linux expects the following ways/granularity
|
||||
across the root, host bridge, and endpoints respectively.
|
||||
|
||||
.. flat-table:: 4x4 cross-link first interleave settings
|
||||
|
||||
* - decoder
|
||||
- ways
|
||||
- granularity
|
||||
|
||||
* - root
|
||||
- 4
|
||||
- 256
|
||||
|
||||
* - host bridge
|
||||
- 4
|
||||
- 1024
|
||||
|
||||
* - endpoint
|
||||
- 16
|
||||
- 256
|
||||
|
||||
At the root, a given access will be routed to the
|
||||
:code:`((HPA / 256) % 4)th` target host bridge. Within a host bridge, it is routed to the
|
||||
:code:`((HPA / 1024) % 4)th` target endpoint. Each endpoint translates based
|
||||
on the entire 16 device interleave set.
|
||||
|
||||
Unbalanced interleave sets are not supported - decoders at a similar point
|
||||
in the hierarchy (e.g. all host bridge decoders) must have the same ways and
|
||||
granularity configuration.
|
||||
|
||||
At Root
|
||||
~~~~~~~
|
||||
Root decoder interleave is defined by the CFMWS field of the :doc:`CEDT
|
||||
<../platform/acpi/cedt>`. The CEDT may actually define multiple CFMWS
|
||||
configurations to describe the same physical capacity, with the intent to allow
|
||||
users to decide at runtime whether to online memory as interleaved or
|
||||
non-interleaved. ::
|
||||
|
||||
Subtable Type : 01 [CXL Fixed Memory Window Structure]
|
||||
Window base address : 0000000100000000
|
||||
Window size : 0000000100000000
|
||||
Interleave Members (2^n) : 00
|
||||
Interleave Arithmetic : 00
|
||||
First Target : 00000007
|
||||
|
||||
Subtable Type : 01 [CXL Fixed Memory Window Structure]
|
||||
Window base address : 0000000200000000
|
||||
Window size : 0000000100000000
|
||||
Interleave Members (2^n) : 00
|
||||
Interleave Arithmetic : 00
|
||||
First Target : 00000006
|
||||
|
||||
Subtable Type : 01 [CXL Fixed Memory Window Structure]
|
||||
Window base address : 0000000300000000
|
||||
Window size : 0000000200000000
|
||||
Interleave Members (2^n) : 01
|
||||
Interleave Arithmetic : 00
|
||||
First Target : 00000007
|
||||
Next Target : 00000006
|
||||
|
||||
In this example, the CFMWS defines two discrete non-interleaved 4GB regions
|
||||
for each host bridge, and one interleaved 8GB region that targets both. This
|
||||
would result in 3 root decoders presenting in the root. ::
|
||||
|
||||
# ls /sys/bus/cxl/devices/root0/decoder*
|
||||
decoder0.0 decoder0.1 decoder0.2
|
||||
|
||||
# cat /sys/bus/cxl/devices/decoder0.0/target_list start size
|
||||
7
|
||||
0x100000000
|
||||
0x100000000
|
||||
|
||||
# cat /sys/bus/cxl/devices/decoder0.1/target_list start size
|
||||
6
|
||||
0x200000000
|
||||
0x100000000
|
||||
|
||||
# cat /sys/bus/cxl/devices/decoder0.2/target_list start size
|
||||
7,6
|
||||
0x300000000
|
||||
0x200000000
|
||||
|
||||
These decoders are not runtime programmable. They are used to generate a
|
||||
`Memory Region` to bring this memory online with runtime programmed settings
|
||||
at the `Switch` and `Endpoint` decoders.
|
||||
|
||||
At Host Bridge or Switch
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
`Host Bridge` and `Switch` decoders are programmable via the following fields:
|
||||
|
||||
- :code:`start` - the HPA region associated with the memory region
|
||||
- :code:`size` - the size of the region
|
||||
- :code:`target_list` - the list of downstream ports
|
||||
- :code:`interleave_ways` - the number of downstream ports to interleave across
|
||||
- :code:`interleave_granularity` - the granularity to interleave at.
|
||||
|
||||
Linux expects the :code:`interleave_granularity` of switch decoders to be
|
||||
derived from their upstream port connections. In `Cross-Link First` interleave
|
||||
configurations, the :code:`interleave_granularity` of a decoder is equal to
|
||||
:code:`parent_interleave_granularity * parent_interleave_ways`.
|
||||
|
||||
At Endpoint
|
||||
~~~~~~~~~~~
|
||||
`Endpoint Decoders` are programmed similar to Host Bridge and Switch decoders,
|
||||
with the exception that the ways and granularity are defined by the interleave
|
||||
set (e.g. the interleave settings defined by the associated `Memory Region`).
|
||||
|
||||
- :code:`start` - the HPA region associated with the memory region
|
||||
- :code:`size` - the size of the region
|
||||
- :code:`interleave_ways` - the number of endpoints in the interleave set
|
||||
- :code:`interleave_granularity` - the granularity to interleave at.
|
||||
|
||||
These settings are used by endpoint decoders to *Translate* memory requests
|
||||
from HPA to DPA. This is why they must be aware of the entire interleave set.
|
||||
|
||||
Linux does not support unbalanced interleave configurations. As a result, all
|
||||
endpoints in an interleave set must have the same ways and granularity.
|
||||
|
||||
Example Configurations
|
||||
======================
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
example-configurations/single-device.rst
|
||||
example-configurations/hb-interleave.rst
|
||||
example-configurations/intra-hb-interleave.rst
|
||||
example-configurations/multi-interleave.rst
|
||||
43
Documentation/driver-api/cxl/linux/dax-driver.rst
Normal file
43
Documentation/driver-api/cxl/linux/dax-driver.rst
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
====================
|
||||
DAX Driver Operation
|
||||
====================
|
||||
The `Direct Access Device` driver was originally designed to provide a
|
||||
memory-like access mechanism to memory-like block-devices. It was
|
||||
extended to support CXL Memory Devices, which provide user-configured
|
||||
memory devices.
|
||||
|
||||
The CXL subsystem depends on the DAX subsystem to either:
|
||||
|
||||
- Generate a file-like interface to userland via :code:`/dev/daxN.Y`, or
|
||||
- Engage the memory-hotplug interface to add CXL memory to the page allocator.
|
||||
|
||||
The DAX subsystem exposes this ability through the `cxl_dax_region` driver.
|
||||
A `dax_region` provides the translation between a CXL `memory_region` and
|
||||
a `DAX Device`.
|
||||
|
||||
DAX Device
|
||||
==========
|
||||
A `DAX Device` is a file-like interface exposed in :code:`/dev/daxN.Y`. A
|
||||
memory region exposed via dax device can be accessed via userland software
|
||||
via the :code:`mmap()` system-call. The result is direct mappings to the
|
||||
CXL capacity in the task's page tables.
|
||||
|
||||
Users wishing to manually handle allocation of CXL memory should use this
|
||||
interface.
|
||||
|
||||
kmem conversion
|
||||
===============
|
||||
The :code:`dax_kmem` driver converts a `DAX Device` into a series of `hotplug
|
||||
memory blocks` managed by :code:`kernel/memory-hotplug.c`. This capacity
|
||||
will be exposed to the kernel page allocator in the user-selected memory
|
||||
zone.
|
||||
|
||||
The :code:`memmap_on_memory` setting (both global and DAX device local)
|
||||
dictates where the kernel will allocate the :code:`struct folio` descriptors
|
||||
for this memory. If :code:`memmap_on_memory` is set, memory
|
||||
hotplug will set aside a portion of the memory block capacity to allocate
|
||||
folios. If unset, the memory is allocated via a normal :code:`GFP_KERNEL`
|
||||
allocation - and as a result will most likely land on the local NUMA node of the
|
||||
CPU executing the hotplug operation.
|
||||
137
Documentation/driver-api/cxl/linux/early-boot.rst
Normal file
137
Documentation/driver-api/cxl/linux/early-boot.rst
Normal file
|
|
@ -0,0 +1,137 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=======================
|
||||
Linux Init (Early Boot)
|
||||
=======================
|
||||
|
||||
Linux configuration is split into two major steps: Early-Boot and everything else.
|
||||
|
||||
During early boot, Linux sets up immutable resources (such as numa nodes), while
|
||||
later operations include things like driver probe and memory hotplug. Linux may
|
||||
read EFI and ACPI information throughout this process to configure logical
|
||||
representations of the devices.
|
||||
|
||||
During Linux Early Boot stage (functions in the kernel that have the __init
|
||||
decorator), the system takes the resources created by EFI/BIOS
|
||||
(:doc:`ACPI tables <../platform/acpi>`) and turns them into resources that the
|
||||
kernel can consume.
|
||||
|
||||
|
||||
BIOS, Build and Boot Options
|
||||
============================
|
||||
|
||||
There are 4 pre-boot options that need to be considered during kernel build
|
||||
which dictate how memory will be managed by Linux during early boot.
|
||||
|
||||
* EFI_MEMORY_SP
|
||||
|
||||
* BIOS/EFI Option that dictates whether memory is SystemRAM or
|
||||
Specific Purpose. Specific Purpose memory will be deferred to
|
||||
drivers to manage - and not immediately exposed as system RAM.
|
||||
|
||||
* CONFIG_EFI_SOFT_RESERVE
|
||||
|
||||
* Linux Build config option that dictates whether the kernel supports
|
||||
Specific Purpose memory.
|
||||
|
||||
* CONFIG_MHP_DEFAULT_ONLINE_TYPE
|
||||
|
||||
* Linux Build config that dictates whether and how Specific Purpose memory
|
||||
converted to a dax device should be managed (left as DAX or onlined as
|
||||
SystemRAM in ZONE_NORMAL or ZONE_MOVABLE).
|
||||
|
||||
* nosoftreserve
|
||||
|
||||
* Linux kernel boot option that dictates whether Soft Reserve should be
|
||||
supported. Similar to CONFIG_EFI_SOFT_RESERVE.
|
||||
|
||||
Memory Map Creation
|
||||
===================
|
||||
|
||||
While the kernel parses the EFI memory map, if :code:`Specific Purpose` memory
|
||||
is supported and detected, it will set this region aside as
|
||||
:code:`SOFT_RESERVED`.
|
||||
|
||||
If :code:`EFI_MEMORY_SP=0`, :code:`CONFIG_EFI_SOFT_RESERVE=n`, or
|
||||
:code:`nosoftreserve=y` - Linux will default a CXL device memory region to
|
||||
SystemRAM. This will expose the memory to the kernel page allocator in
|
||||
:code:`ZONE_NORMAL`, making it available for use for most allocations (including
|
||||
:code:`struct page` and page tables).
|
||||
|
||||
If `Specific Purpose` is set and supported, :code:`CONFIG_MHP_DEFAULT_ONLINE_TYPE_*`
|
||||
dictates whether the memory is onlined by default (:code:`_OFFLINE` or
|
||||
:code:`_ONLINE_*`), and if online which zone to online this memory to by default
|
||||
(:code:`_NORMAL` or :code:`_MOVABLE`).
|
||||
|
||||
If placed in :code:`ZONE_MOVABLE`, the memory will not be available for most
|
||||
kernel allocations (such as :code:`struct page` or page tables). This may
|
||||
significantly impact performance depending on the memory capacity of the system.
|
||||
|
||||
|
||||
NUMA Node Reservation
|
||||
=====================
|
||||
|
||||
Linux refers to the proximity domains (:code:`PXM`) defined in the :doc:`SRAT
|
||||
<../platform/acpi/srat>` to create NUMA nodes in :code:`acpi_numa_init`.
|
||||
Typically, there is a 1:1 relation between :code:`PXM` and NUMA node IDs.
|
||||
|
||||
The SRAT is the only ACPI defined way of defining Proximity Domains. Linux
|
||||
chooses to, at most, map those 1:1 with NUMA nodes.
|
||||
:doc:`CEDT <../platform/acpi/cedt>` adds a description of SPA ranges which
|
||||
Linux may map to one or more NUMA nodes.
|
||||
|
||||
If there are CXL ranges in the CFMWS but not in SRAT, then a fake :code:`PXM`
|
||||
is created (as of v6.15). In the future, Linux may reject CFMWS not described
|
||||
by SRAT due to the ambiguity of proximity domain association.
|
||||
|
||||
It is important to note that NUMA node creation cannot be done at runtime. All
|
||||
possible NUMA nodes are identified at :code:`__init` time, more specifically
|
||||
during :code:`mm_init`. The CEDT and SRAT must contain sufficient :code:`PXM`
|
||||
data for Linux to identify NUMA nodes and their associated memory regions.
|
||||
|
||||
The relevant code exists in: :code:`linux/drivers/acpi/numa/srat.c`.
|
||||
|
||||
See :doc:`Example Platform Configurations <../platform/example-configs>`
|
||||
for more info.
|
||||
|
||||
Memory Tiers Creation
|
||||
=====================
|
||||
Memory tiers are a collection of NUMA nodes grouped by performance characteristics.
|
||||
During :code:`__init`, Linux initializes the system with a default memory tier that
|
||||
contains all nodes marked :code:`N_MEMORY`.
|
||||
|
||||
:code:`memory_tier_init` is called at boot for all nodes with memory online by
|
||||
default. :code:`memory_tier_late_init` is called during late-init for nodes setup
|
||||
during driver configuration.
|
||||
|
||||
Nodes are only marked :code:`N_MEMORY` if they have *online* memory.
|
||||
|
||||
Tier membership can be inspected in ::
|
||||
|
||||
/sys/devices/virtual/memory_tiering/memory_tierN/nodelist
|
||||
0-1
|
||||
|
||||
If nodes are grouped which have a clear difference in performance, check the
|
||||
:doc:`HMAT <../platform/acpi/hmat>` and CDAT information for the CXL nodes. All
|
||||
nodes default to the DRAM tier, unless HMAT/CDAT information is reported to the
|
||||
memory_tier component via `access_coordinates`.
|
||||
|
||||
For more, see :doc:`CXL access coordinates documentation
|
||||
<../linux/access-coordinates>`.
|
||||
|
||||
Contiguous Memory Allocation
|
||||
============================
|
||||
The contiguous memory allocator (CMA) enables reservation of contiguous memory
|
||||
regions on NUMA nodes during early boot. However, CMA cannot reserve memory
|
||||
on NUMA nodes that are not online during early boot. ::
|
||||
|
||||
void __init hugetlb_cma_reserve(int order) {
|
||||
if (!node_online(nid))
|
||||
/* do not allow reservations */
|
||||
}
|
||||
|
||||
This means if users intend to defer management of CXL memory to the driver, CMA
|
||||
cannot be used to guarantee huge page allocations. If enabling CXL memory as
|
||||
SystemRAM in `ZONE_NORMAL` during early boot, CMA reservations per-node can be
|
||||
made with the :code:`cma_pernuma` or :code:`numa_cma` kernel command line
|
||||
parameters.
|
||||
|
|
@ -0,0 +1,314 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
============================
|
||||
Inter-Host-Bridge Interleave
|
||||
============================
|
||||
This cxl-cli configuration dump shows the following host configuration:
|
||||
|
||||
* A single socket system with one CXL root
|
||||
* CXL Root has Four (4) CXL Host Bridges
|
||||
* Two CXL Host Bridges have a single CXL Memory Expander Attached
|
||||
* The CXL root is configured to interleave across the two host bridges.
|
||||
|
||||
This output is generated by :code:`cxl list -v` and describes the relationships
|
||||
between objects exposed in :code:`/sys/bus/cxl/devices/`.
|
||||
|
||||
::
|
||||
|
||||
[
|
||||
{
|
||||
"bus":"root0",
|
||||
"provider":"ACPI.CXL",
|
||||
"nr_dports":4,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"pci0000:00",
|
||||
"alias":"ACPI0016:01",
|
||||
"id":0
|
||||
},
|
||||
{
|
||||
"dport":"pci0000:a8",
|
||||
"alias":"ACPI0016:02",
|
||||
"id":4
|
||||
},
|
||||
{
|
||||
"dport":"pci0000:2a",
|
||||
"alias":"ACPI0016:03",
|
||||
"id":1
|
||||
},
|
||||
{
|
||||
"dport":"pci0000:d2",
|
||||
"alias":"ACPI0016:00",
|
||||
"id":5
|
||||
}
|
||||
],
|
||||
|
||||
This chunk shows the CXL "bus" (root0) has 4 downstream ports attached to CXL
|
||||
Host Bridges. The `Root` can be considered the singular upstream port attached
|
||||
to the platform's memory controller - which routes memory requests to it.
|
||||
|
||||
The `ports:root0` section lays out how each of these downstream ports are
|
||||
configured. If a port is not configured (id's 0 and 1), they are omitted.
|
||||
|
||||
::
|
||||
|
||||
"ports:root0":[
|
||||
{
|
||||
"port":"port1",
|
||||
"host":"pci0000:d2",
|
||||
"depth":1,
|
||||
"nr_dports":3,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"0000:d2:01.1",
|
||||
"alias":"device:02",
|
||||
"id":0
|
||||
},
|
||||
{
|
||||
"dport":"0000:d2:01.3",
|
||||
"alias":"device:05",
|
||||
"id":2
|
||||
},
|
||||
{
|
||||
"dport":"0000:d2:07.1",
|
||||
"alias":"device:0d",
|
||||
"id":113
|
||||
}
|
||||
],
|
||||
|
||||
This chunk shows the available downstream ports associated with the CXL Host
|
||||
Bridge :code:`port1`. In this case, :code:`port1` has 3 available downstream
|
||||
ports: :code:`dport1`, :code:`dport2`, and :code:`dport113`.
|
||||
|
||||
::
|
||||
|
||||
"endpoints:port1":[
|
||||
{
|
||||
"endpoint":"endpoint5",
|
||||
"host":"mem0",
|
||||
"parent_dport":"0000:d2:01.1",
|
||||
"depth":2,
|
||||
"memdev":{
|
||||
"memdev":"mem0",
|
||||
"ram_size":137438953472,
|
||||
"serial":0,
|
||||
"numa_node":0,
|
||||
"host":"0000:d3:00.0"
|
||||
},
|
||||
"decoders:endpoint5":[
|
||||
{
|
||||
"decoder":"decoder5.0",
|
||||
"resource":825975898112,
|
||||
"size":274877906944,
|
||||
"interleave_ways":2,
|
||||
"interleave_granularity":256,
|
||||
"region":"region0",
|
||||
"dpa_resource":0,
|
||||
"dpa_size":137438953472,
|
||||
"mode":"ram"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
This chunk shows the endpoints attached to the host bridge :code:`port1`.
|
||||
|
||||
:code:`endpoint5` contains a single configured decoder :code:`decoder5.0`
|
||||
which has the same interleave configuration as :code:`region0` (shown later).
|
||||
|
||||
Next we have the decoders belonging to the host bridge:
|
||||
|
||||
::
|
||||
|
||||
"decoders:port1":[
|
||||
{
|
||||
"decoder":"decoder1.0",
|
||||
"resource":825975898112,
|
||||
"size":274877906944,
|
||||
"interleave_ways":1,
|
||||
"region":"region0",
|
||||
"nr_targets":1,
|
||||
"targets":[
|
||||
{
|
||||
"target":"0000:d2:01.1",
|
||||
"alias":"device:02",
|
||||
"position":0,
|
||||
"id":0
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
Host Bridge :code:`port1` has a single decoder (:code:`decoder1.0`), whose only
|
||||
target is :code:`dport1` - which is attached to :code:`endpoint5`.
|
||||
|
||||
The following chunk shows a similar configuration for Host Bridge :code:`port3`,
|
||||
the second host bridge with a memory device attached.
|
||||
|
||||
::
|
||||
|
||||
{
|
||||
"port":"port3",
|
||||
"host":"pci0000:a8",
|
||||
"depth":1,
|
||||
"nr_dports":1,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"0000:a8:01.1",
|
||||
"alias":"device:c3",
|
||||
"id":0
|
||||
}
|
||||
],
|
||||
"endpoints:port3":[
|
||||
{
|
||||
"endpoint":"endpoint6",
|
||||
"host":"mem1",
|
||||
"parent_dport":"0000:a8:01.1",
|
||||
"depth":2,
|
||||
"memdev":{
|
||||
"memdev":"mem1",
|
||||
"ram_size":137438953472,
|
||||
"serial":0,
|
||||
"numa_node":0,
|
||||
"host":"0000:a9:00.0"
|
||||
},
|
||||
"decoders:endpoint6":[
|
||||
{
|
||||
"decoder":"decoder6.0",
|
||||
"resource":825975898112,
|
||||
"size":274877906944,
|
||||
"interleave_ways":2,
|
||||
"interleave_granularity":256,
|
||||
"region":"region0",
|
||||
"dpa_resource":0,
|
||||
"dpa_size":137438953472,
|
||||
"mode":"ram"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"decoders:port3":[
|
||||
{
|
||||
"decoder":"decoder3.0",
|
||||
"resource":825975898112,
|
||||
"size":274877906944,
|
||||
"interleave_ways":1,
|
||||
"region":"region0",
|
||||
"nr_targets":1,
|
||||
"targets":[
|
||||
{
|
||||
"target":"0000:a8:01.1",
|
||||
"alias":"device:c3",
|
||||
"position":0,
|
||||
"id":0
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
|
||||
The next chunk shows the two CXL host bridges without attached endpoints.
|
||||
|
||||
::
|
||||
|
||||
{
|
||||
"port":"port2",
|
||||
"host":"pci0000:00",
|
||||
"depth":1,
|
||||
"nr_dports":2,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"0000:00:01.3",
|
||||
"alias":"device:55",
|
||||
"id":2
|
||||
},
|
||||
{
|
||||
"dport":"0000:00:07.1",
|
||||
"alias":"device:5d",
|
||||
"id":113
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"port":"port4",
|
||||
"host":"pci0000:2a",
|
||||
"depth":1,
|
||||
"nr_dports":1,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"0000:2a:01.1",
|
||||
"alias":"device:d0",
|
||||
"id":0
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
Next we have the `Root Decoders` belonging to :code:`root0`. This root decoder
|
||||
applies the interleave across the downstream ports :code:`port1` and
|
||||
:code:`port3` - with a granularity of 256 bytes.
|
||||
|
||||
This information is generated by the CXL driver reading the ACPI CEDT CFMWS.
|
||||
|
||||
::
|
||||
|
||||
"decoders:root0":[
|
||||
{
|
||||
"decoder":"decoder0.0",
|
||||
"resource":825975898112,
|
||||
"size":274877906944,
|
||||
"interleave_ways":2,
|
||||
"interleave_granularity":256,
|
||||
"max_available_extent":0,
|
||||
"volatile_capable":true,
|
||||
"nr_targets":2,
|
||||
"targets":[
|
||||
{
|
||||
"target":"pci0000:a8",
|
||||
"alias":"ACPI0016:02",
|
||||
"position":1,
|
||||
"id":4
|
||||
},
|
||||
{
|
||||
"target":"pci0000:d2",
|
||||
"alias":"ACPI0016:00",
|
||||
"position":0,
|
||||
"id":5
|
||||
}
|
||||
],
|
||||
|
||||
Finally we have the `Memory Region` associated with the `Root Decoder`
|
||||
:code:`decoder0.0`. This region describes the overall interleave configuration
|
||||
of the interleave set.
|
||||
|
||||
::
|
||||
|
||||
"regions:decoder0.0":[
|
||||
{
|
||||
"region":"region0",
|
||||
"resource":825975898112,
|
||||
"size":274877906944,
|
||||
"type":"ram",
|
||||
"interleave_ways":2,
|
||||
"interleave_granularity":256,
|
||||
"decode_state":"commit",
|
||||
"mappings":[
|
||||
{
|
||||
"position":1,
|
||||
"memdev":"mem1",
|
||||
"decoder":"decoder6.0"
|
||||
},
|
||||
{
|
||||
"position":0,
|
||||
"memdev":"mem0",
|
||||
"decoder":"decoder5.0"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
|
|
@ -0,0 +1,291 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
============================
|
||||
Intra-Host-Bridge Interleave
|
||||
============================
|
||||
This cxl-cli configuration dump shows the following host configuration:
|
||||
|
||||
* A single socket system with one CXL root
|
||||
* CXL Root has Four (4) CXL Host Bridges
|
||||
* One (1) CXL Host Bridge has two CXL Memory Expanders Attached
|
||||
* The Host bridge decoder is programmed to interleave across the expanders.
|
||||
|
||||
This output is generated by :code:`cxl list -v` and describes the relationships
|
||||
between objects exposed in :code:`/sys/bus/cxl/devices/`.
|
||||
|
||||
::
|
||||
|
||||
[
|
||||
{
|
||||
"bus":"root0",
|
||||
"provider":"ACPI.CXL",
|
||||
"nr_dports":4,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"pci0000:00",
|
||||
"alias":"ACPI0016:01",
|
||||
"id":0
|
||||
},
|
||||
{
|
||||
"dport":"pci0000:a8",
|
||||
"alias":"ACPI0016:02",
|
||||
"id":4
|
||||
},
|
||||
{
|
||||
"dport":"pci0000:2a",
|
||||
"alias":"ACPI0016:03",
|
||||
"id":1
|
||||
},
|
||||
{
|
||||
"dport":"pci0000:d2",
|
||||
"alias":"ACPI0016:00",
|
||||
"id":5
|
||||
}
|
||||
],
|
||||
|
||||
This chunk shows the CXL "bus" (root0) has 4 downstream ports attached to CXL
|
||||
Host Bridges. The `Root` can be considered the singular upstream port attached
|
||||
to the platform's memory controller - which routes memory requests to it.
|
||||
|
||||
The `ports:root0` section lays out how each of these downstream ports are
|
||||
configured. If a port is not configured (id's 0 and 1), they are omitted.
|
||||
|
||||
::
|
||||
|
||||
"ports:root0":[
|
||||
{
|
||||
"port":"port1",
|
||||
"host":"pci0000:d2",
|
||||
"depth":1,
|
||||
"nr_dports":3,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"0000:d2:01.1",
|
||||
"alias":"device:02",
|
||||
"id":0
|
||||
},
|
||||
{
|
||||
"dport":"0000:d2:01.3",
|
||||
"alias":"device:05",
|
||||
"id":2
|
||||
},
|
||||
{
|
||||
"dport":"0000:d2:07.1",
|
||||
"alias":"device:0d",
|
||||
"id":113
|
||||
}
|
||||
],
|
||||
|
||||
This chunk shows the available downstream ports associated with the CXL Host
|
||||
Bridge :code:`port1`. In this case, :code:`port1` has 3 available downstream
|
||||
ports: :code:`dport1`, :code:`dport2`, and :code:`dport113`.
|
||||
|
||||
::
|
||||
|
||||
"endpoints:port1":[
|
||||
{
|
||||
"endpoint":"endpoint5",
|
||||
"host":"mem0",
|
||||
"parent_dport":"0000:d2:01.1",
|
||||
"depth":2,
|
||||
"memdev":{
|
||||
"memdev":"mem0",
|
||||
"ram_size":137438953472,
|
||||
"serial":0,
|
||||
"numa_node":0,
|
||||
"host":"0000:d3:00.0"
|
||||
},
|
||||
"decoders:endpoint5":[
|
||||
{
|
||||
"decoder":"decoder5.0",
|
||||
"resource":825975898112,
|
||||
"size":274877906944,
|
||||
"interleave_ways":2,
|
||||
"interleave_granularity":256,
|
||||
"region":"region0",
|
||||
"dpa_resource":0,
|
||||
"dpa_size":137438953472,
|
||||
"mode":"ram"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"endpoint":"endpoint6",
|
||||
"host":"mem1",
|
||||
"parent_dport":"0000:d2:01.3",
|
||||
"depth":2,
|
||||
"memdev":{
|
||||
"memdev":"mem1",
|
||||
"ram_size":137438953472,
|
||||
"serial":0,
|
||||
"numa_node":0,
|
||||
"host":"0000:a9:00.0"
|
||||
},
|
||||
"decoders:endpoint6":[
|
||||
{
|
||||
"decoder":"decoder6.0",
|
||||
"resource":825975898112,
|
||||
"size":274877906944,
|
||||
"interleave_ways":2,
|
||||
"interleave_granularity":256,
|
||||
"region":"region0",
|
||||
"dpa_resource":0,
|
||||
"dpa_size":137438953472,
|
||||
"mode":"ram"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
This chunk shows the endpoints attached to the host bridge :code:`port1`.
|
||||
|
||||
:code:`endpoint5` contains a single configured decoder :code:`decoder5.0`
|
||||
which has the same interleave configuration as the memory region it belongs to
|
||||
(shown later).
|
||||
|
||||
Next we have the decoders belonging to the host bridge:
|
||||
|
||||
::
|
||||
|
||||
"decoders:port1":[
|
||||
{
|
||||
"decoder":"decoder1.0",
|
||||
"resource":825975898112,
|
||||
"size":274877906944,
|
||||
"interleave_ways":2,
|
||||
"interleave_granularity":256,
|
||||
"region":"region0",
|
||||
"nr_targets":2,
|
||||
"targets":[
|
||||
{
|
||||
"target":"0000:d2:01.1",
|
||||
"alias":"device:02",
|
||||
"position":0,
|
||||
"id":0
|
||||
},
|
||||
{
|
||||
"target":"0000:d2:01.3",
|
||||
"alias":"device:05",
|
||||
"position":1,
|
||||
"id":0
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
Host Bridge :code:`port1` has a single decoder (:code:`decoder1.0`) with two
|
||||
targets: :code:`dport1` and :code:`dport3` - which are attached to
|
||||
:code:`endpoint5` and :code:`endpoint6` respectively.
|
||||
|
||||
The host bridge decoder interleaves these devices at a 256 byte granularity.
|
||||
|
||||
The next chunk shows the three CXL host bridges without attached endpoints.
|
||||
|
||||
::
|
||||
|
||||
{
|
||||
"port":"port2",
|
||||
"host":"pci0000:00",
|
||||
"depth":1,
|
||||
"nr_dports":2,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"0000:00:01.3",
|
||||
"alias":"device:55",
|
||||
"id":2
|
||||
},
|
||||
{
|
||||
"dport":"0000:00:07.1",
|
||||
"alias":"device:5d",
|
||||
"id":113
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"port":"port3",
|
||||
"host":"pci0000:a8",
|
||||
"depth":1,
|
||||
"nr_dports":1,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"0000:a8:01.1",
|
||||
"alias":"device:c3",
|
||||
"id":0
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"port":"port4",
|
||||
"host":"pci0000:2a",
|
||||
"depth":1,
|
||||
"nr_dports":1,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"0000:2a:01.1",
|
||||
"alias":"device:d0",
|
||||
"id":0
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
Next we have the `Root Decoders` belonging to :code:`root0`. This root decoder
|
||||
applies the interleave across the downstream ports :code:`port1` and
|
||||
:code:`port3` - with a granularity of 256 bytes.
|
||||
|
||||
This information is generated by the CXL driver reading the ACPI CEDT CFMWS.
|
||||
|
||||
::
|
||||
|
||||
"decoders:root0":[
|
||||
{
|
||||
"decoder":"decoder0.0",
|
||||
"resource":825975898112,
|
||||
"size":274877906944,
|
||||
"interleave_ways":1,
|
||||
"max_available_extent":0,
|
||||
"volatile_capable":true,
|
||||
"nr_targets":2,
|
||||
"targets":[
|
||||
{
|
||||
"target":"pci0000:a8",
|
||||
"alias":"ACPI0016:02",
|
||||
"position":1,
|
||||
"id":4
|
||||
},
|
||||
],
|
||||
|
||||
Finally we have the `Memory Region` associated with the `Root Decoder`
|
||||
:code:`decoder0.0`. This region describes the overall interleave configuration
|
||||
of the interleave set.
|
||||
|
||||
::
|
||||
|
||||
"regions:decoder0.0":[
|
||||
{
|
||||
"region":"region0",
|
||||
"resource":825975898112,
|
||||
"size":274877906944,
|
||||
"type":"ram",
|
||||
"interleave_ways":2,
|
||||
"interleave_granularity":256,
|
||||
"decode_state":"commit",
|
||||
"mappings":[
|
||||
{
|
||||
"position":1,
|
||||
"memdev":"mem1",
|
||||
"decoder":"decoder6.0"
|
||||
},
|
||||
{
|
||||
"position":0,
|
||||
"memdev":"mem0",
|
||||
"decoder":"decoder5.0"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
|
|
@ -0,0 +1,401 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
======================
|
||||
Multi-Level Interleave
|
||||
======================
|
||||
This cxl-cli configuration dump shows the following host configuration:
|
||||
|
||||
* A single socket system with one CXL root
|
||||
* CXL Root has Four (4) CXL Host Bridges
|
||||
* Two CXL Host Bridges have two CXL Memory Expanders Attached each.
|
||||
* The CXL root is configured to interleave across the two host bridges.
|
||||
* Each host bridge with expanders interleaves across two endpoints.
|
||||
|
||||
This output is generated by :code:`cxl list -v` and describes the relationships
|
||||
between objects exposed in :code:`/sys/bus/cxl/devices/`.
|
||||
|
||||
::
|
||||
|
||||
[
|
||||
{
|
||||
"bus":"root0",
|
||||
"provider":"ACPI.CXL",
|
||||
"nr_dports":4,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"pci0000:00",
|
||||
"alias":"ACPI0016:01",
|
||||
"id":0
|
||||
},
|
||||
{
|
||||
"dport":"pci0000:a8",
|
||||
"alias":"ACPI0016:02",
|
||||
"id":4
|
||||
},
|
||||
{
|
||||
"dport":"pci0000:2a",
|
||||
"alias":"ACPI0016:03",
|
||||
"id":1
|
||||
},
|
||||
{
|
||||
"dport":"pci0000:d2",
|
||||
"alias":"ACPI0016:00",
|
||||
"id":5
|
||||
}
|
||||
],
|
||||
|
||||
This chunk shows the CXL "bus" (root0) has 4 downstream ports attached to CXL
|
||||
Host Bridges. The `Root` can be considered the singular upstream port attached
|
||||
to the platform's memory controller - which routes memory requests to it.
|
||||
|
||||
The `ports:root0` section lays out how each of these downstream ports are
|
||||
configured. If a port is not configured (id's 0 and 1), they are omitted.
|
||||
|
||||
::
|
||||
|
||||
"ports:root0":[
|
||||
{
|
||||
"port":"port1",
|
||||
"host":"pci0000:d2",
|
||||
"depth":1,
|
||||
"nr_dports":3,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"0000:d2:01.1",
|
||||
"alias":"device:02",
|
||||
"id":0
|
||||
},
|
||||
{
|
||||
"dport":"0000:d2:01.3",
|
||||
"alias":"device:05",
|
||||
"id":2
|
||||
},
|
||||
{
|
||||
"dport":"0000:d2:07.1",
|
||||
"alias":"device:0d",
|
||||
"id":113
|
||||
}
|
||||
],
|
||||
|
||||
This chunk shows the available downstream ports associated with the CXL Host
|
||||
Bridge :code:`port1`. In this case, :code:`port1` has 3 available downstream
|
||||
ports: :code:`dport0`, :code:`dport2`, and :code:`dport113`.
|
||||
|
||||
::
|
||||
|
||||
"endpoints:port1":[
|
||||
{
|
||||
"endpoint":"endpoint5",
|
||||
"host":"mem0",
|
||||
"parent_dport":"0000:d2:01.1",
|
||||
"depth":2,
|
||||
"memdev":{
|
||||
"memdev":"mem0",
|
||||
"ram_size":137438953472,
|
||||
"serial":0,
|
||||
"numa_node":0,
|
||||
"host":"0000:d3:00.0"
|
||||
},
|
||||
"decoders:endpoint5":[
|
||||
{
|
||||
"decoder":"decoder5.0",
|
||||
"resource":825975898112,
|
||||
"size":549755813888,
|
||||
"interleave_ways":4,
|
||||
"interleave_granularity":256,
|
||||
"region":"region0",
|
||||
"dpa_resource":0,
|
||||
"dpa_size":137438953472,
|
||||
"mode":"ram"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"endpoint":"endpoint6",
|
||||
"host":"mem1",
|
||||
"parent_dport":"0000:d2:01.3",
|
||||
"depth":2,
|
||||
"memdev":{
|
||||
"memdev":"mem1",
|
||||
"ram_size":137438953472,
|
||||
"serial":0,
|
||||
"numa_node":0,
|
||||
"host":"0000:d3:00.0"
|
||||
},
|
||||
"decoders:endpoint6":[
|
||||
{
|
||||
"decoder":"decoder6.0",
|
||||
"resource":825975898112,
|
||||
"size":549755813888,
|
||||
"interleave_ways":4,
|
||||
"interleave_granularity":256,
|
||||
"region":"region0",
|
||||
"dpa_resource":0,
|
||||
"dpa_size":137438953472,
|
||||
"mode":"ram"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
This chunk shows the endpoints attached to the host bridge :code:`port1`.
|
||||
|
||||
:code:`endpoint5` contains a single configured decoder :code:`decoder5.0`
|
||||
which has the same interleave configuration as :code:`region0` (shown later).
|
||||
|
||||
:code:`endpoint6` contains a single configured decoder :code:`decoder6.0`
|
||||
which has the same interleave configuration as :code:`region0` (shown later).
|
||||
|
||||
Next we have the decoders belonging to the host bridge:
|
||||
|
||||
::
|
||||
|
||||
"decoders:port1":[
|
||||
{
|
||||
"decoder":"decoder1.0",
|
||||
"resource":825975898112,
|
||||
"size":549755813888,
|
||||
"interleave_ways":2,
|
||||
"interleave_granularity":512,
|
||||
"region":"region0",
|
||||
"nr_targets":2,
|
||||
"targets":[
|
||||
{
|
||||
"target":"0000:d2:01.1",
|
||||
"alias":"device:02",
|
||||
"position":0,
|
||||
"id":0
|
||||
},
|
||||
{
|
||||
"target":"0000:d2:01.3",
|
||||
"alias":"device:05",
|
||||
"position":2,
|
||||
"id":0
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
Host Bridge :code:`port1` has a single decoder (:code:`decoder1.0`), whose
|
||||
targets are :code:`dport0` and :code:`dport2` - which are attached to
|
||||
:code:`endpoint5` and :code:`endpoint6` respectively.
|
||||
|
||||
The following chunk shows a similar configuration for Host Bridge :code:`port3`,
|
||||
the second host bridge with a memory device attached.
|
||||
|
||||
::
|
||||
|
||||
{
|
||||
"port":"port3",
|
||||
"host":"pci0000:a8",
|
||||
"depth":1,
|
||||
"nr_dports":1,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"0000:a8:01.1",
|
||||
"alias":"device:c3",
|
||||
"id":0
|
||||
},
|
||||
{
|
||||
"dport":"0000:a8:01.3",
|
||||
"alias":"device:c5",
|
||||
"id":0
|
||||
}
|
||||
],
|
||||
"endpoints:port3":[
|
||||
{
|
||||
"endpoint":"endpoint7",
|
||||
"host":"mem2",
|
||||
"parent_dport":"0000:a8:01.1",
|
||||
"depth":2,
|
||||
"memdev":{
|
||||
"memdev":"mem2",
|
||||
"ram_size":137438953472,
|
||||
"serial":0,
|
||||
"numa_node":0,
|
||||
"host":"0000:a9:00.0"
|
||||
},
|
||||
"decoders:endpoint7":[
|
||||
{
|
||||
"decoder":"decoder7.0",
|
||||
"resource":825975898112,
|
||||
"size":549755813888,
|
||||
"interleave_ways":4,
|
||||
"interleave_granularity":256,
|
||||
"region":"region0",
|
||||
"dpa_resource":0,
|
||||
"dpa_size":137438953472,
|
||||
"mode":"ram"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"endpoint":"endpoint8",
|
||||
"host":"mem3",
|
||||
"parent_dport":"0000:a8:01.3",
|
||||
"depth":2,
|
||||
"memdev":{
|
||||
"memdev":"mem3",
|
||||
"ram_size":137438953472,
|
||||
"serial":0,
|
||||
"numa_node":0,
|
||||
"host":"0000:a9:00.0"
|
||||
},
|
||||
"decoders:endpoint8":[
|
||||
{
|
||||
"decoder":"decoder8.0",
|
||||
"resource":825975898112,
|
||||
"size":549755813888,
|
||||
"interleave_ways":4,
|
||||
"interleave_granularity":256,
|
||||
"region":"region0",
|
||||
"dpa_resource":0,
|
||||
"dpa_size":137438953472,
|
||||
"mode":"ram"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"decoders:port3":[
|
||||
{
|
||||
"decoder":"decoder3.0",
|
||||
"resource":825975898112,
|
||||
"size":549755813888,
|
||||
"interleave_ways":2,
|
||||
"interleave_granularity":512,
|
||||
"region":"region0",
|
||||
"nr_targets":2,
|
||||
"targets":[
|
||||
{
|
||||
"target":"0000:a8:01.1",
|
||||
"alias":"device:c3",
|
||||
"position":1,
|
||||
"id":0
|
||||
},
|
||||
{
|
||||
"target":"0000:a8:01.3",
|
||||
"alias":"device:c5",
|
||||
"position":3,
|
||||
"id":0
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
|
||||
The next chunk shows the two CXL host bridges without attached endpoints.
|
||||
|
||||
::
|
||||
|
||||
{
|
||||
"port":"port2",
|
||||
"host":"pci0000:00",
|
||||
"depth":1,
|
||||
"nr_dports":2,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"0000:00:01.3",
|
||||
"alias":"device:55",
|
||||
"id":2
|
||||
},
|
||||
{
|
||||
"dport":"0000:00:07.1",
|
||||
"alias":"device:5d",
|
||||
"id":113
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"port":"port4",
|
||||
"host":"pci0000:2a",
|
||||
"depth":1,
|
||||
"nr_dports":1,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"0000:2a:01.1",
|
||||
"alias":"device:d0",
|
||||
"id":0
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
Next we have the `Root Decoders` belonging to :code:`root0`. This root decoder
|
||||
applies the interleave across the downstream ports :code:`port1` and
|
||||
:code:`port3` - with a granularity of 256 bytes.
|
||||
|
||||
This information is generated by the CXL driver reading the ACPI CEDT CFMWS.
|
||||
|
||||
::
|
||||
|
||||
"decoders:root0":[
|
||||
{
|
||||
"decoder":"decoder0.0",
|
||||
"resource":825975898112,
|
||||
"size":549755813888,
|
||||
"interleave_ways":2,
|
||||
"interleave_granularity":256,
|
||||
"max_available_extent":0,
|
||||
"volatile_capable":true,
|
||||
"nr_targets":2,
|
||||
"targets":[
|
||||
{
|
||||
"target":"pci0000:a8",
|
||||
"alias":"ACPI0016:02",
|
||||
"position":1,
|
||||
"id":4
|
||||
},
|
||||
{
|
||||
"target":"pci0000:d2",
|
||||
"alias":"ACPI0016:00",
|
||||
"position":0,
|
||||
"id":5
|
||||
}
|
||||
],
|
||||
|
||||
Finally we have the `Memory Region` associated with the `Root Decoder`
|
||||
:code:`decoder0.0`. This region describes the overall interleave configuration
|
||||
of the interleave set. So we see there are a total of :code:`4` interleave
|
||||
targets across 4 endpoint decoders.
|
||||
|
||||
::
|
||||
|
||||
"regions:decoder0.0":[
|
||||
{
|
||||
"region":"region0",
|
||||
"resource":825975898112,
|
||||
"size":549755813888,
|
||||
"type":"ram",
|
||||
"interleave_ways":4,
|
||||
"interleave_granularity":256,
|
||||
"decode_state":"commit",
|
||||
"mappings":[
|
||||
{
|
||||
"position":3,
|
||||
"memdev":"mem3",
|
||||
"decoder":"decoder8.0"
|
||||
},
|
||||
{
|
||||
"position":2,
|
||||
"memdev":"mem1",
|
||||
"decoder":"decoder6.0"
|
||||
},
|
||||
{
|
||||
"position":1,
|
||||
"memdev":"mem2",
|
||||
"decoder":"decoder7.0"
|
||||
},
|
||||
{
|
||||
"position":0,
|
||||
"memdev":"mem0",
|
||||
"decoder":"decoder5.0"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
|
|
@ -0,0 +1,246 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=============
|
||||
Single Device
|
||||
=============
|
||||
This cxl-cli configuration dump shows the following host configuration:
|
||||
|
||||
* A single socket system with one CXL root
|
||||
* CXL Root has Four (4) CXL Host Bridges
|
||||
* One CXL Host Bridge has a single CXL Memory Expander Attached
|
||||
* No interleave is present.
|
||||
|
||||
This output is generated by :code:`cxl list -v` and describes the relationships
|
||||
between objects exposed in :code:`/sys/bus/cxl/devices/`.
|
||||
|
||||
::
|
||||
|
||||
[
|
||||
{
|
||||
"bus":"root0",
|
||||
"provider":"ACPI.CXL",
|
||||
"nr_dports":4,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"pci0000:00",
|
||||
"alias":"ACPI0016:01",
|
||||
"id":0
|
||||
},
|
||||
{
|
||||
"dport":"pci0000:a8",
|
||||
"alias":"ACPI0016:02",
|
||||
"id":4
|
||||
},
|
||||
{
|
||||
"dport":"pci0000:2a",
|
||||
"alias":"ACPI0016:03",
|
||||
"id":1
|
||||
},
|
||||
{
|
||||
"dport":"pci0000:d2",
|
||||
"alias":"ACPI0016:00",
|
||||
"id":5
|
||||
}
|
||||
],
|
||||
|
||||
This chunk shows the CXL "bus" (root0) has 4 downstream ports attached to CXL
|
||||
Host Bridges. The `Root` can be considered the singular upstream port attached
|
||||
to the platform's memory controller - which routes memory requests to it.
|
||||
|
||||
The `ports:root0` section lays out how each of these downstream ports are
|
||||
configured. Ports that are not configured (ids 0, 1, and 4) are omitted.
|
||||
|
||||
::
|
||||
|
||||
"ports:root0":[
|
||||
{
|
||||
"port":"port1",
|
||||
"host":"pci0000:d2",
|
||||
"depth":1,
|
||||
"nr_dports":3,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"0000:d2:01.1",
|
||||
"alias":"device:02",
|
||||
"id":0
|
||||
},
|
||||
{
|
||||
"dport":"0000:d2:01.3",
|
||||
"alias":"device:05",
|
||||
"id":2
|
||||
},
|
||||
{
|
||||
"dport":"0000:d2:07.1",
|
||||
"alias":"device:0d",
|
||||
"id":113
|
||||
}
|
||||
],
|
||||
|
||||
This chunk shows the available downstream ports associated with the CXL Host
|
||||
Bridge :code:`port1`. In this case, :code:`port1` has 3 available downstream
|
||||
ports: :code:`dport1`, :code:`dport2`, and :code:`dport113`.
|
||||
|
||||
::
|
||||
|
||||
"endpoints:port1":[
|
||||
{
|
||||
"endpoint":"endpoint5",
|
||||
"host":"mem0",
|
||||
"parent_dport":"0000:d2:01.1",
|
||||
"depth":2,
|
||||
"memdev":{
|
||||
"memdev":"mem0",
|
||||
"ram_size":137438953472,
|
||||
"serial":0,
|
||||
"numa_node":0,
|
||||
"host":"0000:d3:00.0"
|
||||
},
|
||||
"decoders:endpoint5":[
|
||||
{
|
||||
"decoder":"decoder5.0",
|
||||
"resource":825975898112,
|
||||
"size":137438953472,
|
||||
"interleave_ways":1,
|
||||
"region":"region0",
|
||||
"dpa_resource":0,
|
||||
"dpa_size":137438953472,
|
||||
"mode":"ram"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
This chunk shows the endpoints attached to the host bridge :code:`port1`.
|
||||
|
||||
:code:`endpoint5` contains a single configured decoder :code:`decoder5.0`
|
||||
which has the same interleave configuration as :code:`region0` (shown later).
|
||||
|
||||
Next we have the decoders belonging to the host bridge:
|
||||
|
||||
::
|
||||
|
||||
"decoders:port1":[
|
||||
{
|
||||
"decoder":"decoder1.0",
|
||||
"resource":825975898112,
|
||||
"size":137438953472,
|
||||
"interleave_ways":1,
|
||||
"region":"region0",
|
||||
"nr_targets":1,
|
||||
"targets":[
|
||||
{
|
||||
"target":"0000:d2:01.1",
|
||||
"alias":"device:02",
|
||||
"position":0,
|
||||
"id":0
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
Host Bridge :code:`port1` has a single decoder (:code:`decoder1.0`), whose only
|
||||
target is :code:`dport1` - which is attached to :code:`endpoint5`.
|
||||
|
||||
The next chunk shows the three CXL host bridges without attached endpoints.
|
||||
|
||||
::
|
||||
|
||||
{
|
||||
"port":"port2",
|
||||
"host":"pci0000:00",
|
||||
"depth":1,
|
||||
"nr_dports":2,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"0000:00:01.3",
|
||||
"alias":"device:55",
|
||||
"id":2
|
||||
},
|
||||
{
|
||||
"dport":"0000:00:07.1",
|
||||
"alias":"device:5d",
|
||||
"id":113
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"port":"port3",
|
||||
"host":"pci0000:a8",
|
||||
"depth":1,
|
||||
"nr_dports":1,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"0000:a8:01.1",
|
||||
"alias":"device:c3",
|
||||
"id":0
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"port":"port4",
|
||||
"host":"pci0000:2a",
|
||||
"depth":1,
|
||||
"nr_dports":1,
|
||||
"dports":[
|
||||
{
|
||||
"dport":"0000:2a:01.1",
|
||||
"alias":"device:d0",
|
||||
"id":0
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
Next we have the `Root Decoders` belonging to :code:`root0`. This root decoder
|
||||
is a pass-through decoder because :code:`interleave_ways` is set to :code:`1`.
|
||||
|
||||
This information is generated by the CXL driver reading the ACPI CEDT CFMWS.
|
||||
|
||||
::
|
||||
|
||||
"decoders:root0":[
|
||||
{
|
||||
"decoder":"decoder0.0",
|
||||
"resource":825975898112,
|
||||
"size":137438953472,
|
||||
"interleave_ways":1,
|
||||
"max_available_extent":0,
|
||||
"volatile_capable":true,
|
||||
"nr_targets":1,
|
||||
"targets":[
|
||||
{
|
||||
"target":"pci0000:d2",
|
||||
"alias":"ACPI0016:00",
|
||||
"position":0,
|
||||
"id":5
|
||||
}
|
||||
],
|
||||
|
||||
Finally we have the `Memory Region` associated with the `Root Decoder`
|
||||
:code:`decoder0.0`. This region describes the discrete region associated
|
||||
with the lone device.
|
||||
|
||||
::
|
||||
|
||||
"regions:decoder0.0":[
|
||||
{
|
||||
"region":"region0",
|
||||
"resource":825975898112,
|
||||
"size":137438953472,
|
||||
"type":"ram",
|
||||
"interleave_ways":1,
|
||||
"decode_state":"commit",
|
||||
"mappings":[
|
||||
{
|
||||
"position":0,
|
||||
"memdev":"mem0",
|
||||
"decoder":"decoder5.0"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
78
Documentation/driver-api/cxl/linux/memory-hotplug.rst
Normal file
78
Documentation/driver-api/cxl/linux/memory-hotplug.rst
Normal file
|
|
@ -0,0 +1,78 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==============
|
||||
Memory Hotplug
|
||||
==============
|
||||
The final phase of surfacing CXL memory to the kernel page allocator is for
|
||||
the `DAX` driver to surface a `Driver Managed` memory region via the
|
||||
memory-hotplug component.
|
||||
|
||||
There are four major configurations to consider:
|
||||
|
||||
1) Default Online Behavior (on/off and zone)
|
||||
2) Hotplug Memory Block size
|
||||
3) Memory Map Resource location
|
||||
4) Driver-Managed Memory Designation
|
||||
|
||||
Default Online Behavior
|
||||
=======================
|
||||
The default-online behavior of hotplug memory is dictated by the following,
|
||||
in order of precedence:
|
||||
|
||||
- :code:`CONFIG_MHP_DEFAULT_ONLINE_TYPE` Build Configuration
|
||||
- :code:`memhp_default_state` Boot parameter
|
||||
- :code:`/sys/devices/system/memory/auto_online_blocks` value
|
||||
|
||||
These dictate whether hotplugged memory blocks arrive in one of three states:
|
||||
|
||||
1) Offline
|
||||
2) Online in :code:`ZONE_NORMAL`
|
||||
3) Online in :code:`ZONE_MOVABLE`
|
||||
|
||||
:code:`ZONE_NORMAL` implies this capacity may be used for almost any allocation,
|
||||
while :code:`ZONE_MOVABLE` implies this capacity should only be used for
|
||||
migratable allocations.
|
||||
|
||||
:code:`ZONE_MOVABLE` attempts to retain the hotplug-ability of a memory block
|
||||
so that the entire region may be hot-unplugged at a later time. Any capacity
|
||||
onlined into :code:`ZONE_NORMAL` should be considered permanently attached to
|
||||
the page allocator.
|
||||
|
||||
Hotplug Memory Block Size
|
||||
=========================
|
||||
By default, on most architectures, the Hotplug Memory Block Size is either
|
||||
128MB or 256MB. On x86, the block size increases up to 2GB as total memory
|
||||
capacity exceeds 64GB. As of v6.15, Linux does not take into account the
|
||||
size and alignment of the ACPI CEDT CFMWS regions (see Early Boot docs) when
|
||||
deciding the Hotplug Memory Block Size.
|
||||
|
||||
Memory Map
|
||||
==========
|
||||
The location of :code:`struct folio` allocations to represent the hotplugged
|
||||
memory capacity is dictated by the following system settings:
|
||||
|
||||
- :code:`/sys/module/memory_hotplug/parameters/memmap_on_memory`
|
||||
- :code:`/sys/bus/dax/devices/daxN.Y/memmap_on_memory`
|
||||
|
||||
If both of these parameters are set to true, :code:`struct folio` for this
|
||||
capacity will be carved out of the memory block being onlined. This has
|
||||
performance implications if the memory is particularly high-latency and
|
||||
its :code:`struct folio` becomes hotly contended.
|
||||
|
||||
If either parameter is set to false, :code:`struct folio` for this capacity
|
||||
will be allocated from the local node of the processor running the hotplug
|
||||
procedure. This capacity will be allocated from :code:`ZONE_NORMAL` on
|
||||
that node, as it is a :code:`GFP_KERNEL` allocation.
|
||||
|
||||
Systems with extremely large amounts of :code:`ZONE_MOVABLE` memory (e.g.
|
||||
CXL memory pools) must ensure that there is sufficient local
|
||||
:code:`ZONE_NORMAL` capacity to host the memory map for the hotplugged capacity.
|
||||
|
||||
Driver Managed Memory
|
||||
=====================
|
||||
The DAX driver surfaces this memory to memory-hotplug as "Driver Managed". This
|
||||
is not a configurable setting, but it's important to note that driver managed
|
||||
memory is explicitly excluded from use during kexec. This is required to ensure
|
||||
any reset or out-of-band operations that the CXL device may be subject to during
|
||||
a functional system-reboot (such as a reset-on-probe) will not cause portions of
|
||||
the kexec kernel to be overwritten.
|
||||
103
Documentation/driver-api/cxl/linux/overview.rst
Normal file
103
Documentation/driver-api/cxl/linux/overview.rst
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
========
|
||||
Overview
|
||||
========
|
||||
|
||||
This section presents the configuration process of a CXL Type-3 memory device,
|
||||
and how it is ultimately exposed to users as either a :code:`DAX` device or
|
||||
normal memory pages via the kernel's page allocator.
|
||||
|
||||
Portions marked with a bullet are points at which certain kernel objects
|
||||
are generated.
|
||||
|
||||
1) Early Boot
|
||||
|
||||
a) BIOS, Build, and Boot Parameters
|
||||
|
||||
i) EFI_MEMORY_SP
|
||||
ii) CONFIG_EFI_SOFT_RESERVE
|
||||
iii) CONFIG_MHP_DEFAULT_ONLINE_TYPE
|
||||
iv) nosoftreserve
|
||||
|
||||
b) Memory Map Creation
|
||||
|
||||
i) EFI Memory Map / E820 Consulted for Soft-Reserved
|
||||
|
||||
* CXL Memory is set aside to be handled by the CXL driver
|
||||
|
||||
* Soft-Reserved IO Resource created for CFMWS entry
|
||||
|
||||
c) NUMA Node Creation
|
||||
|
||||
* Nodes created from ACPI CEDT CFMWS and SRAT Proximity domains (PXM)
|
||||
|
||||
d) Memory Tier Creation
|
||||
|
||||
* A default memory_tier is created with all nodes.
|
||||
|
||||
e) Contiguous Memory Allocation
|
||||
|
||||
* Any requested CMA is allocated from Online nodes
|
||||
|
||||
f) Init Finishes, Drivers start probing
|
||||
|
||||
2) ACPI and PCI Drivers
|
||||
|
||||
a) Detects PCI device is CXL, marking it for probe by CXL driver
|
||||
|
||||
3) CXL Driver Operation
|
||||
|
||||
a) Base device creation
|
||||
|
||||
* root, port, and memdev devices created
|
||||
* CEDT CFMWS IO Resource creation
|
||||
|
||||
b) Decoder creation
|
||||
|
||||
* root, switch, and endpoint decoders created
|
||||
|
||||
c) Logical device creation
|
||||
|
||||
* memory_region and endpoint devices created
|
||||
|
||||
d) Devices are associated with each other
|
||||
|
||||
* If auto-decoder (BIOS-programmed decoders), driver validates
|
||||
configurations, builds associations, and locks configs at probe time.
|
||||
|
||||
* If user-configured, validation and associations are built at
|
||||
decoder-commit time.
|
||||
|
||||
e) Regions surfaced as DAX region
|
||||
|
||||
* dax_region created
|
||||
|
||||
* DAX device created via DAX driver
|
||||
|
||||
4) DAX Driver Operation
|
||||
|
||||
a) DAX driver surfaces DAX region as one of two dax device modes
|
||||
|
||||
* kmem - dax device is converted to hotplug memory blocks
|
||||
|
||||
* DAX kmem IO Resource creation
|
||||
|
||||
* hmem - dax device is left as daxdev to be accessed as a file.
|
||||
|
||||
* If hmem, journey ends here.
|
||||
|
||||
b) DAX kmem surfaces memory region to Memory Hotplug to add to page
|
||||
allocator as "driver managed memory"
|
||||
|
||||
5) Memory Hotplug
|
||||
|
||||
a) mhp component surfaces a dax device memory region as multiple memory
|
||||
blocks to the page allocator
|
||||
|
||||
* blocks appear in :code:`/sys/bus/memory/devices` and linked to a NUMA node
|
||||
|
||||
b) blocks are onlined into the requested zone (NORMAL or MOVABLE)
|
||||
|
||||
* Memory is marked "Driver Managed" to prevent kexec from using it as a region
|
||||
for kernel updates
|
||||
|
|
@ -51,9 +51,9 @@ in place, but there are several corner cases that are pending closure.
|
|||
|
||||
* [2] CXL Window Enumeration
|
||||
|
||||
* [0] :ref:`Extended-linear memory-side cache <extended-linear>`
|
||||
* [2] :ref:`Extended-linear memory-side cache <extended-linear>`
|
||||
* [0] Low Memory-hole
|
||||
* [0] Hetero-interleave
|
||||
* [X] Hetero-interleave
|
||||
|
||||
* [2] Switch Enumeration
|
||||
|
||||
|
|
@ -173,7 +173,7 @@ Accelerator
|
|||
User Flow Support
|
||||
-----------------
|
||||
|
||||
* [0] HPA->DPA Address translation (need xormaps export solution)
|
||||
* [0] Inject & clear poison by HPA
|
||||
|
||||
Details
|
||||
=======
|
||||
|
|
|
|||
76
Documentation/driver-api/cxl/platform/acpi.rst
Normal file
76
Documentation/driver-api/cxl/platform/acpi.rst
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===========
|
||||
ACPI Tables
|
||||
===========
|
||||
|
||||
ACPI is the "Advanced Configuration and Power Interface", which is a standard
|
||||
that defines how platforms and OS manage power and configure computer hardware.
|
||||
For the purpose of this theory of operation, when referring to "ACPI" we will
|
||||
usually refer to "ACPI Tables" - which are the way a platform (BIOS/EFI)
|
||||
communicates static configuration information to the operating system.
|
||||
|
||||
The following ACPI tables contain *static* configuration and performance data
|
||||
about CXL devices.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
acpi/cedt.rst
|
||||
acpi/srat.rst
|
||||
acpi/hmat.rst
|
||||
acpi/slit.rst
|
||||
acpi/dsdt.rst
|
||||
|
||||
The SRAT table may also contain generic port/initiator content that is intended
|
||||
to describe the generic port, but not information about the rest of the path to
|
||||
the endpoint.
|
||||
|
||||
Linux uses these tables to configure kernel resources for statically configured
|
||||
(by BIOS/EFI) CXL devices, such as:
|
||||
|
||||
- NUMA nodes
|
||||
- Memory Tiers
|
||||
- NUMA Abstract Distances
|
||||
- SystemRAM Memory Regions
|
||||
- Weighted Interleave Node Weights
|
||||
|
||||
ACPI Debugging
|
||||
==============
|
||||
|
||||
The :code:`acpidump -b` command dumps the ACPI tables into binary format.
|
||||
|
||||
The :code:`iasl -d` command disassembles the files into human readable format.
|
||||
|
||||
Example :code:`acpidump -b && iasl -d cedt.dat` ::
|
||||
|
||||
[000h 0000 4] Signature : "CEDT" [CXL Early Discovery Table]
|
||||
|
||||
Common Issues
|
||||
-------------
|
||||
Most failures described here result in a failure of the driver to surface
|
||||
memory as a DAX device and/or kmem.
|
||||
|
||||
* CEDT CFMWS targets list UIDs do not match CEDT CHBS UIDs.
|
||||
* CEDT CFMWS targets list UIDs do not match DSDT CXL Host Bridge UIDs.
|
||||
* CEDT CFMWS Restriction Bits are not correct.
|
||||
* CEDT CFMWS Memory regions are poorly aligned.
|
||||
* CEDT CFMWS Memory regions span a platform memory hole.
|
||||
* CEDT CHBS UIDs do not match DSDT CXL Host Bridge UIDs.
|
||||
* CEDT CHBS Specification version is incorrect.
|
||||
* SRAT is missing regions described in CEDT CFMWS.
|
||||
|
||||
* Result: failure to create a NUMA node for the region, or
|
||||
region is placed in wrong node.
|
||||
|
||||
* HMAT is missing data for regions described in CEDT CFMWS.
|
||||
|
||||
* Result: NUMA node being placed in the wrong memory tier.
|
||||
|
||||
* SLIT has bad data.
|
||||
|
||||
* Result: Lots of performance mechanisms in the kernel will be very unhappy.
|
||||
|
||||
All of these issues will appear to users as if the driver is failing to
|
||||
support CXL - when in reality they are all the failure of a platform to
|
||||
configure the ACPI tables correctly.
|
||||
62
Documentation/driver-api/cxl/platform/acpi/cedt.rst
Normal file
62
Documentation/driver-api/cxl/platform/acpi/cedt.rst
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
================================
|
||||
CEDT - CXL Early Discovery Table
|
||||
================================
|
||||
|
||||
The CXL Early Discovery Table is generated by BIOS to describe the CXL memory
|
||||
regions configured at boot by the BIOS.
|
||||
|
||||
CHBS
|
||||
====
|
||||
The CXL Host Bridge Structure describes CXL host bridges. Other than describing
|
||||
device register information, it reports the specific host bridge UID for this
|
||||
host bridge. These host bridge IDs will be referenced in other tables.
|
||||
|
||||
Example ::
|
||||
|
||||
Subtable Type : 00 [CXL Host Bridge Structure]
|
||||
Reserved : 00
|
||||
Length : 0020
|
||||
Associated host bridge : 00000007 <- Host bridge _UID
|
||||
Specification version : 00000001
|
||||
Reserved : 00000000
|
||||
Register base : 0000010370400000
|
||||
Register length : 0000000000010000
|
||||
|
||||
CFMWS
|
||||
=====
|
||||
The CXL Fixed Memory Window structure describes a memory region associated
|
||||
with one or more CXL host bridges (as described by the CHBS). It additionally
|
||||
describes any inter-host-bridge interleave configuration that may have been
|
||||
programmed by BIOS.
|
||||
|
||||
Example ::
|
||||
|
||||
Subtable Type : 01 [CXL Fixed Memory Window Structure]
|
||||
Reserved : 00
|
||||
Length : 002C
|
||||
Reserved : 00000000
|
||||
Window base address : 000000C050000000 <- Memory Region
|
||||
Window size : 0000003CA0000000
|
||||
Interleave Members (2^n) : 01 <- Interleave configuration
|
||||
Interleave Arithmetic : 00
|
||||
Reserved : 0000
|
||||
Granularity : 00000000
|
||||
Restrictions : 0006
|
||||
QtgId : 0001
|
||||
First Target : 00000007 <- Host Bridge _UID
|
||||
Next Target : 00000006 <- Host Bridge _UID
|
||||
|
||||
The restriction field dictates what this SPA range may be used for (memory type,
|
||||
volatile vs persistent, etc). One or more bits may be set. ::
|
||||
|
||||
Bit[0]: CXL Type 2 Memory
|
||||
Bit[1]: CXL Type 3 Memory
|
||||
Bit[2]: Volatile Memory
|
||||
Bit[3]: Persistent Memory
|
||||
Bit[4]: Fixed Config (HPA cannot be re-used)
|
||||
|
||||
INTRA-host-bridge interleave (multiple devices on one host bridge) is NOT
|
||||
reported in this structure, and is solely defined via CXL device decoder
|
||||
programming (host bridge and endpoint decoders).
|
||||
28
Documentation/driver-api/cxl/platform/acpi/dsdt.rst
Normal file
28
Documentation/driver-api/cxl/platform/acpi/dsdt.rst
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==============================================
|
||||
DSDT - Differentiated System Description Table
|
||||
==============================================
|
||||
|
||||
This table describes what peripherals a machine has.
|
||||
|
||||
This table's UIDs for CXL devices - specifically host bridges, must be
|
||||
consistent with the contents of the CEDT, otherwise the CXL driver will
|
||||
fail to probe correctly.
|
||||
|
||||
Example Compute Express Link Host Bridge ::
|
||||
|
||||
Scope (_SB)
|
||||
{
|
||||
Device (S0D0)
|
||||
{
|
||||
Name (_HID, "ACPI0016" /* Compute Express Link Host Bridge */) // _HID: Hardware ID
|
||||
Name (_CID, Package (0x02) // _CID: Compatible ID
|
||||
{
|
||||
EisaId ("PNP0A08") /* PCI Express Bus */,
|
||||
EisaId ("PNP0A03") /* PCI Bus */
|
||||
})
|
||||
...
|
||||
Name (_UID, 0x05) // _UID: Unique ID
|
||||
...
|
||||
}
|
||||
32
Documentation/driver-api/cxl/platform/acpi/hmat.rst
Normal file
32
Documentation/driver-api/cxl/platform/acpi/hmat.rst
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===========================================
|
||||
HMAT - Heterogeneous Memory Attribute Table
|
||||
===========================================
|
||||
|
||||
The Heterogeneous Memory Attributes Table contains information such as cache
|
||||
attributes and bandwidth and latency details for memory proximity domains.
|
||||
For the purpose of this document, we will only discuss the SLLBI entry.
|
||||
|
||||
SLLBI
|
||||
=====
|
||||
The System Locality Latency and Bandwidth Information records latency and
|
||||
bandwidth information for proximity domains.
|
||||
|
||||
This table is used by Linux to configure interleave weights and memory tiers.
|
||||
|
||||
Example (Heavily truncated for brevity) ::
|
||||
|
||||
Structure Type : 0001 [SLLBI]
|
||||
Data Type : 00 <- Latency
|
||||
Target Proximity Domain List : 00000000
|
||||
Target Proximity Domain List : 00000001
|
||||
Entry : 0080 <- DRAM LTC
|
||||
Entry : 0100 <- CXL LTC
|
||||
|
||||
Structure Type : 0001 [SLLBI]
|
||||
Data Type : 03 <- Bandwidth
|
||||
Target Proximity Domain List : 00000000
|
||||
Target Proximity Domain List : 00000001
|
||||
Entry : 1200 <- DRAM BW
|
||||
Entry : 0200 <- CXL BW
|
||||
21
Documentation/driver-api/cxl/platform/acpi/slit.rst
Normal file
21
Documentation/driver-api/cxl/platform/acpi/slit.rst
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
========================================
|
||||
SLIT - System Locality Information Table
|
||||
========================================
|
||||
|
||||
The system locality information table provides "abstract distances" between
|
||||
accessor and memory nodes. Nodes without initiators (cpus) are an infinite (FF)
|
||||
distance away from all other nodes.
|
||||
|
||||
The abstract distance described in this table does not describe any real
|
||||
latency or bandwidth information.
|
||||
|
||||
Example ::
|
||||
|
||||
Signature : "SLIT" [System Locality Information Table]
|
||||
Localities : 0000000000000004
|
||||
Locality 0 : 10 20 20 30
|
||||
Locality 1 : 20 10 30 20
|
||||
Locality 2 : FF FF 0A FF
|
||||
Locality 3 : FF FF FF 0A
|
||||
71
Documentation/driver-api/cxl/platform/acpi/srat.rst
Normal file
71
Documentation/driver-api/cxl/platform/acpi/srat.rst
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=====================================
|
||||
SRAT - Static Resource Affinity Table
|
||||
=====================================
|
||||
|
||||
The System/Static Resource Affinity Table describes resource (CPU, Memory)
|
||||
affinity to "Proximity Domains". This table is technically optional, but for
|
||||
performance information (see "HMAT") to be enumerated by linux it must be
|
||||
present.
|
||||
|
||||
There is a careful dance between the CEDT and SRAT tables and how NUMA nodes are
|
||||
created. If things don't look quite the way you expect - check the SRAT Memory
|
||||
Affinity entries and CEDT CFMWS to determine what your platform actually
|
||||
supports in terms of flexible topologies.
|
||||
|
||||
The SRAT may statically assign portions of a CFMWS SPA range to a specific
|
||||
proximity domain. See linux numa creation for more information about how
|
||||
this presents in the NUMA topology.
|
||||
|
||||
Proximity Domain
|
||||
================
|
||||
A proximity domain is ROUGHLY equivalent to "NUMA Node" - though a 1-to-1
|
||||
mapping is not guaranteed. There are scenarios where "Proximity Domain 4" may
|
||||
map to "NUMA Node 3", for example. (See "NUMA Node Creation")
|
||||
|
||||
Memory Affinity
|
||||
===============
|
||||
Generally speaking, if a host does any amount of CXL fabric (decoder)
|
||||
programming in BIOS - an SRAT entry for that memory needs to be present.
|
||||
|
||||
Example ::
|
||||
|
||||
Subtable Type : 01 [Memory Affinity]
|
||||
Length : 28
|
||||
Proximity Domain : 00000001 <- NUMA Node 1
|
||||
Reserved1 : 0000
|
||||
Base Address : 000000C050000000 <- Physical Memory Region
|
||||
Address Length : 0000003CA0000000
|
||||
Reserved2 : 00000000
|
||||
Flags (decoded below) : 0000000B
|
||||
Enabled : 1
|
||||
Hot Pluggable : 1
|
||||
Non-Volatile : 0
|
||||
|
||||
|
||||
Generic Port Affinity
|
||||
=====================
|
||||
The Generic Port Affinity subtable provides an association between a proximity
|
||||
domain and a device handle representing a Generic Port such as a CXL host
|
||||
bridge. With the association, latency and bandwidth numbers can be retrieved
|
||||
from the SRAT for the path between CPU(s) (initiator) and the Generic Port.
|
||||
This is used to construct performance coordinates for hotplugged CXL DEVICES,
|
||||
which cannot be enumerated at boot by platform firmware.
|
||||
|
||||
Example ::
|
||||
|
||||
Subtable Type : 06 [Generic Port Affinity]
|
||||
Length : 20 <- 32d, length of table
|
||||
Reserved : 00
|
||||
Device Handle Type : 00 <- 0 - ACPI, 1 - PCI
|
||||
Proximity Domain : 00000001
|
||||
Device Handle : ACPI0016:01
|
||||
Flags : 00000001 <- Bit 0 (Enabled)
|
||||
Reserved : 00000000
|
||||
|
||||
The Proximity Domain is matched up to the :doc:`HMAT <hmat>` SLLBI Target
|
||||
Proximity Domain List for the related latency or bandwidth numbers. Those
|
||||
performance numbers are tied to a CXL host bridge via the Device Handle.
|
||||
The driver uses the association to retrieve the Generic Port performance
|
||||
numbers for the whole CXL path access coordinates calculation.
|
||||
262
Documentation/driver-api/cxl/platform/bios-and-efi.rst
Normal file
262
Documentation/driver-api/cxl/platform/bios-and-efi.rst
Normal file
|
|
@ -0,0 +1,262 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
======================
|
||||
BIOS/EFI Configuration
|
||||
======================
|
||||
|
||||
BIOS and EFI are largely responsible for configuring static information about
|
||||
devices (or potential future devices) such that Linux can build the appropriate
|
||||
logical representations of these devices.
|
||||
|
||||
At a high level, this is what occurs during this phase of configuration.
|
||||
|
||||
* The bootloader starts the BIOS/EFI.
|
||||
|
||||
* BIOS/EFI do early device probe to determine static configuration
|
||||
|
||||
* BIOS/EFI creates ACPI Tables that describe static config for the OS
|
||||
|
||||
* BIOS/EFI create the system memory map (EFI Memory Map, E820, etc)
|
||||
|
||||
* BIOS/EFI calls :code:`start_kernel` and begins the Linux Early Boot process.
|
||||
|
||||
Much of what this section is concerned with is ACPI Table production and
|
||||
static memory map configuration. More detail on these tables can be found
|
||||
at :doc:`ACPI Tables <acpi>`.
|
||||
|
||||
.. note::
|
||||
Platform Vendors should read carefully, as this section has recommendations
|
||||
on physical memory region size and alignment, memory holes, HDM interleave,
|
||||
and what linux expects of HDM decoders trying to work with these features.
|
||||
|
||||
UEFI Settings
|
||||
=============
|
||||
If your platform supports it, the :code:`uefisettings` command can be used to
|
||||
read/write EFI settings. Changes will be reflected on the next reboot. Kexec
|
||||
is not a sufficient reboot.
|
||||
|
||||
One notable configuration here is the EFI_MEMORY_SP (Specific Purpose) bit.
|
||||
When this is enabled, this bit tells linux to defer management of a memory
|
||||
region to a driver (in this case, the CXL driver). Otherwise, the memory is
|
||||
treated as "normal memory", and is exposed to the page allocator during
|
||||
:code:`__init`.
|
||||
|
||||
uefisettings examples
|
||||
---------------------
|
||||
|
||||
:code:`uefisettings identify` ::
|
||||
|
||||
uefisettings identify
|
||||
|
||||
bios_vendor: xxx
|
||||
bios_version: xxx
|
||||
bios_release: xxx
|
||||
bios_date: xxx
|
||||
product_name: xxx
|
||||
product_family: xxx
|
||||
product_version: xxx
|
||||
|
||||
On some AMD platforms, the :code:`EFI_MEMORY_SP` bit is set via the :code:`CXL
|
||||
Memory Attribute` field. This may be called something else on your platform.
|
||||
|
||||
:code:`uefisettings get "CXL Memory Attribute"` ::
|
||||
|
||||
selector: xxx
|
||||
...
|
||||
question: Question {
|
||||
name: "CXL Memory Attribute",
|
||||
answer: "Enabled",
|
||||
...
|
||||
}
|
||||
|
||||
Physical Memory Map
|
||||
===================
|
||||
|
||||
Physical Address Region Alignment
|
||||
---------------------------------
|
||||
|
||||
As of Linux v6.14, the hotplug memory system requires memory regions to be
|
||||
uniform in size and alignment. While the CXL specification allows for memory
|
||||
regions as small as 256MB, the supported memory block size and alignment for
|
||||
hotplugged memory is architecture-defined.
|
||||
|
||||
Linux memory blocks may be as small as 128MB and increase in powers of two.
|
||||
|
||||
* On ARM, the default block size and alignment is either 128MB or 256MB.
|
||||
|
||||
* On x86, the default block size is 256MB, and increases to 2GB as the
|
||||
capacity of the system increases up to 64GB.
|
||||
|
||||
For best support across versions, platform vendors should place CXL memory at
|
||||
a 2GB aligned base address, and regions should be 2GB aligned. This also helps
|
||||
prevent the creation of thousands of memory devices (one per block).
|
||||
|
||||
Memory Holes
|
||||
------------
|
||||
|
||||
Holes in the memory map are tricky. Consider a 4GB device located at base
|
||||
address 0x100000000, but with the following memory map ::
|
||||
|
||||
---------------------
|
||||
| 0x100000000 |
|
||||
| CXL |
|
||||
| 0x1BFFFFFFF |
|
||||
---------------------
|
||||
| 0x1C0000000 |
|
||||
| MEMORY HOLE |
|
||||
| 0x1FFFFFFFF |
|
||||
---------------------
|
||||
| 0x200000000 |
|
||||
| CXL CONT. |
|
||||
| 0x23FFFFFFF |
|
||||
---------------------
|
||||
|
||||
There are two issues to consider:
|
||||
|
||||
* decoder programming, and
|
||||
* memory block alignment.
|
||||
|
||||
If your architecture requires 2GB uniform size and aligned memory blocks, the
|
||||
only capacity Linux is capable of mapping (as of v6.14) would be the capacity
|
||||
from `0x100000000-0x180000000`. The remaining capacity will be stranded, as
|
||||
it is not of 2GB aligned length.
|
||||
|
||||
Assuming your architecture and memory configuration allows 1GB memory blocks,
|
||||
this memory map is supported and this should be presented as multiple CFMWS
|
||||
in the CEDT that describe each side of the memory hole separately - along with
|
||||
matching decoders.
|
||||
|
||||
Multiple decoders can (and should) be used to manage such a memory hole (see
|
||||
below), but each chunk of a memory hole should be aligned to a reasonable block
|
||||
size (larger alignment is always better). If you intend to have memory holes
|
||||
in the memory map, expect to use one decoder per contiguous chunk of host
|
||||
physical memory.
|
||||
|
||||
As of v6.14, Linux does provide support for memory hotplug of multiple
|
||||
physical memory regions separated by a memory hole described by a single
|
||||
HDM decoder.
|
||||
|
||||
|
||||
Decoder Programming
|
||||
===================
|
||||
If BIOS/EFI intends to program the decoders to be statically configured,
|
||||
there are a few things to consider to avoid major pitfalls that will
|
||||
prevent Linux compatibility. Some of these recommendations are not
|
||||
required "per the specification", but Linux makes no guarantees of support
|
||||
otherwise.
|
||||
|
||||
|
||||
Translation Point
|
||||
-----------------
|
||||
Per the specification, the only decoders which **TRANSLATE** Host Physical
|
||||
Address (HPA) to Device Physical Address (DPA) are the **Endpoint Decoders**.
|
||||
All other decoders in the fabric are intended to route accesses without
|
||||
translating the addresses.
|
||||
|
||||
This is heavily implied by the specification, see: ::
|
||||
|
||||
CXL Specification 3.1
|
||||
8.2.4.20: CXL HDM Decoder Capability Structure
|
||||
- Implementation Note: CXL Host Bridge and Upstream Switch Port Decoder Flow
|
||||
- Implementation Note: Device Decoder Logic
|
||||
|
||||
Given this, Linux makes a strong assumption that decoders between CPU and
|
||||
endpoint will all be programmed with address ranges that are subsets of
|
||||
their parent decoder.
|
||||
|
||||
Due to some ambiguity in how Architecture, ACPI, PCI, and CXL specifications
|
||||
"hand off" responsibility between domains, some early adopting platforms
|
||||
attempted to do translation at the originating memory controller or host
|
||||
bridge. This configuration requires a platform specific extension to the
|
||||
driver and is not officially endorsed - despite being supported.
|
||||
|
||||
It is *highly recommended* **NOT** to do this; otherwise, you are on your own
|
||||
to implement driver support for your platform.
|
||||
|
||||
Interleave and Configuration Flexibility
|
||||
----------------------------------------
|
||||
If providing cross-host-bridge interleave, a CFMWS entry in the :doc:`CEDT
|
||||
<acpi/cedt>` must be presented with target host-bridges for the interleaved
|
||||
device sets (there may be multiple behind each host bridge).
|
||||
|
||||
If providing intra-host-bridge interleaving, only 1 CFMWS entry in the CEDT is
|
||||
required for that host bridge - if it covers the entire capacity of the devices
|
||||
behind the host bridge.
|
||||
|
||||
If intending to provide users flexibility in programming decoders beyond the
|
||||
root, you may want to provide multiple CFMWS entries in the CEDT intended for
|
||||
different purposes. For example, you may want to consider adding:
|
||||
|
||||
1) A CFMWS entry to cover all interleavable host bridges.
|
||||
2) A CFMWS entry to cover all devices on a single host bridge.
|
||||
3) A CFMWS entry to cover each device.
|
||||
|
||||
A platform may choose to add all of these, or change the mode based on a BIOS
|
||||
setting. For each CFMWS entry, Linux expects descriptions of the described
|
||||
memory regions in the :doc:`SRAT <acpi/srat>` to determine the number of
|
||||
NUMA nodes it should reserve during early boot / init.
|
||||
|
||||
As of v6.14, Linux will create a NUMA node for each CEDT CFMWS entry, even if
|
||||
a matching SRAT entry does not exist; however, this is not guaranteed in the
|
||||
future and such a configuration should be avoided.
|
||||
|
||||
Memory Holes
|
||||
------------
|
||||
If your platform includes memory holes interspersed between your CXL memory, it
|
||||
is recommended to utilize multiple decoders to cover these regions of memory,
|
||||
rather than try to program the decoders to accept the entire range and expect
|
||||
Linux to manage the overlap.
|
||||
|
||||
For example, consider the Memory Hole described above ::
|
||||
|
||||
---------------------
|
||||
| 0x100000000 |
|
||||
| CXL |
|
||||
| 0x1BFFFFFFF |
|
||||
---------------------
|
||||
| 0x1C0000000 |
|
||||
| MEMORY HOLE |
|
||||
| 0x1FFFFFFFF |
|
||||
---------------------
|
||||
| 0x200000000 |
|
||||
| CXL CONT. |
|
||||
| 0x23FFFFFFF |
|
||||
---------------------
|
||||
|
||||
Assuming this is provided by a single device attached directly to a host bridge,
|
||||
Linux would expect the following decoder programming ::
|
||||
|
||||
----------------------- -----------------------
|
||||
| root-decoder-0 | | root-decoder-1 |
|
||||
| base: 0x100000000 | | base: 0x200000000 |
|
||||
| size: 0xC0000000 | | size: 0x40000000 |
|
||||
----------------------- -----------------------
|
||||
| |
|
||||
----------------------- -----------------------
|
||||
| HB-decoder-0 | | HB-decoder-1 |
|
||||
| base: 0x100000000 | | base: 0x200000000 |
|
||||
| size: 0xC0000000 | | size: 0x40000000 |
|
||||
----------------------- -----------------------
|
||||
| |
|
||||
----------------------- -----------------------
|
||||
| ep-decoder-0 | | ep-decoder-1 |
|
||||
| base: 0x100000000 | | base: 0x200000000 |
|
||||
| size: 0xC0000000 | | size: 0x40000000 |
|
||||
----------------------- -----------------------
|
||||
|
||||
This pairs with a CEDT configuration containing two CFMWS describing the above root decoders.
|
||||
|
||||
Linux makes no guarantee of support for strange memory hole situations.
|
||||
|
||||
Multi-Media Devices
|
||||
-------------------
|
||||
The CFMWS field of the CEDT has special restriction bits which describe whether
|
||||
the described memory region allows volatile or persistent memory (or both). If
|
||||
the platform intends to support either:
|
||||
|
||||
1) A device with multiple media types, or
|
||||
2) Using a persistent memory device as normal memory
|
||||
|
||||
A platform may wish to create multiple CEDT CFMWS entries to describe the same
|
||||
memory, with the intent of allowing the end user flexibility in how that memory
|
||||
is configured. Linux does not presently have strong requirements in this area.
|
||||
118
Documentation/driver-api/cxl/platform/cdat.rst
Normal file
118
Documentation/driver-api/cxl/platform/cdat.rst
Normal file
|
|
@ -0,0 +1,118 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
======================================
|
||||
Coherent Device Attribute Table (CDAT)
|
||||
======================================
|
||||
|
||||
The CDAT provides functional and performance attributes of devices such
|
||||
as CXL accelerators, switches, or endpoints. The table formatting is
|
||||
similar to ACPI tables. CDAT data may be parsed by BIOS at boot or may
|
||||
be enumerated at runtime (after device hotplug, for example).
|
||||
|
||||
Terminology:
|
||||
DPA - Device Physical Address, used by the CXL device to denote the address
|
||||
it supports for that device.
|
||||
|
||||
DSMADHandle - A device unique handle that is associated with a DPA range
|
||||
defined by the DSMAS table.
|
||||
|
||||
|
||||
===============================================
|
||||
Device Scoped Memory Affinity Structure (DSMAS)
|
||||
===============================================
|
||||
|
||||
The DSMAS contains information such as DSMADHandle, the DPA Base, and DPA
|
||||
Length.
|
||||
|
||||
This table is used by Linux in conjunction with the Device Scoped Latency and
|
||||
Bandwidth Information Structure (DSLBIS) to determine the performance
|
||||
attributes of the CXL device itself.
|
||||
|
||||
Example ::
|
||||
|
||||
Structure Type : 00 [DSMAS]
|
||||
Reserved : 00
|
||||
Length : 0018 <- 24d, size of structure
|
||||
DSMADHandle : 01
|
||||
Flags : 00
|
||||
Reserved : 0000
|
||||
DPA Base : 0000000040000000 <- 1GiB base
|
||||
DPA Length : 0000000080000000 <- 2GiB size
|
||||
|
||||
|
||||
==================================================================
|
||||
Device Scoped Latency and Bandwidth Information Structure (DSLBIS)
|
||||
==================================================================
|
||||
|
||||
This table is used by Linux in conjunction with DSMAS to determine the
|
||||
performance attributes of a CXL device. The DSLBIS contains latency
|
||||
and bandwidth information based on DSMADHandle matching.
|
||||
|
||||
Example ::
|
||||
|
||||
Structure Type : 01 [DSLBIS]
|
||||
Reserved : 00
|
||||
Length : 18 <- 24d, size of structure
|
||||
Handle : 0001 <- DSMAS handle
|
||||
Flags : 00 <- Matches flag field for HMAT SLLBIS
|
||||
Data Type : 00 <- Latency
|
||||
Entry Base Unit : 0000000000001000 <- Entry Base Unit field in HMAT SLLBIS
|
||||
Entry : 010000000000 <- First byte used here, CXL LTC
|
||||
Reserved : 0000
|
||||
|
||||
Structure Type : 01 [DSLBIS]
|
||||
Reserved : 00
|
||||
Length : 18 <- 24d, size of structure
|
||||
Handle : 0001 <- DSMAS handle
|
||||
Flags : 00 <- Matches flag field for HMAT SLLBIS
|
||||
Data Type : 03 <- Bandwidth
|
||||
Entry Base Unit : 0000000000001000 <- Entry Base Unit field in HMAT SLLBIS
|
||||
Entry : 020000000000 <- First byte used here, CXL BW
|
||||
Reserved : 0000
|
||||
|
||||
|
||||
==================================================================
|
||||
Switch Scoped Latency and Bandwidth Information Structure (SSLBIS)
|
||||
==================================================================
|
||||
|
||||
The SSLBIS contains information about the latency and bandwidth of a switch.
|
||||
|
||||
The table is used by Linux to compute the performance coordinates of a CXL path
|
||||
from the device to the root port where a switch is part of the path.
|
||||
|
||||
Example ::
|
||||
|
||||
Structure Type : 05 [SSLBIS]
|
||||
Reserved : 00
|
||||
Length : 20 <- 32d, length of record, including SSLB entries
|
||||
Data Type : 00 <- Latency
|
||||
Reserved : 000000
|
||||
Entry Base Unit : 00000000000000001000 <- Matches Entry Base Unit in HMAT SLLBIS
|
||||
|
||||
<- SSLB Entry 0
|
||||
Port X ID : 0100 <- First port, 0100h represents an upstream port
|
||||
Port Y ID : 0000 <- Second port, downstream port 0
|
||||
Latency : 0100 <- Port latency
|
||||
Reserved : 0000
|
||||
<- SSLB Entry 1
|
||||
Port X ID : 0100
|
||||
Port Y ID : 0001
|
||||
Latency : 0100
|
||||
Reserved : 0000
|
||||
|
||||
|
||||
Structure Type : 05 [SSLBIS]
|
||||
Reserved : 00
|
||||
Length : 18 <- 24d, length of record, including SSLB entry
|
||||
Data Type : 03 <- Bandwidth
|
||||
Reserved : 000000
|
||||
Entry Base Unit : 00000000000000001000 <- Matches Entry Base Unit in HMAT SLLBIS
|
||||
|
||||
<- SSLB Entry 0
|
||||
Port X ID : 0100 <- First port, 0100h represents an upstream port
|
||||
Port Y ID : FFFF <- Second port, FFFFh indicates any port
|
||||
Bandwidth : 1200 <- Port bandwidth
|
||||
Reserved : 0000
|
||||
|
||||
The CXL driver uses a combination of CDAT, HMAT, SRAT, and other data to
|
||||
generate "whole path performance" data for a CXL device.
|
||||
13
Documentation/driver-api/cxl/platform/example-configs.rst
Normal file
13
Documentation/driver-api/cxl/platform/example-configs.rst
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Example Platform Configurations
|
||||
###############################
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Contents
|
||||
|
||||
example-configurations/one-dev-per-hb.rst
|
||||
example-configurations/multi-dev-per-hb.rst
|
||||
example-configurations/hb-interleave.rst
|
||||
example-configurations/flexible.rst
|
||||
|
|
@ -0,0 +1,296 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=====================
|
||||
Flexible Presentation
|
||||
=====================
|
||||
This system has a single socket with two CXL host bridges. Each host bridge
|
||||
has two CXL memory expanders with 4GB of memory each (16GB total).
|
||||
|
||||
On this system, the platform designer wanted to provide the user flexibility
|
||||
to configure the memory devices in various interleave or NUMA node
|
||||
configurations. So they provided every combination.
|
||||
|
||||
Things to note:
|
||||
|
||||
* Cross-Bridge interleave is described in one CFMWS that covers all capacity.
|
||||
* One CFMWS is also described per-host bridge.
|
||||
* One CFMWS is also described per-device.
|
||||
* This SRAT describes one node for each of the above CFMWS.
|
||||
* The HMAT describes performance for each node in the SRAT.
|
||||
|
||||
:doc:`CEDT <../acpi/cedt>`::
|
||||
|
||||
Subtable Type : 00 [CXL Host Bridge Structure]
|
||||
Reserved : 00
|
||||
Length : 0020
|
||||
Associated host bridge : 00000007
|
||||
Specification version : 00000001
|
||||
Reserved : 00000000
|
||||
Register base : 0000010370400000
|
||||
Register length : 0000000000010000
|
||||
|
||||
Subtable Type : 00 [CXL Host Bridge Structure]
|
||||
Reserved : 00
|
||||
Length : 0020
|
||||
Associated host bridge : 00000006
|
||||
Specification version : 00000001
|
||||
Reserved : 00000000
|
||||
Register base : 0000010380800000
|
||||
Register length : 0000000000010000
|
||||
|
||||
Subtable Type : 01 [CXL Fixed Memory Window Structure]
|
||||
Reserved : 00
|
||||
Length : 002C
|
||||
Reserved : 00000000
|
||||
Window base address : 0000001000000000
|
||||
Window size : 0000000400000000
|
||||
Interleave Members (2^n) : 01
|
||||
Interleave Arithmetic : 00
|
||||
Reserved : 0000
|
||||
Granularity : 00000000
|
||||
Restrictions : 0006
|
||||
QtgId : 0001
|
||||
First Target : 00000007
|
||||
Second Target : 00000006
|
||||
|
||||
Subtable Type : 01 [CXL Fixed Memory Window Structure]
|
||||
Reserved : 00
|
||||
Length : 002C
|
||||
Reserved : 00000000
|
||||
Window base address : 0000002000000000
|
||||
Window size : 0000000200000000
|
||||
Interleave Members (2^n) : 00
|
||||
Interleave Arithmetic : 00
|
||||
Reserved : 0000
|
||||
Granularity : 00000000
|
||||
Restrictions : 0006
|
||||
QtgId : 0001
|
||||
First Target : 00000007
|
||||
|
||||
Subtable Type : 01 [CXL Fixed Memory Window Structure]
|
||||
Reserved : 00
|
||||
Length : 002C
|
||||
Reserved : 00000000
|
||||
Window base address : 0000002200000000
|
||||
Window size : 0000000200000000
|
||||
Interleave Members (2^n) : 00
|
||||
Interleave Arithmetic : 00
|
||||
Reserved : 0000
|
||||
Granularity : 00000000
|
||||
Restrictions : 0006
|
||||
QtgId : 0001
|
||||
First Target : 00000006
|
||||
|
||||
Subtable Type : 01 [CXL Fixed Memory Window Structure]
|
||||
Reserved : 00
|
||||
Length : 002C
|
||||
Reserved : 00000000
|
||||
Window base address : 0000003000000000
|
||||
Window size : 0000000100000000
|
||||
Interleave Members (2^n) : 00
|
||||
Interleave Arithmetic : 00
|
||||
Reserved : 0000
|
||||
Granularity : 00000000
|
||||
Restrictions : 0006
|
||||
QtgId : 0001
|
||||
First Target : 00000007
|
||||
|
||||
Subtable Type : 01 [CXL Fixed Memory Window Structure]
|
||||
Reserved : 00
|
||||
Length : 002C
|
||||
Reserved : 00000000
|
||||
Window base address : 0000003100000000
|
||||
Window size : 0000000100000000
|
||||
Interleave Members (2^n) : 00
|
||||
Interleave Arithmetic : 00
|
||||
Reserved : 0000
|
||||
Granularity : 00000000
|
||||
Restrictions : 0006
|
||||
QtgId : 0001
|
||||
First Target : 00000007
|
||||
|
||||
Subtable Type : 01 [CXL Fixed Memory Window Structure]
|
||||
Reserved : 00
|
||||
Length : 002C
|
||||
Reserved : 00000000
|
||||
Window base address : 0000003200000000
|
||||
Window size : 0000000100000000
|
||||
Interleave Members (2^n) : 00
|
||||
Interleave Arithmetic : 00
|
||||
Reserved : 0000
|
||||
Granularity : 00000000
|
||||
Restrictions : 0006
|
||||
QtgId : 0001
|
||||
First Target : 00000006
|
||||
|
||||
Subtable Type : 01 [CXL Fixed Memory Window Structure]
|
||||
Reserved : 00
|
||||
Length : 002C
|
||||
Reserved : 00000000
|
||||
Window base address : 0000003300000000
|
||||
Window size : 0000000100000000
|
||||
Interleave Members (2^n) : 00
|
||||
Interleave Arithmetic : 00
|
||||
Reserved : 0000
|
||||
Granularity : 00000000
|
||||
Restrictions : 0006
|
||||
QtgId : 0001
|
||||
First Target : 00000006
|
||||
|
||||
:doc:`SRAT <../acpi/srat>`::
|
||||
|
||||
Subtable Type : 01 [Memory Affinity]
|
||||
Length : 28
|
||||
Proximity Domain : 00000001
|
||||
Reserved1 : 0000
|
||||
Base Address : 0000001000000000
|
||||
Address Length : 0000000400000000
|
||||
Reserved2 : 00000000
|
||||
Flags (decoded below) : 0000000B
|
||||
Enabled : 1
|
||||
Hot Pluggable : 1
|
||||
Non-Volatile : 0
|
||||
|
||||
Subtable Type : 01 [Memory Affinity]
|
||||
Length : 28
|
||||
Proximity Domain : 00000002
|
||||
Reserved1 : 0000
|
||||
Base Address : 0000002000000000
|
||||
Address Length : 0000000200000000
|
||||
Reserved2 : 00000000
|
||||
Flags (decoded below) : 0000000B
|
||||
Enabled : 1
|
||||
Hot Pluggable : 1
|
||||
Non-Volatile : 0
|
||||
|
||||
Subtable Type : 01 [Memory Affinity]
|
||||
Length : 28
|
||||
Proximity Domain : 00000003
|
||||
Reserved1 : 0000
|
||||
Base Address : 0000002200000000
|
||||
Address Length : 0000000200000000
|
||||
Reserved2 : 00000000
|
||||
Flags (decoded below) : 0000000B
|
||||
Enabled : 1
|
||||
Hot Pluggable : 1
|
||||
Non-Volatile : 0
|
||||
|
||||
Subtable Type : 01 [Memory Affinity]
|
||||
Length : 28
|
||||
Proximity Domain : 00000004
|
||||
Reserved1 : 0000
|
||||
Base Address : 0000003000000000
|
||||
Address Length : 0000000100000000
|
||||
Reserved2 : 00000000
|
||||
Flags (decoded below) : 0000000B
|
||||
Enabled : 1
|
||||
Hot Pluggable : 1
|
||||
Non-Volatile : 0
|
||||
|
||||
Subtable Type : 01 [Memory Affinity]
|
||||
Length : 28
|
||||
Proximity Domain : 00000005
|
||||
Reserved1 : 0000
|
||||
Base Address : 0000003100000000
|
||||
Address Length : 0000000100000000
|
||||
Reserved2 : 00000000
|
||||
Flags (decoded below) : 0000000B
|
||||
Enabled : 1
|
||||
Hot Pluggable : 1
|
||||
Non-Volatile : 0
|
||||
|
||||
Subtable Type : 01 [Memory Affinity]
|
||||
Length : 28
|
||||
Proximity Domain : 00000006
|
||||
Reserved1 : 0000
|
||||
Base Address : 0000003200000000
|
||||
Address Length : 0000000100000000
|
||||
Reserved2 : 00000000
|
||||
Flags (decoded below) : 0000000B
|
||||
Enabled : 1
|
||||
Hot Pluggable : 1
|
||||
Non-Volatile : 0
|
||||
|
||||
Subtable Type : 01 [Memory Affinity]
|
||||
Length : 28
|
||||
Proximity Domain : 00000007
|
||||
Reserved1 : 0000
|
||||
Base Address : 0000003300000000
|
||||
Address Length : 0000000100000000
|
||||
Reserved2 : 00000000
|
||||
Flags (decoded below) : 0000000B
|
||||
Enabled : 1
|
||||
Hot Pluggable : 1
|
||||
Non-Volatile : 0
|
||||
|
||||
:doc:`HMAT <../acpi/hmat>`::
|
||||
|
||||
Structure Type : 0001 [SLLBI]
|
||||
Data Type : 00 [Latency]
|
||||
Target Proximity Domain List : 00000000
|
||||
Target Proximity Domain List : 00000001
|
||||
Target Proximity Domain List : 00000002
|
||||
Target Proximity Domain List : 00000003
|
||||
Target Proximity Domain List : 00000004
|
||||
Target Proximity Domain List : 00000005
|
||||
Target Proximity Domain List : 00000006
|
||||
Target Proximity Domain List : 00000007
|
||||
Entry : 0080
|
||||
Entry : 0100
|
||||
Entry : 0100
|
||||
Entry : 0100
|
||||
Entry : 0100
|
||||
Entry : 0100
|
||||
Entry : 0100
|
||||
Entry : 0100
|
||||
|
||||
Structure Type : 0001 [SLLBI]
|
||||
Data Type : 03 [Bandwidth]
|
||||
Target Proximity Domain List : 00000000
|
||||
Target Proximity Domain List : 00000001
|
||||
Target Proximity Domain List : 00000002
|
||||
Target Proximity Domain List : 00000003
|
||||
Target Proximity Domain List : 00000004
|
||||
Target Proximity Domain List : 00000005
|
||||
Target Proximity Domain List : 00000006
|
||||
Target Proximity Domain List : 00000007
|
||||
Entry : 1200
|
||||
Entry : 0400
|
||||
Entry : 0200
|
||||
Entry : 0200
|
||||
Entry : 0100
|
||||
Entry : 0100
|
||||
Entry : 0100
|
||||
Entry : 0100
|
||||
|
||||
:doc:`SLIT <../acpi/slit>`::
|
||||
|
||||
Signature : "SLIT" [System Locality Information Table]
|
||||
Localities : 0000000000000008
|
||||
Locality 0 : 10 20 20 20 20 20 20 20
|
||||
Locality 1 : FF 0A FF FF FF FF FF FF
|
||||
Locality 2 : FF FF 0A FF FF FF FF FF
|
||||
Locality 3 : FF FF FF 0A FF FF FF FF
|
||||
Locality 4 : FF FF FF FF 0A FF FF FF
|
||||
Locality 5 : FF FF FF FF FF 0A FF FF
|
||||
Locality 6 : FF FF FF FF FF FF 0A FF
|
||||
Locality 7 : FF FF FF FF FF FF FF 0A
|
||||
|
||||
:doc:`DSDT <../acpi/dsdt>`::
|
||||
|
||||
Scope (_SB)
|
||||
{
|
||||
Device (S0D0)
|
||||
{
|
||||
Name (_HID, "ACPI0016" /* Compute Express Link Host Bridge */) // _HID: Hardware ID
|
||||
...
|
||||
Name (_UID, 0x07) // _UID: Unique ID
|
||||
}
|
||||
...
|
||||
Device (S0D5)
|
||||
{
|
||||
Name (_HID, "ACPI0016" /* Compute Express Link Host Bridge */) // _HID: Hardware ID
|
||||
...
|
||||
Name (_UID, 0x06) // _UID: Unique ID
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,107 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
============================
|
||||
Cross-Host-Bridge Interleave
|
||||
============================
|
||||
This system has a single socket with two CXL host bridges. Each host bridge
|
||||
has a single CXL memory expander with 4GB of memory.
|
||||
|
||||
Things to note:
|
||||
|
||||
* Cross-Bridge interleave is described.
|
||||
* The expanders are described by a single CFMWS.
|
||||
* This SRAT describes one node for both host bridges.
|
||||
* The HMAT describes a single node's performance.
|
||||
|
||||
:doc:`CEDT <../acpi/cedt>`::
|
||||
|
||||
Subtable Type : 00 [CXL Host Bridge Structure]
|
||||
Reserved : 00
|
||||
Length : 0020
|
||||
Associated host bridge : 00000007
|
||||
Specification version : 00000001
|
||||
Reserved : 00000000
|
||||
Register base : 0000010370400000
|
||||
Register length : 0000000000010000
|
||||
|
||||
Subtable Type : 00 [CXL Host Bridge Structure]
|
||||
Reserved : 00
|
||||
Length : 0020
|
||||
Associated host bridge : 00000006
|
||||
Specification version : 00000001
|
||||
Reserved : 00000000
|
||||
Register base : 0000010380800000
|
||||
Register length : 0000000000010000
|
||||
|
||||
Subtable Type : 01 [CXL Fixed Memory Window Structure]
|
||||
Reserved : 00
|
||||
Length : 002C
|
||||
Reserved : 00000000
|
||||
Window base address : 0000001000000000
|
||||
Window size : 0000000200000000
|
||||
Interleave Members (2^n) : 01
|
||||
Interleave Arithmetic : 00
|
||||
Reserved : 0000
|
||||
Granularity : 00000000
|
||||
Restrictions : 0006
|
||||
QtgId : 0001
|
||||
First Target : 00000007
|
||||
Second Target : 00000006
|
||||
|
||||
:doc:`SRAT <../acpi/srat>`::
|
||||
|
||||
Subtable Type : 01 [Memory Affinity]
|
||||
Length : 28
|
||||
Proximity Domain : 00000001
|
||||
Reserved1 : 0000
|
||||
Base Address : 0000001000000000
|
||||
Address Length : 0000000200000000
|
||||
Reserved2 : 00000000
|
||||
Flags (decoded below) : 0000000B
|
||||
Enabled : 1
|
||||
Hot Pluggable : 1
|
||||
Non-Volatile : 0
|
||||
|
||||
:doc:`HMAT <../acpi/hmat>`::
|
||||
|
||||
Structure Type : 0001 [SLLBI]
|
||||
Data Type : 00 [Latency]
|
||||
Target Proximity Domain List : 00000000
|
||||
Target Proximity Domain List : 00000001
|
||||
Target Proximity Domain List : 00000002
|
||||
Entry : 0080
|
||||
Entry : 0100
|
||||
|
||||
Structure Type : 0001 [SLLBI]
|
||||
Data Type : 03 [Bandwidth]
|
||||
Target Proximity Domain List : 00000000
|
||||
Target Proximity Domain List : 00000001
|
||||
Target Proximity Domain List : 00000002
|
||||
Entry : 1200
|
||||
Entry : 0400
|
||||
|
||||
:doc:`SLIT <../acpi/slit>`::
|
||||
|
||||
Signature : "SLIT" [System Locality Information Table]
|
||||
Localities : 0000000000000002
|
||||
Locality 0 : 10 20
|
||||
Locality 1 : FF 0A
|
||||
|
||||
:doc:`DSDT <../acpi/dsdt>`::
|
||||
|
||||
Scope (_SB)
|
||||
{
|
||||
Device (S0D0)
|
||||
{
|
||||
Name (_HID, "ACPI0016" /* Compute Express Link Host Bridge */) // _HID: Hardware ID
|
||||
...
|
||||
Name (_UID, 0x07) // _UID: Unique ID
|
||||
}
|
||||
...
|
||||
Device (S0D5)
|
||||
{
|
||||
Name (_HID, "ACPI0016" /* Compute Express Link Host Bridge */) // _HID: Hardware ID
|
||||
...
|
||||
Name (_UID, 0x06) // _UID: Unique ID
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,90 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
================================
|
||||
Multiple Devices per Host Bridge
|
||||
================================
|
||||
|
||||
In this example system we will have a single socket and one CXL host bridge.
|
||||
There are two CXL memory expanders with 4GB attached to the host bridge.
|
||||
|
||||
Things to note:
|
||||
|
||||
* Intra-Bridge interleave is not described here.
|
||||
* The expanders are described by a single CEDT/CFMWS.
|
||||
* This CEDT/SRAT describes one node for both devices.
|
||||
* There is only one proximity domain in the HMAT for both devices.
|
||||
|
||||
:doc:`CEDT <../acpi/cedt>`::
|
||||
|
||||
Subtable Type : 00 [CXL Host Bridge Structure]
|
||||
Reserved : 00
|
||||
Length : 0020
|
||||
Associated host bridge : 00000007
|
||||
Specification version : 00000001
|
||||
Reserved : 00000000
|
||||
Register base : 0000010370400000
|
||||
Register length : 0000000000010000
|
||||
|
||||
Subtable Type : 01 [CXL Fixed Memory Window Structure]
|
||||
Reserved : 00
|
||||
Length : 002C
|
||||
Reserved : 00000000
|
||||
Window base address : 0000001000000000
|
||||
Window size : 0000000200000000
|
||||
Interleave Members (2^n) : 00
|
||||
Interleave Arithmetic : 00
|
||||
Reserved : 0000
|
||||
Granularity : 00000000
|
||||
Restrictions : 0006
|
||||
QtgId : 0001
|
||||
First Target : 00000007
|
||||
|
||||
:doc:`SRAT <../acpi/srat>`::
|
||||
|
||||
Subtable Type : 01 [Memory Affinity]
|
||||
Length : 28
|
||||
Proximity Domain : 00000001
|
||||
Reserved1 : 0000
|
||||
Base Address : 0000001000000000
|
||||
Address Length : 0000000200000000
|
||||
Reserved2 : 00000000
|
||||
Flags (decoded below) : 0000000B
|
||||
Enabled : 1
|
||||
Hot Pluggable : 1
|
||||
Non-Volatile : 0
|
||||
|
||||
:doc:`HMAT <../acpi/hmat>`::
|
||||
|
||||
Structure Type : 0001 [SLLBI]
|
||||
Data Type : 00 [Latency]
|
||||
Target Proximity Domain List : 00000000
|
||||
Target Proximity Domain List : 00000001
|
||||
Entry : 0080
|
||||
Entry : 0100
|
||||
|
||||
Structure Type : 0001 [SLLBI]
|
||||
Data Type : 03 [Bandwidth]
|
||||
Target Proximity Domain List : 00000000
|
||||
Target Proximity Domain List : 00000001
|
||||
Entry : 1200
|
||||
Entry : 0200
|
||||
|
||||
:doc:`SLIT <../acpi/slit>`::
|
||||
|
||||
Signature : "SLIT" [System Locality Information Table]
|
||||
Localities : 0000000000000002
|
||||
Locality 0 : 10 20
|
||||
Locality 1 : FF 0A
|
||||
|
||||
:doc:`DSDT <../acpi/dsdt>`::
|
||||
|
||||
Scope (_SB)
|
||||
{
|
||||
Device (S0D0)
|
||||
{
|
||||
Name (_HID, "ACPI0016" /* Compute Express Link Host Bridge */) // _HID: Hardware ID
|
||||
...
|
||||
Name (_UID, 0x07) // _UID: Unique ID
|
||||
}
|
||||
...
|
||||
}
|
||||
|
|
@ -0,0 +1,136 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==========================
|
||||
One Device per Host Bridge
|
||||
==========================
|
||||
|
||||
This system has a single socket with two CXL host bridges. Each host bridge
|
||||
has a single CXL memory expander with 4GB of memory.
|
||||
|
||||
Things to note:
|
||||
|
||||
* Cross-Bridge interleave is not being used.
|
||||
* The expanders are in two separate but adjacent memory regions.
|
||||
* This CEDT/SRAT describes one node per device
|
||||
* The expanders have the same performance and will be in the same memory tier.
|
||||
|
||||
:doc:`CEDT <../acpi/cedt>`::
|
||||
|
||||
Subtable Type : 00 [CXL Host Bridge Structure]
|
||||
Reserved : 00
|
||||
Length : 0020
|
||||
Associated host bridge : 00000007
|
||||
Specification version : 00000001
|
||||
Reserved : 00000000
|
||||
Register base : 0000010370400000
|
||||
Register length : 0000000000010000
|
||||
|
||||
Subtable Type : 00 [CXL Host Bridge Structure]
|
||||
Reserved : 00
|
||||
Length : 0020
|
||||
Associated host bridge : 00000006
|
||||
Specification version : 00000001
|
||||
Reserved : 00000000
|
||||
Register base : 0000010380800000
|
||||
Register length : 0000000000010000
|
||||
|
||||
Subtable Type : 01 [CXL Fixed Memory Window Structure]
|
||||
Reserved : 00
|
||||
Length : 002C
|
||||
Reserved : 00000000
|
||||
Window base address : 0000001000000000
|
||||
Window size : 0000000100000000
|
||||
Interleave Members (2^n) : 00
|
||||
Interleave Arithmetic : 00
|
||||
Reserved : 0000
|
||||
Granularity : 00000000
|
||||
Restrictions : 0006
|
||||
QtgId : 0001
|
||||
First Target : 00000007
|
||||
|
||||
Subtable Type : 01 [CXL Fixed Memory Window Structure]
|
||||
Reserved : 00
|
||||
Length : 002C
|
||||
Reserved : 00000000
|
||||
Window base address : 0000001100000000
|
||||
Window size : 0000000100000000
|
||||
Interleave Members (2^n) : 00
|
||||
Interleave Arithmetic : 00
|
||||
Reserved : 0000
|
||||
Granularity : 00000000
|
||||
Restrictions : 0006
|
||||
QtgId : 0001
|
||||
First Target : 00000006
|
||||
|
||||
:doc:`SRAT <../acpi/srat>`::
|
||||
|
||||
Subtable Type : 01 [Memory Affinity]
|
||||
Length : 28
|
||||
Proximity Domain : 00000001
|
||||
Reserved1 : 0000
|
||||
Base Address : 0000001000000000
|
||||
Address Length : 0000000100000000
|
||||
Reserved2 : 00000000
|
||||
Flags (decoded below) : 0000000B
|
||||
Enabled : 1
|
||||
Hot Pluggable : 1
|
||||
Non-Volatile : 0
|
||||
|
||||
Subtable Type : 01 [Memory Affinity]
|
||||
Length : 28
|
||||
Proximity Domain : 00000002
|
||||
Reserved1 : 0000
|
||||
Base Address : 0000001100000000
|
||||
Address Length : 0000000100000000
|
||||
Reserved2 : 00000000
|
||||
Flags (decoded below) : 0000000B
|
||||
Enabled : 1
|
||||
Hot Pluggable : 1
|
||||
Non-Volatile : 0
|
||||
|
||||
:doc:`HMAT <../acpi/hmat>`::
|
||||
|
||||
Structure Type : 0001 [SLLBI]
|
||||
Data Type : 00 [Latency]
|
||||
Target Proximity Domain List : 00000000
|
||||
Target Proximity Domain List : 00000001
|
||||
Target Proximity Domain List : 00000002
|
||||
Entry : 0080
|
||||
Entry : 0100
|
||||
Entry : 0100
|
||||
|
||||
Structure Type : 0001 [SLLBI]
|
||||
Data Type : 03 [Bandwidth]
|
||||
Target Proximity Domain List : 00000000
|
||||
Target Proximity Domain List : 00000001
|
||||
Target Proximity Domain List : 00000002
|
||||
Entry : 1200
|
||||
Entry : 0200
|
||||
Entry : 0200
|
||||
|
||||
:doc:`SLIT <../acpi/slit>`::
|
||||
|
||||
Signature : "SLIT" [System Locality Information Table]
|
||||
Localities : 0000000000000003
|
||||
Locality 0 : 10 20 20
|
||||
Locality 1 : FF 0A FF
|
||||
Locality 2 : FF FF 0A
|
||||
|
||||
:doc:`DSDT <../acpi/dsdt>`::
|
||||
|
||||
Scope (_SB)
|
||||
{
|
||||
Device (S0D0)
|
||||
{
|
||||
Name (_HID, "ACPI0016" /* Compute Express Link Host Bridge */) // _HID: Hardware ID
|
||||
...
|
||||
Name (_UID, 0x07) // _UID: Unique ID
|
||||
}
|
||||
...
|
||||
Device (S0D5)
|
||||
{
|
||||
Name (_HID, "ACPI0016" /* Compute Express Link Host Bridge */) // _HID: Hardware ID
|
||||
...
|
||||
Name (_UID, 0x06) // _UID: Unique ID
|
||||
}
|
||||
}
|
||||
|
|
@ -1,9 +1,9 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
.. include:: <isonum.txt>
|
||||
|
||||
===================================
|
||||
Compute Express Link Memory Devices
|
||||
===================================
|
||||
===============================================
|
||||
Compute Express Link Driver Theory of Operation
|
||||
===============================================
|
||||
|
||||
A Compute Express Link Memory Device is a CXL component that implements the
|
||||
CXL.mem protocol. It contains some amount of volatile memory, persistent memory,
|
||||
|
|
@ -14,8 +14,8 @@ that optionally define a device's contribution to an interleaved address
|
|||
range across multiple devices underneath a host-bridge or interleaved
|
||||
across host-bridges.
|
||||
|
||||
CXL Bus: Theory of Operation
|
||||
============================
|
||||
The CXL Bus
|
||||
===========
|
||||
Similar to how a RAID driver takes disk objects and assembles them into a new
|
||||
logical device, the CXL subsystem is tasked to take PCIe and ACPI objects and
|
||||
assemble them into a CXL.mem decode topology. The need for runtime configuration
|
||||
|
|
@ -347,6 +347,9 @@ CXL Core
|
|||
.. kernel-doc:: drivers/cxl/cxl.h
|
||||
:internal:
|
||||
|
||||
.. kernel-doc:: drivers/cxl/acpi.c
|
||||
:identifiers: add_cxl_resources
|
||||
|
||||
.. kernel-doc:: drivers/cxl/core/hdm.c
|
||||
:doc: cxl core hdm
|
||||
|
||||
|
|
@ -371,12 +374,26 @@ CXL Core
|
|||
.. kernel-doc:: drivers/cxl/core/pmem.c
|
||||
:doc: cxl pmem
|
||||
|
||||
.. kernel-doc:: drivers/cxl/core/pmem.c
|
||||
:identifiers:
|
||||
|
||||
.. kernel-doc:: drivers/cxl/core/regs.c
|
||||
:doc: cxl registers
|
||||
|
||||
.. kernel-doc:: drivers/cxl/core/regs.c
|
||||
:identifiers:
|
||||
|
||||
.. kernel-doc:: drivers/cxl/core/mbox.c
|
||||
:doc: cxl mbox
|
||||
|
||||
.. kernel-doc:: drivers/cxl/core/mbox.c
|
||||
:identifiers:
|
||||
|
||||
.. kernel-doc:: drivers/cxl/core/features.c
|
||||
:doc: cxl features
|
||||
|
||||
See :c:func:`devm_cxl_setup_features` for API details.
|
||||
|
||||
CXL Regions
|
||||
-----------
|
||||
.. kernel-doc:: drivers/cxl/core/region.c
|
||||
|
|
@ -119,3 +119,34 @@ sysfs
|
|||
|
||||
Sysfs files are documented in
|
||||
`Documentation/ABI/testing/sysfs-edac-memory-repair`.
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
The memory repair usage takes the form shown in this example:
|
||||
|
||||
1. CXL memory sparing
|
||||
|
||||
Memory sparing is defined as a repair function that replaces a portion of
|
||||
memory with a portion of functional memory at that same DPA. The subclasses
|
||||
for this operation, cacheline/row/bank/rank sparing, vary in terms of the
|
||||
scope of the sparing being performed.
|
||||
|
||||
Memory sparing maintenance operations may be supported by CXL devices that
|
||||
implement CXL.mem protocol. A sparing maintenance operation requests the
|
||||
CXL device to perform a repair operation on its media. For example, a CXL
|
||||
device with DRAM components that support memory sparing features may
|
||||
implement sparing maintenance operations.
|
||||
|
||||
2. CXL memory Soft Post Package Repair (sPPR)
|
||||
|
||||
Post Package Repair (PPR) maintenance operations may be supported by CXL
|
||||
devices that implement CXL.mem protocol. A PPR maintenance operation
|
||||
requests the CXL device to perform a repair operation on its media.
|
||||
For example, a CXL device with DRAM components that support PPR features
|
||||
may implement PPR Maintenance operations. Soft PPR (sPPR) is a temporary
|
||||
row repair. Soft PPR may be faster, but the repair is lost with a power
|
||||
cycle.
|
||||
|
||||
Sysfs files for memory repair are documented in
|
||||
`Documentation/ABI/testing/sysfs-edac-memory-repair`
|
||||
|
|
|
|||
|
|
@ -264,3 +264,79 @@ Sysfs files are documented in
|
|||
`Documentation/ABI/testing/sysfs-edac-scrub`
|
||||
|
||||
`Documentation/ABI/testing/sysfs-edac-ecs`
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
The usage takes the form shown in these examples:
|
||||
|
||||
1. CXL memory Patrol Scrub
|
||||
|
||||
The following use cases have been identified for increasing the scrub rate.
|
||||
|
||||
- Scrubbing is needed at device granularity because a device is showing
|
||||
unexpectedly high errors.
|
||||
|
||||
- Scrubbing may apply to memory that isn't online at all yet. Likely this
|
||||
is a system wide default setting on boot.
|
||||
|
||||
- Scrubbing at a higher rate because the monitor software has determined that
|
||||
more reliability is necessary for a particular data set. This is called
|
||||
Differentiated Reliability.
|
||||
|
||||
1.1. Device based scrubbing
|
||||
|
||||
CXL memory is exposed to memory management subsystem and ultimately userspace
|
||||
via CXL devices. Device-based scrubbing is used for the first use case
|
||||
described in "Section 1 CXL Memory Patrol Scrub".
|
||||
|
||||
When combining control via the device interfaces and region interfaces,
|
||||
see "Section 1.2 Region based scrubbing".
|
||||
|
||||
Sysfs files for scrubbing are documented in
|
||||
`Documentation/ABI/testing/sysfs-edac-scrub`
|
||||
|
||||
1.2. Region based scrubbing
|
||||
|
||||
CXL memory is exposed to memory management subsystem and ultimately userspace
|
||||
via CXL regions. CXL Regions represent mapped memory capacity in system
|
||||
physical address space. These can incorporate one or more parts of multiple CXL
|
||||
memory devices with traffic interleaved across them. The user may want to control
|
||||
the scrub rate via this more abstract region instead of having to figure out the
|
||||
constituent devices and program them separately. The scrub rate for each device
|
||||
covers the whole device. Thus if multiple regions use parts of that device then
|
||||
requests for scrubbing of other regions may result in a higher scrub rate than
|
||||
requested for this specific region.
|
||||
|
||||
Region-based scrubbing is used for the third use case described in
|
||||
"Section 1 CXL Memory Patrol Scrub".
|
||||
|
||||
Userspace must follow the set of rules below on how to set the scrub rates for any
|
||||
mixture of requirements.
|
||||
|
||||
1. Take each region in turn from lowest desired scrub rate to highest and set
|
||||
their scrub rates. Later regions may override the scrub rate on individual
|
||||
devices (and hence potentially whole regions).
|
||||
|
||||
2. Take each device for which enhanced scrubbing is required (higher rate) and
|
||||
set those scrub rates. This will override the scrub rates of individual devices,
|
||||
setting them to the maximum rate required for any of the regions they help back,
|
||||
unless a specific rate is already defined.
|
||||
|
||||
Sysfs files for scrubbing are documented in
|
||||
`Documentation/ABI/testing/sysfs-edac-scrub`
|
||||
|
||||
2. CXL memory Error Check Scrub (ECS)
|
||||
|
||||
The Error Check Scrub (ECS) feature enables a memory device to perform error
|
||||
checking and correction (ECC) and count single-bit errors. The associated
|
||||
memory controller sets the ECS mode with a trigger sent to the memory
|
||||
device. CXL ECS control allows the host, thus the userspace, to change the
|
||||
attributes for error count mode, threshold number of errors per segment
|
||||
(indicating how many segments have at least that number of errors) for
|
||||
reporting errors, and reset the ECS counter. Thus the responsibility for
|
||||
initiating Error Check Scrub on a memory device may lie with the memory
|
||||
controller or platform when unexpectedly high error rates are detected.
|
||||
|
||||
Sysfs files for scrubbing are documented in
|
||||
`Documentation/ABI/testing/sysfs-edac-ecs`
|
||||
|
|
|
|||
|
|
@ -114,6 +114,77 @@ config CXL_FEATURES
|
|||
|
||||
If unsure say 'n'
|
||||
|
||||
config CXL_EDAC_MEM_FEATURES
|
||||
bool "CXL: EDAC Memory Features"
|
||||
depends on EXPERT
|
||||
depends on CXL_MEM
|
||||
depends on CXL_FEATURES
|
||||
depends on EDAC >= CXL_BUS
|
||||
help
|
||||
The CXL EDAC memory feature is optional and allows host to
|
||||
control the EDAC memory features configurations of CXL memory
|
||||
expander devices.
|
||||
|
||||
Say 'y' if you have an expert need to change default settings
|
||||
of a memory RAS feature established by the platform/device.
|
||||
Otherwise say 'n'.
|
||||
|
||||
config CXL_EDAC_SCRUB
|
||||
bool "Enable CXL Patrol Scrub Control (Patrol Read)"
|
||||
depends on CXL_EDAC_MEM_FEATURES
|
||||
depends on EDAC_SCRUB
|
||||
help
|
||||
The CXL EDAC scrub control is optional and allows host to
|
||||
control the scrub feature configurations of CXL memory expander
|
||||
devices.
|
||||
|
||||
When enabled 'cxl_mem' and 'cxl_region' EDAC devices are
|
||||
published with memory scrub control attributes as described by
|
||||
Documentation/ABI/testing/sysfs-edac-scrub.
|
||||
|
||||
Say 'y' if you have an expert need to change default settings
|
||||
of a memory scrub feature established by the platform/device
|
||||
(e.g. scrub rates for the patrol scrub feature).
|
||||
Otherwise say 'n'.
|
||||
|
||||
config CXL_EDAC_ECS
|
||||
bool "Enable CXL Error Check Scrub (Repair)"
|
||||
depends on CXL_EDAC_MEM_FEATURES
|
||||
depends on EDAC_ECS
|
||||
help
|
||||
The CXL EDAC ECS control is optional and allows host to
|
||||
control the ECS feature configurations of CXL memory expander
|
||||
devices.
|
||||
|
||||
When enabled 'cxl_mem' EDAC devices are published with memory
|
||||
ECS control attributes as described by
|
||||
Documentation/ABI/testing/sysfs-edac-ecs.
|
||||
|
||||
Say 'y' if you have an expert need to change default settings
|
||||
of a memory ECS feature established by the platform/device.
|
||||
Otherwise say 'n'.
|
||||
|
||||
config CXL_EDAC_MEM_REPAIR
|
||||
bool "Enable CXL Memory Repair"
|
||||
depends on CXL_EDAC_MEM_FEATURES
|
||||
depends on EDAC_MEM_REPAIR
|
||||
help
|
||||
The CXL EDAC memory repair control is optional and allows host
|
||||
to control the memory repair features (e.g. sparing, PPR)
|
||||
configurations of CXL memory expander devices.
|
||||
|
||||
When enabled, the memory repair feature requires an additional
|
||||
memory of approximately 43KB to store CXL DRAM and CXL general
|
||||
media event records.
|
||||
|
||||
When enabled 'cxl_mem' EDAC devices are published with memory
|
||||
repair control attributes as described by
|
||||
Documentation/ABI/testing/sysfs-edac-memory-repair.
|
||||
|
||||
Say 'y' if you have an expert need to change default settings
|
||||
of a memory repair feature established by the platform/device.
|
||||
Otherwise say 'n'.
|
||||
|
||||
config CXL_PORT
|
||||
default CXL_BUS
|
||||
tristate
|
||||
|
|
|
|||
|
|
@ -11,8 +11,6 @@
|
|||
#include "cxlpci.h"
|
||||
#include "cxl.h"
|
||||
|
||||
#define CXL_RCRB_SIZE SZ_8K
|
||||
|
||||
struct cxl_cxims_data {
|
||||
int nr_maps;
|
||||
u64 xormaps[] __counted_by(nr_maps);
|
||||
|
|
@ -421,7 +419,15 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws,
|
|||
rc = cxl_decoder_add(cxld, target_map);
|
||||
if (rc)
|
||||
return rc;
|
||||
return cxl_root_decoder_autoremove(dev, no_free_ptr(cxlrd));
|
||||
|
||||
rc = cxl_root_decoder_autoremove(dev, no_free_ptr(cxlrd));
|
||||
if (rc)
|
||||
return rc;
|
||||
|
||||
dev_dbg(root_port->dev.parent, "%s added to %s\n",
|
||||
dev_name(&cxld->dev), dev_name(&root_port->dev));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
|
||||
|
|
@ -479,7 +485,11 @@ static int cxl_get_chbs_iter(union acpi_subtable_headers *header, void *arg,
|
|||
chbs = (struct acpi_cedt_chbs *) header;
|
||||
|
||||
if (chbs->cxl_version == ACPI_CEDT_CHBS_VERSION_CXL11 &&
|
||||
chbs->length != CXL_RCRB_SIZE)
|
||||
chbs->length != ACPI_CEDT_CHBS_LENGTH_CXL11)
|
||||
return 0;
|
||||
|
||||
if (chbs->cxl_version == ACPI_CEDT_CHBS_VERSION_CXL20 &&
|
||||
chbs->length != ACPI_CEDT_CHBS_LENGTH_CXL20)
|
||||
return 0;
|
||||
|
||||
if (!chbs->base)
|
||||
|
|
@ -739,10 +749,10 @@ static void remove_cxl_resources(void *data)
|
|||
* expanding its boundaries to ensure that any conflicting resources become
|
||||
* children. If a window is expanded it may then conflict with another window
|
||||
* entry and require the window to be truncated or trimmed. Consider this
|
||||
* situation:
|
||||
* situation::
|
||||
*
|
||||
* |-- "CXL Window 0" --||----- "CXL Window 1" -----|
|
||||
* |--------------- "System RAM" -------------|
|
||||
* |-- "CXL Window 0" --||----- "CXL Window 1" -----|
|
||||
* |--------------- "System RAM" -------------|
|
||||
*
|
||||
* ...where platform firmware has established a System RAM resource across 2
|
||||
* windows, but has left some portion of window 1 for dynamic CXL region
|
||||
|
|
|
|||
|
|
@ -20,3 +20,4 @@ cxl_core-$(CONFIG_TRACING) += trace.o
|
|||
cxl_core-$(CONFIG_CXL_REGION) += region.o
|
||||
cxl_core-$(CONFIG_CXL_MCE) += mce.o
|
||||
cxl_core-$(CONFIG_CXL_FEATURES) += features.o
|
||||
cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ static u32 cdat_normalize(u16 entry, u64 base, u8 type)
|
|||
*/
|
||||
if (entry == 0xffff || !entry)
|
||||
return 0;
|
||||
else if (base > (UINT_MAX / (entry)))
|
||||
if (base > (UINT_MAX / (entry)))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
|
|
|
|||
|
|
@ -76,7 +76,7 @@ void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr,
|
|||
struct dentry *cxl_debugfs_create_dir(const char *dir);
|
||||
int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled,
|
||||
enum cxl_partition_mode mode);
|
||||
int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size);
|
||||
int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size);
|
||||
int cxl_dpa_free(struct cxl_endpoint_decoder *cxled);
|
||||
resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled);
|
||||
resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled);
|
||||
|
|
@ -124,6 +124,8 @@ int cxl_acpi_get_extended_linear_cache_size(struct resource *backing_res,
|
|||
int nid, resource_size_t *size);
|
||||
|
||||
#ifdef CONFIG_CXL_FEATURES
|
||||
struct cxl_feat_entry *
|
||||
cxl_feature_info(struct cxl_features_state *cxlfs, const uuid_t *uuid);
|
||||
size_t cxl_get_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid,
|
||||
enum cxl_get_feat_selection selection,
|
||||
void *feat_out, size_t feat_out_size, u16 offset,
|
||||
|
|
|
|||
2102
drivers/cxl/core/edac.c
Normal file
2102
drivers/cxl/core/edac.c
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -9,6 +9,16 @@
|
|||
#include "core.h"
|
||||
#include "cxlmem.h"
|
||||
|
||||
/**
|
||||
* DOC: cxl features
|
||||
*
|
||||
* CXL Features:
|
||||
* A CXL device that includes a mailbox supports commands that allow
|
||||
* listing, getting, and setting of optionally defined features such
|
||||
* as memory sparing or post package repair. Vendors may define custom
|
||||
* features for the device.
|
||||
*/
|
||||
|
||||
/* All the features below are exclusive to the kernel */
|
||||
static const uuid_t cxl_exclusive_feats[] = {
|
||||
CXL_FEAT_PATROL_SCRUB_UUID,
|
||||
|
|
@ -36,7 +46,7 @@ static bool is_cxl_feature_exclusive(struct cxl_feat_entry *entry)
|
|||
return is_cxl_feature_exclusive_by_uuid(&entry->uuid);
|
||||
}
|
||||
|
||||
inline struct cxl_features_state *to_cxlfs(struct cxl_dev_state *cxlds)
|
||||
struct cxl_features_state *to_cxlfs(struct cxl_dev_state *cxlds)
|
||||
{
|
||||
return cxlds->cxlfs;
|
||||
}
|
||||
|
|
@ -355,17 +365,11 @@ static void cxlctl_close_uctx(struct fwctl_uctx *uctx)
|
|||
{
|
||||
}
|
||||
|
||||
static struct cxl_feat_entry *
|
||||
get_support_feature_info(struct cxl_features_state *cxlfs,
|
||||
const struct fwctl_rpc_cxl *rpc_in)
|
||||
struct cxl_feat_entry *
|
||||
cxl_feature_info(struct cxl_features_state *cxlfs,
|
||||
const uuid_t *uuid)
|
||||
{
|
||||
struct cxl_feat_entry *feat;
|
||||
const uuid_t *uuid;
|
||||
|
||||
if (rpc_in->op_size < sizeof(uuid))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
uuid = &rpc_in->set_feat_in.uuid;
|
||||
|
||||
for (int i = 0; i < cxlfs->entries->num_features; i++) {
|
||||
feat = &cxlfs->entries->ent[i];
|
||||
|
|
@ -416,14 +420,6 @@ static void *cxlctl_get_supported_features(struct cxl_features_state *cxlfs,
|
|||
|
||||
rpc_out->size = struct_size(feat_out, ents, requested);
|
||||
feat_out = &rpc_out->get_sup_feats_out;
|
||||
if (requested == 0) {
|
||||
feat_out->num_entries = cpu_to_le16(requested);
|
||||
feat_out->supported_feats =
|
||||
cpu_to_le16(cxlfs->entries->num_features);
|
||||
rpc_out->retval = CXL_MBOX_CMD_RC_SUCCESS;
|
||||
*out_len = out_size;
|
||||
return no_free_ptr(rpc_out);
|
||||
}
|
||||
|
||||
for (i = start, pos = &feat_out->ents[0];
|
||||
i < cxlfs->entries->num_features; i++, pos++) {
|
||||
|
|
@ -547,7 +543,10 @@ static bool cxlctl_validate_set_features(struct cxl_features_state *cxlfs,
|
|||
struct cxl_feat_entry *feat;
|
||||
u32 flags;
|
||||
|
||||
feat = get_support_feature_info(cxlfs, rpc_in);
|
||||
if (rpc_in->op_size < sizeof(uuid_t))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
feat = cxl_feature_info(cxlfs, &rpc_in->set_feat_in.uuid);
|
||||
if (IS_ERR(feat))
|
||||
return false;
|
||||
|
||||
|
|
@ -614,11 +613,7 @@ static bool cxlctl_validate_hw_command(struct cxl_features_state *cxlfs,
|
|||
switch (opcode) {
|
||||
case CXL_MBOX_OP_GET_SUPPORTED_FEATURES:
|
||||
case CXL_MBOX_OP_GET_FEATURE:
|
||||
if (cxl_mbox->feat_cap < CXL_FEATURES_RO)
|
||||
return false;
|
||||
if (scope >= FWCTL_RPC_CONFIGURATION)
|
||||
return true;
|
||||
return false;
|
||||
return cxl_mbox->feat_cap >= CXL_FEATURES_RO;
|
||||
case CXL_MBOX_OP_SET_FEATURE:
|
||||
if (cxl_mbox->feat_cap < CXL_FEATURES_RW)
|
||||
return false;
|
||||
|
|
|
|||
|
|
@ -34,7 +34,8 @@ static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld,
|
|||
if (rc)
|
||||
return rc;
|
||||
|
||||
dev_dbg(&cxld->dev, "Added to port %s\n", dev_name(&port->dev));
|
||||
dev_dbg(port->uport_dev, "%s added to %s\n",
|
||||
dev_name(&cxld->dev), dev_name(&port->dev));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -603,7 +604,7 @@ int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int __cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
|
||||
static int __cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size)
|
||||
{
|
||||
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
|
||||
struct cxl_dev_state *cxlds = cxlmd->cxlds;
|
||||
|
|
@ -666,15 +667,15 @@ static int __cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long lon
|
|||
skip = res->start - skip_start;
|
||||
|
||||
if (size > avail) {
|
||||
dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size,
|
||||
res->name, &avail);
|
||||
dev_dbg(dev, "%llu exceeds available %s capacity: %llu\n", size,
|
||||
res->name, (u64)avail);
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
return __cxl_dpa_reserve(cxled, start, size, skip);
|
||||
}
|
||||
|
||||
int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
|
||||
int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size)
|
||||
{
|
||||
struct cxl_port *port = cxled_to_port(cxled);
|
||||
int rc;
|
||||
|
|
|
|||
|
|
@ -922,12 +922,19 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
|
|||
hpa_alias = hpa - cache_size;
|
||||
}
|
||||
|
||||
if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
|
||||
if (event_type == CXL_CPER_EVENT_GEN_MEDIA) {
|
||||
if (cxl_store_rec_gen_media((struct cxl_memdev *)cxlmd, evt))
|
||||
dev_dbg(&cxlmd->dev, "CXL store rec_gen_media failed\n");
|
||||
|
||||
trace_cxl_general_media(cxlmd, type, cxlr, hpa,
|
||||
hpa_alias, &evt->gen_media);
|
||||
else if (event_type == CXL_CPER_EVENT_DRAM)
|
||||
} else if (event_type == CXL_CPER_EVENT_DRAM) {
|
||||
if (cxl_store_rec_dram((struct cxl_memdev *)cxlmd, evt))
|
||||
dev_dbg(&cxlmd->dev, "CXL store rec_dram failed\n");
|
||||
|
||||
trace_cxl_dram(cxlmd, type, cxlr, hpa, hpa_alias,
|
||||
&evt->dram);
|
||||
}
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, "CXL");
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ static void cxl_memdev_release(struct device *dev)
|
|||
struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
|
||||
|
||||
ida_free(&cxl_memdev_ida, cxlmd->id);
|
||||
devm_cxl_memdev_edac_release(cxlmd);
|
||||
kfree(cxlmd);
|
||||
}
|
||||
|
||||
|
|
@ -153,8 +154,8 @@ static ssize_t security_state_show(struct device *dev,
|
|||
return sysfs_emit(buf, "frozen\n");
|
||||
if (state & CXL_PMEM_SEC_STATE_LOCKED)
|
||||
return sysfs_emit(buf, "locked\n");
|
||||
else
|
||||
return sysfs_emit(buf, "unlocked\n");
|
||||
|
||||
return sysfs_emit(buf, "unlocked\n");
|
||||
}
|
||||
static struct device_attribute dev_attr_security_state =
|
||||
__ATTR(state, 0444, security_state_show, NULL);
|
||||
|
|
|
|||
|
|
@ -415,9 +415,40 @@ int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
|
|||
*/
|
||||
if (global_ctrl & CXL_HDM_DECODER_ENABLE || (!hdm && info->mem_enabled))
|
||||
return devm_cxl_enable_mem(&port->dev, cxlds);
|
||||
else if (!hdm)
|
||||
|
||||
/*
|
||||
* If the HDM Decoder Capability does not exist and DVSEC was
|
||||
* not setup, the DVSEC based emulation cannot be used.
|
||||
*/
|
||||
if (!hdm)
|
||||
return -ENODEV;
|
||||
|
||||
/* The HDM Decoder Capability exists but is globally disabled. */
|
||||
|
||||
/*
|
||||
* If the DVSEC CXL Range registers are not enabled, just
|
||||
* enable and use the HDM Decoder Capability registers.
|
||||
*/
|
||||
if (!info->mem_enabled) {
|
||||
rc = devm_cxl_enable_hdm(&port->dev, cxlhdm);
|
||||
if (rc)
|
||||
return rc;
|
||||
|
||||
return devm_cxl_enable_mem(&port->dev, cxlds);
|
||||
}
|
||||
|
||||
/*
|
||||
* Per CXL 2.0 Section 8.1.3.8.3 and 8.1.3.8.4 DVSEC CXL Range 1 Base
|
||||
* [High,Low] when HDM operation is enabled the range register values
|
||||
* are ignored by the device, but the spec also recommends matching the
|
||||
* DVSEC Range 1,2 to HDM Decoder Range 0,1. So, non-zero info->ranges
|
||||
* are expected even though Linux does not require or maintain that
|
||||
* match. Check if at least one DVSEC range is enabled and allowed by
|
||||
* the platform. That is, the DVSEC range must be covered by a locked
|
||||
* platform window (CFMWS). Fail otherwise as the endpoint's decoders
|
||||
* cannot be used.
|
||||
*/
|
||||
|
||||
root = to_cxl_port(port->dev.parent);
|
||||
while (!is_cxl_root(root) && is_cxl_port(root->dev.parent))
|
||||
root = to_cxl_port(root->dev.parent);
|
||||
|
|
@ -426,14 +457,6 @@ int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
|
|||
return -ENODEV;
|
||||
}
|
||||
|
||||
if (!info->mem_enabled) {
|
||||
rc = devm_cxl_enable_hdm(&port->dev, cxlhdm);
|
||||
if (rc)
|
||||
return rc;
|
||||
|
||||
return devm_cxl_enable_mem(&port->dev, cxlds);
|
||||
}
|
||||
|
||||
for (i = 0, allowed = 0; i < info->ranges; i++) {
|
||||
struct device *cxld_dev;
|
||||
|
||||
|
|
@ -453,15 +476,6 @@ int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
|
|||
return -ENXIO;
|
||||
}
|
||||
|
||||
/*
|
||||
* Per CXL 2.0 Section 8.1.3.8.3 and 8.1.3.8.4 DVSEC CXL Range 1 Base
|
||||
* [High,Low] when HDM operation is enabled the range register values
|
||||
* are ignored by the device, but the spec also recommends matching the
|
||||
* DVSEC Range 1,2 to HDM Decoder Range 0,1. So, non-zero info->ranges
|
||||
* are expected even though Linux does not require or maintain that
|
||||
* match. If at least one DVSEC range is enabled and allowed, skip HDM
|
||||
* Decoder Capability Enable.
|
||||
*/
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_NS_GPL(cxl_hdm_decode_init, "CXL");
|
||||
|
|
|
|||
|
|
@ -602,17 +602,19 @@ struct cxl_port *to_cxl_port(const struct device *dev)
|
|||
}
|
||||
EXPORT_SYMBOL_NS_GPL(to_cxl_port, "CXL");
|
||||
|
||||
struct cxl_port *parent_port_of(struct cxl_port *port)
|
||||
{
|
||||
if (!port || !port->parent_dport)
|
||||
return NULL;
|
||||
return port->parent_dport->port;
|
||||
}
|
||||
|
||||
static void unregister_port(void *_port)
|
||||
{
|
||||
struct cxl_port *port = _port;
|
||||
struct cxl_port *parent;
|
||||
struct cxl_port *parent = parent_port_of(port);
|
||||
struct device *lock_dev;
|
||||
|
||||
if (is_cxl_root(port))
|
||||
parent = NULL;
|
||||
else
|
||||
parent = to_cxl_port(port->dev.parent);
|
||||
|
||||
/*
|
||||
* CXL root ports and the first level of ports are unregistered
|
||||
* under the platform firmware device lock, all other ports are
|
||||
|
|
@ -1035,15 +1037,6 @@ struct cxl_root *find_cxl_root(struct cxl_port *port)
|
|||
}
|
||||
EXPORT_SYMBOL_NS_GPL(find_cxl_root, "CXL");
|
||||
|
||||
void put_cxl_root(struct cxl_root *cxl_root)
|
||||
{
|
||||
if (!cxl_root)
|
||||
return;
|
||||
|
||||
put_device(&cxl_root->port.dev);
|
||||
}
|
||||
EXPORT_SYMBOL_NS_GPL(put_cxl_root, "CXL");
|
||||
|
||||
static struct cxl_dport *find_dport(struct cxl_port *port, int id)
|
||||
{
|
||||
struct cxl_dport *dport;
|
||||
|
|
|
|||
|
|
@ -231,11 +231,10 @@ static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
|
|||
&cxlr->dev,
|
||||
"Bypassing cpu_cache_invalidate_memregion() for testing!\n");
|
||||
return 0;
|
||||
} else {
|
||||
dev_WARN(&cxlr->dev,
|
||||
"Failed to synchronize CPU cache state\n");
|
||||
return -ENXIO;
|
||||
}
|
||||
dev_WARN(&cxlr->dev,
|
||||
"Failed to synchronize CPU cache state\n");
|
||||
return -ENXIO;
|
||||
}
|
||||
|
||||
cpu_cache_invalidate_memregion(IORES_DESC_CXL);
|
||||
|
|
@ -865,10 +864,23 @@ static int match_auto_decoder(struct device *dev, const void *data)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* cxl_port_pick_region_decoder() - assign or lookup a decoder for a region
|
||||
* @port: a port in the ancestry of the endpoint implied by @cxled
|
||||
* @cxled: endpoint decoder to be, or currently, mapped by @port
|
||||
* @cxlr: region to establish, or validate, decode @port
|
||||
*
|
||||
* In the region creation path cxl_port_pick_region_decoder() is an
|
||||
* allocator to find a free decoder. In the region assembly path, it is
|
||||
* recalling the decoder that platform firmware picked for validation
|
||||
* purposes.
|
||||
*
|
||||
* The result is recorded in a 'struct cxl_region_ref' in @port.
|
||||
*/
|
||||
static struct cxl_decoder *
|
||||
cxl_region_find_decoder(struct cxl_port *port,
|
||||
struct cxl_endpoint_decoder *cxled,
|
||||
struct cxl_region *cxlr)
|
||||
cxl_port_pick_region_decoder(struct cxl_port *port,
|
||||
struct cxl_endpoint_decoder *cxled,
|
||||
struct cxl_region *cxlr)
|
||||
{
|
||||
struct device *dev;
|
||||
|
||||
|
|
@ -916,7 +928,8 @@ static bool auto_order_ok(struct cxl_port *port, struct cxl_region *cxlr_iter,
|
|||
|
||||
static struct cxl_region_ref *
|
||||
alloc_region_ref(struct cxl_port *port, struct cxl_region *cxlr,
|
||||
struct cxl_endpoint_decoder *cxled)
|
||||
struct cxl_endpoint_decoder *cxled,
|
||||
struct cxl_decoder *cxld)
|
||||
{
|
||||
struct cxl_region_params *p = &cxlr->params;
|
||||
struct cxl_region_ref *cxl_rr, *iter;
|
||||
|
|
@ -930,9 +943,6 @@ alloc_region_ref(struct cxl_port *port, struct cxl_region *cxlr,
|
|||
continue;
|
||||
|
||||
if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
|
||||
struct cxl_decoder *cxld;
|
||||
|
||||
cxld = cxl_region_find_decoder(port, cxled, cxlr);
|
||||
if (auto_order_ok(port, iter->region, cxld))
|
||||
continue;
|
||||
}
|
||||
|
|
@ -1014,19 +1024,11 @@ static int cxl_rr_ep_add(struct cxl_region_ref *cxl_rr,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int cxl_rr_alloc_decoder(struct cxl_port *port, struct cxl_region *cxlr,
|
||||
struct cxl_endpoint_decoder *cxled,
|
||||
struct cxl_region_ref *cxl_rr)
|
||||
static int cxl_rr_assign_decoder(struct cxl_port *port, struct cxl_region *cxlr,
|
||||
struct cxl_endpoint_decoder *cxled,
|
||||
struct cxl_region_ref *cxl_rr,
|
||||
struct cxl_decoder *cxld)
|
||||
{
|
||||
struct cxl_decoder *cxld;
|
||||
|
||||
cxld = cxl_region_find_decoder(port, cxled, cxlr);
|
||||
if (!cxld) {
|
||||
dev_dbg(&cxlr->dev, "%s: no decoder available\n",
|
||||
dev_name(&port->dev));
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
if (cxld->region) {
|
||||
dev_dbg(&cxlr->dev, "%s: %s already attached to %s\n",
|
||||
dev_name(&port->dev), dev_name(&cxld->dev),
|
||||
|
|
@ -1117,7 +1119,16 @@ static int cxl_port_attach_region(struct cxl_port *port,
|
|||
nr_targets_inc = true;
|
||||
}
|
||||
} else {
|
||||
cxl_rr = alloc_region_ref(port, cxlr, cxled);
|
||||
struct cxl_decoder *cxld;
|
||||
|
||||
cxld = cxl_port_pick_region_decoder(port, cxled, cxlr);
|
||||
if (!cxld) {
|
||||
dev_dbg(&cxlr->dev, "%s: no decoder available\n",
|
||||
dev_name(&port->dev));
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
cxl_rr = alloc_region_ref(port, cxlr, cxled, cxld);
|
||||
if (IS_ERR(cxl_rr)) {
|
||||
dev_dbg(&cxlr->dev,
|
||||
"%s: failed to allocate region reference\n",
|
||||
|
|
@ -1126,7 +1137,7 @@ static int cxl_port_attach_region(struct cxl_port *port,
|
|||
}
|
||||
nr_targets_inc = true;
|
||||
|
||||
rc = cxl_rr_alloc_decoder(port, cxlr, cxled, cxl_rr);
|
||||
rc = cxl_rr_assign_decoder(port, cxlr, cxled, cxl_rr, cxld);
|
||||
if (rc)
|
||||
goto out_erase;
|
||||
}
|
||||
|
|
@ -1446,7 +1457,7 @@ static int cxl_port_setup_targets(struct cxl_port *port,
|
|||
|
||||
if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
|
||||
if (cxld->interleave_ways != iw ||
|
||||
cxld->interleave_granularity != ig ||
|
||||
(iw > 1 && cxld->interleave_granularity != ig) ||
|
||||
!region_res_match_cxl_range(p, &cxld->hpa_range) ||
|
||||
((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
|
||||
dev_err(&cxlr->dev,
|
||||
|
|
@ -1748,13 +1759,6 @@ static int cmp_interleave_pos(const void *a, const void *b)
|
|||
return cxled_a->pos - cxled_b->pos;
|
||||
}
|
||||
|
||||
static struct cxl_port *next_port(struct cxl_port *port)
|
||||
{
|
||||
if (!port->parent_dport)
|
||||
return NULL;
|
||||
return port->parent_dport->port;
|
||||
}
|
||||
|
||||
static int match_switch_decoder_by_range(struct device *dev,
|
||||
const void *data)
|
||||
{
|
||||
|
|
@ -1781,7 +1785,7 @@ static int find_pos_and_ways(struct cxl_port *port, struct range *range,
|
|||
struct device *dev;
|
||||
int rc = -ENXIO;
|
||||
|
||||
parent = next_port(port);
|
||||
parent = parent_port_of(port);
|
||||
if (!parent)
|
||||
return rc;
|
||||
|
||||
|
|
@ -1805,6 +1809,13 @@ static int find_pos_and_ways(struct cxl_port *port, struct range *range,
|
|||
}
|
||||
put_device(dev);
|
||||
|
||||
if (rc)
|
||||
dev_err(port->uport_dev,
|
||||
"failed to find %s:%s in target list of %s\n",
|
||||
dev_name(&port->dev),
|
||||
dev_name(port->parent_dport->dport_dev),
|
||||
dev_name(&cxlsd->cxld.dev));
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
|
@ -1861,7 +1872,7 @@ static int cxl_calc_interleave_pos(struct cxl_endpoint_decoder *cxled)
|
|||
*/
|
||||
|
||||
/* Iterate from endpoint to root_port refining the position */
|
||||
for (iter = port; iter; iter = next_port(iter)) {
|
||||
for (iter = port; iter; iter = parent_port_of(iter)) {
|
||||
if (is_cxl_root(iter))
|
||||
break;
|
||||
|
||||
|
|
@ -1940,7 +1951,9 @@ static int cxl_region_attach(struct cxl_region *cxlr,
|
|||
if (p->state > CXL_CONFIG_INTERLEAVE_ACTIVE) {
|
||||
dev_dbg(&cxlr->dev, "region already active\n");
|
||||
return -EBUSY;
|
||||
} else if (p->state < CXL_CONFIG_INTERLEAVE_ACTIVE) {
|
||||
}
|
||||
|
||||
if (p->state < CXL_CONFIG_INTERLEAVE_ACTIVE) {
|
||||
dev_dbg(&cxlr->dev, "interleave config missing\n");
|
||||
return -ENXIO;
|
||||
}
|
||||
|
|
@ -2160,6 +2173,12 @@ static int attach_target(struct cxl_region *cxlr,
|
|||
rc = cxl_region_attach(cxlr, cxled, pos);
|
||||
up_read(&cxl_dpa_rwsem);
|
||||
up_write(&cxl_region_rwsem);
|
||||
|
||||
if (rc)
|
||||
dev_warn(cxled->cxld.dev.parent,
|
||||
"failed to attach %s to %s: %d\n",
|
||||
dev_name(&cxled->cxld.dev), dev_name(&cxlr->dev), rc);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
|
@ -3196,20 +3215,49 @@ err:
|
|||
return rc;
|
||||
}
|
||||
|
||||
static int match_root_decoder_by_range(struct device *dev,
|
||||
const void *data)
|
||||
static int match_decoder_by_range(struct device *dev, const void *data)
|
||||
{
|
||||
const struct range *r1, *r2 = data;
|
||||
struct cxl_root_decoder *cxlrd;
|
||||
struct cxl_decoder *cxld;
|
||||
|
||||
if (!is_root_decoder(dev))
|
||||
if (!is_switch_decoder(dev))
|
||||
return 0;
|
||||
|
||||
cxlrd = to_cxl_root_decoder(dev);
|
||||
r1 = &cxlrd->cxlsd.cxld.hpa_range;
|
||||
cxld = to_cxl_decoder(dev);
|
||||
r1 = &cxld->hpa_range;
|
||||
return range_contains(r1, r2);
|
||||
}
|
||||
|
||||
static struct cxl_decoder *
|
||||
cxl_port_find_switch_decoder(struct cxl_port *port, struct range *hpa)
|
||||
{
|
||||
struct device *cxld_dev = device_find_child(&port->dev, hpa,
|
||||
match_decoder_by_range);
|
||||
|
||||
return cxld_dev ? to_cxl_decoder(cxld_dev) : NULL;
|
||||
}
|
||||
|
||||
static struct cxl_root_decoder *
|
||||
cxl_find_root_decoder(struct cxl_endpoint_decoder *cxled)
|
||||
{
|
||||
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
|
||||
struct cxl_port *port = cxled_to_port(cxled);
|
||||
struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port);
|
||||
struct cxl_decoder *root, *cxld = &cxled->cxld;
|
||||
struct range *hpa = &cxld->hpa_range;
|
||||
|
||||
root = cxl_port_find_switch_decoder(&cxl_root->port, hpa);
|
||||
if (!root) {
|
||||
dev_err(cxlmd->dev.parent,
|
||||
"%s:%s no CXL window for range %#llx:%#llx\n",
|
||||
dev_name(&cxlmd->dev), dev_name(&cxld->dev),
|
||||
cxld->hpa_range.start, cxld->hpa_range.end);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return to_cxl_root_decoder(&root->dev);
|
||||
}
|
||||
|
||||
static int match_region_by_range(struct device *dev, const void *data)
|
||||
{
|
||||
struct cxl_region_params *p;
|
||||
|
|
@ -3376,47 +3424,45 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
|
|||
return cxlr;
|
||||
}
|
||||
|
||||
int cxl_add_to_region(struct cxl_port *root, struct cxl_endpoint_decoder *cxled)
|
||||
static struct cxl_region *
|
||||
cxl_find_region_by_range(struct cxl_root_decoder *cxlrd, struct range *hpa)
|
||||
{
|
||||
struct device *region_dev;
|
||||
|
||||
region_dev = device_find_child(&cxlrd->cxlsd.cxld.dev, hpa,
|
||||
match_region_by_range);
|
||||
if (!region_dev)
|
||||
return NULL;
|
||||
|
||||
return to_cxl_region(region_dev);
|
||||
}
|
||||
|
||||
int cxl_add_to_region(struct cxl_endpoint_decoder *cxled)
|
||||
{
|
||||
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
|
||||
struct range *hpa = &cxled->cxld.hpa_range;
|
||||
struct cxl_decoder *cxld = &cxled->cxld;
|
||||
struct device *cxlrd_dev, *region_dev;
|
||||
struct cxl_root_decoder *cxlrd;
|
||||
struct cxl_region_params *p;
|
||||
struct cxl_region *cxlr;
|
||||
bool attach = false;
|
||||
int rc;
|
||||
|
||||
cxlrd_dev = device_find_child(&root->dev, &cxld->hpa_range,
|
||||
match_root_decoder_by_range);
|
||||
if (!cxlrd_dev) {
|
||||
dev_err(cxlmd->dev.parent,
|
||||
"%s:%s no CXL window for range %#llx:%#llx\n",
|
||||
dev_name(&cxlmd->dev), dev_name(&cxld->dev),
|
||||
cxld->hpa_range.start, cxld->hpa_range.end);
|
||||
struct cxl_root_decoder *cxlrd __free(put_cxl_root_decoder) =
|
||||
cxl_find_root_decoder(cxled);
|
||||
if (!cxlrd)
|
||||
return -ENXIO;
|
||||
}
|
||||
|
||||
cxlrd = to_cxl_root_decoder(cxlrd_dev);
|
||||
|
||||
/*
|
||||
* Ensure that if multiple threads race to construct_region() for @hpa
|
||||
* one does the construction and the others add to that.
|
||||
*/
|
||||
mutex_lock(&cxlrd->range_lock);
|
||||
region_dev = device_find_child(&cxlrd->cxlsd.cxld.dev, hpa,
|
||||
match_region_by_range);
|
||||
if (!region_dev) {
|
||||
struct cxl_region *cxlr __free(put_cxl_region) =
|
||||
cxl_find_region_by_range(cxlrd, hpa);
|
||||
if (!cxlr)
|
||||
cxlr = construct_region(cxlrd, cxled);
|
||||
region_dev = &cxlr->dev;
|
||||
} else
|
||||
cxlr = to_cxl_region(region_dev);
|
||||
mutex_unlock(&cxlrd->range_lock);
|
||||
|
||||
rc = PTR_ERR_OR_ZERO(cxlr);
|
||||
if (rc)
|
||||
goto out;
|
||||
return rc;
|
||||
|
||||
attach_target(cxlr, cxled, -1, TASK_UNINTERRUPTIBLE);
|
||||
|
||||
|
|
@ -3436,9 +3482,6 @@ int cxl_add_to_region(struct cxl_port *root, struct cxl_endpoint_decoder *cxled)
|
|||
p->res);
|
||||
}
|
||||
|
||||
put_device(region_dev);
|
||||
out:
|
||||
put_device(cxlrd_dev);
|
||||
return rc;
|
||||
}
|
||||
EXPORT_SYMBOL_NS_GPL(cxl_add_to_region, "CXL");
|
||||
|
|
@ -3537,8 +3580,18 @@ out:
|
|||
|
||||
switch (cxlr->mode) {
|
||||
case CXL_PARTMODE_PMEM:
|
||||
rc = devm_cxl_region_edac_register(cxlr);
|
||||
if (rc)
|
||||
dev_dbg(&cxlr->dev, "CXL EDAC registration for region_id=%d failed\n",
|
||||
cxlr->id);
|
||||
|
||||
return devm_cxl_add_pmem_region(cxlr);
|
||||
case CXL_PARTMODE_RAM:
|
||||
rc = devm_cxl_region_edac_register(cxlr);
|
||||
if (rc)
|
||||
dev_dbg(&cxlr->dev, "CXL EDAC registration for region_id=%d failed\n",
|
||||
cxlr->id);
|
||||
|
||||
/*
|
||||
* The region can not be managed by CXL if any portion of
|
||||
* it is already online as 'System RAM'
|
||||
|
|
|
|||
|
|
@ -724,6 +724,7 @@ static inline bool is_cxl_root(struct cxl_port *port)
|
|||
int cxl_num_decoders_committed(struct cxl_port *port);
|
||||
bool is_cxl_port(const struct device *dev);
|
||||
struct cxl_port *to_cxl_port(const struct device *dev);
|
||||
struct cxl_port *parent_port_of(struct cxl_port *port);
|
||||
void cxl_port_commit_reap(struct cxl_decoder *cxld);
|
||||
struct pci_bus;
|
||||
int devm_cxl_register_pci_bus(struct device *host, struct device *uport_dev,
|
||||
|
|
@ -736,10 +737,12 @@ struct cxl_port *devm_cxl_add_port(struct device *host,
|
|||
struct cxl_root *devm_cxl_add_root(struct device *host,
|
||||
const struct cxl_root_ops *ops);
|
||||
struct cxl_root *find_cxl_root(struct cxl_port *port);
|
||||
void put_cxl_root(struct cxl_root *cxl_root);
|
||||
DEFINE_FREE(put_cxl_root, struct cxl_root *, if (_T) put_cxl_root(_T))
|
||||
|
||||
DEFINE_FREE(put_cxl_root, struct cxl_root *, if (_T) put_device(&_T->port.dev))
|
||||
DEFINE_FREE(put_cxl_port, struct cxl_port *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev))
|
||||
DEFINE_FREE(put_cxl_root_decoder, struct cxl_root_decoder *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->cxlsd.cxld.dev))
|
||||
DEFINE_FREE(put_cxl_region, struct cxl_region *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev))
|
||||
|
||||
int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd);
|
||||
void cxl_bus_rescan(void);
|
||||
void cxl_bus_drain(void);
|
||||
|
|
@ -856,8 +859,7 @@ struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_port *port);
|
|||
#ifdef CONFIG_CXL_REGION
|
||||
bool is_cxl_pmem_region(struct device *dev);
|
||||
struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev);
|
||||
int cxl_add_to_region(struct cxl_port *root,
|
||||
struct cxl_endpoint_decoder *cxled);
|
||||
int cxl_add_to_region(struct cxl_endpoint_decoder *cxled);
|
||||
struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
|
||||
u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
|
||||
#else
|
||||
|
|
@ -869,8 +871,7 @@ static inline struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev)
|
|||
{
|
||||
return NULL;
|
||||
}
|
||||
static inline int cxl_add_to_region(struct cxl_port *root,
|
||||
struct cxl_endpoint_decoder *cxled)
|
||||
static inline int cxl_add_to_region(struct cxl_endpoint_decoder *cxled)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -912,4 +913,14 @@ bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port);
|
|||
|
||||
u16 cxl_gpf_get_dvsec(struct device *dev);
|
||||
|
||||
static inline struct rw_semaphore *rwsem_read_intr_acquire(struct rw_semaphore *rwsem)
|
||||
{
|
||||
if (down_read_interruptible(rwsem))
|
||||
return NULL;
|
||||
|
||||
return rwsem;
|
||||
}
|
||||
|
||||
DEFINE_FREE(rwsem_read_release, struct rw_semaphore *, if (_T) up_read(_T))
|
||||
|
||||
#endif /* __CXL_H__ */
|
||||
|
|
|
|||
|
|
@ -45,6 +45,11 @@
|
|||
* @endpoint: connection to the CXL port topology for this memory device
|
||||
* @id: id number of this memdev instance.
|
||||
* @depth: endpoint port depth
|
||||
* @scrub_cycle: current scrub cycle set for this device
|
||||
* @scrub_region_id: id number of a backed region (if any) for which current scrub cycle set
|
||||
* @err_rec_array: List of xarrays to store the memdev error records to
|
||||
* check attributes for a memory repair operation are from
|
||||
* current boot.
|
||||
*/
|
||||
struct cxl_memdev {
|
||||
struct device dev;
|
||||
|
|
@ -56,6 +61,9 @@ struct cxl_memdev {
|
|||
struct cxl_port *endpoint;
|
||||
int id;
|
||||
int depth;
|
||||
u8 scrub_cycle;
|
||||
int scrub_region_id;
|
||||
void *err_rec_array;
|
||||
};
|
||||
|
||||
static inline struct cxl_memdev *to_cxl_memdev(struct device *dev)
|
||||
|
|
@ -527,6 +535,7 @@ enum cxl_opcode {
|
|||
CXL_MBOX_OP_GET_SUPPORTED_FEATURES = 0x0500,
|
||||
CXL_MBOX_OP_GET_FEATURE = 0x0501,
|
||||
CXL_MBOX_OP_SET_FEATURE = 0x0502,
|
||||
CXL_MBOX_OP_DO_MAINTENANCE = 0x0600,
|
||||
CXL_MBOX_OP_IDENTIFY = 0x4000,
|
||||
CXL_MBOX_OP_GET_PARTITION_INFO = 0x4100,
|
||||
CXL_MBOX_OP_SET_PARTITION_INFO = 0x4101,
|
||||
|
|
@ -853,6 +862,27 @@ int cxl_trigger_poison_list(struct cxl_memdev *cxlmd);
|
|||
int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa);
|
||||
int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa);
|
||||
|
||||
#ifdef CONFIG_CXL_EDAC_MEM_FEATURES
|
||||
int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd);
|
||||
int devm_cxl_region_edac_register(struct cxl_region *cxlr);
|
||||
int cxl_store_rec_gen_media(struct cxl_memdev *cxlmd, union cxl_event *evt);
|
||||
int cxl_store_rec_dram(struct cxl_memdev *cxlmd, union cxl_event *evt);
|
||||
void devm_cxl_memdev_edac_release(struct cxl_memdev *cxlmd);
|
||||
#else
|
||||
static inline int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd)
|
||||
{ return 0; }
|
||||
static inline int devm_cxl_region_edac_register(struct cxl_region *cxlr)
|
||||
{ return 0; }
|
||||
static inline int cxl_store_rec_gen_media(struct cxl_memdev *cxlmd,
|
||||
union cxl_event *evt)
|
||||
{ return 0; }
|
||||
static inline int cxl_store_rec_dram(struct cxl_memdev *cxlmd,
|
||||
union cxl_event *evt)
|
||||
{ return 0; }
|
||||
static inline void devm_cxl_memdev_edac_release(struct cxl_memdev *cxlmd)
|
||||
{ return; }
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_CXL_SUSPEND
|
||||
void cxl_mem_active_inc(void);
|
||||
void cxl_mem_active_dec(void);
|
||||
|
|
|
|||
|
|
@ -180,6 +180,10 @@ static int cxl_mem_probe(struct device *dev)
|
|||
return rc;
|
||||
}
|
||||
|
||||
rc = devm_cxl_memdev_edac_register(cxlmd);
|
||||
if (rc)
|
||||
dev_dbg(dev, "CXL memdev EDAC registration failed rc=%d\n", rc);
|
||||
|
||||
/*
|
||||
* The kernel may be operating out of CXL memory on this device,
|
||||
* there is no spec defined way to determine whether this device
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ static void schedule_detach(void *cxlmd)
|
|||
schedule_cxl_memdev_detach(cxlmd);
|
||||
}
|
||||
|
||||
static int discover_region(struct device *dev, void *root)
|
||||
static int discover_region(struct device *dev, void *unused)
|
||||
{
|
||||
struct cxl_endpoint_decoder *cxled;
|
||||
int rc;
|
||||
|
|
@ -49,7 +49,7 @@ static int discover_region(struct device *dev, void *root)
|
|||
* Region enumeration is opportunistic, if this add-event fails,
|
||||
* continue to the next endpoint decoder.
|
||||
*/
|
||||
rc = cxl_add_to_region(root, cxled);
|
||||
rc = cxl_add_to_region(cxled);
|
||||
if (rc)
|
||||
dev_dbg(dev, "failed to add to region: %#llx-%#llx\n",
|
||||
cxled->cxld.hpa_range.start, cxled->cxld.hpa_range.end);
|
||||
|
|
@ -95,7 +95,6 @@ static int cxl_endpoint_port_probe(struct cxl_port *port)
|
|||
struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev);
|
||||
struct cxl_dev_state *cxlds = cxlmd->cxlds;
|
||||
struct cxl_hdm *cxlhdm;
|
||||
struct cxl_port *root;
|
||||
int rc;
|
||||
|
||||
rc = cxl_dvsec_rr_decode(cxlds, &info);
|
||||
|
|
@ -126,19 +125,11 @@ static int cxl_endpoint_port_probe(struct cxl_port *port)
|
|||
if (rc)
|
||||
return rc;
|
||||
|
||||
/*
|
||||
* This can't fail in practice as CXL root exit unregisters all
|
||||
* descendant ports and that in turn synchronizes with cxl_port_probe()
|
||||
*/
|
||||
struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port);
|
||||
|
||||
root = &cxl_root->port;
|
||||
|
||||
/*
|
||||
* Now that all endpoint decoders are successfully enumerated, try to
|
||||
* assemble regions from committed decoders
|
||||
*/
|
||||
device_for_each_child(&port->dev, root, discover_region);
|
||||
device_for_each_child(&port->dev, NULL, discover_region);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,6 +45,15 @@ struct edac_mem_repair_context {
|
|||
struct attribute_group group;
|
||||
};
|
||||
|
||||
const char * const edac_repair_type[] = {
|
||||
[EDAC_REPAIR_PPR] = "ppr",
|
||||
[EDAC_REPAIR_CACHELINE_SPARING] = "cacheline-sparing",
|
||||
[EDAC_REPAIR_ROW_SPARING] = "row-sparing",
|
||||
[EDAC_REPAIR_BANK_SPARING] = "bank-sparing",
|
||||
[EDAC_REPAIR_RANK_SPARING] = "rank-sparing",
|
||||
};
|
||||
EXPORT_SYMBOL_GPL(edac_repair_type);
|
||||
|
||||
#define TO_MR_DEV_ATTR(_dev_attr) \
|
||||
container_of(_dev_attr, struct edac_mem_repair_dev_attr, dev_attr)
|
||||
|
||||
|
|
|
|||
|
|
@ -64,7 +64,7 @@ struct cxl_features_state {
|
|||
struct cxl_mailbox;
|
||||
struct cxl_memdev;
|
||||
#ifdef CONFIG_CXL_FEATURES
|
||||
inline struct cxl_features_state *to_cxlfs(struct cxl_dev_state *cxlds);
|
||||
struct cxl_features_state *to_cxlfs(struct cxl_dev_state *cxlds);
|
||||
int devm_cxl_setup_features(struct cxl_dev_state *cxlds);
|
||||
int devm_cxl_setup_fwctl(struct device *host, struct cxl_memdev *cxlmd);
|
||||
#else
|
||||
|
|
|
|||
|
|
@ -745,9 +745,16 @@ static inline int edac_ecs_get_desc(struct device *ecs_dev,
|
|||
#endif /* CONFIG_EDAC_ECS */
|
||||
|
||||
enum edac_mem_repair_type {
|
||||
EDAC_REPAIR_PPR,
|
||||
EDAC_REPAIR_CACHELINE_SPARING,
|
||||
EDAC_REPAIR_ROW_SPARING,
|
||||
EDAC_REPAIR_BANK_SPARING,
|
||||
EDAC_REPAIR_RANK_SPARING,
|
||||
EDAC_REPAIR_MAX
|
||||
};
|
||||
|
||||
extern const char * const edac_repair_type[];
|
||||
|
||||
enum edac_mem_repair_cmd {
|
||||
EDAC_DO_MEM_REPAIR = 1,
|
||||
};
|
||||
|
|
|
|||
|
|
@ -67,6 +67,7 @@ cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
|
|||
cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
|
||||
cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o
|
||||
cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o
|
||||
cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o
|
||||
cxl_core-y += config_check.o
|
||||
cxl_core-y += cxl_core_test.o
|
||||
cxl_core-y += cxl_core_exports.o
|
||||
|
|
|
|||
|
|
@ -1527,5 +1527,6 @@ MODULE_PARM_DESC(interleave_arithmetic, "Modulo:0, XOR:1");
|
|||
module_init(cxl_test_init);
|
||||
module_exit(cxl_test_exit);
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_DESCRIPTION("cxl_test: setup module");
|
||||
MODULE_IMPORT_NS("ACPI");
|
||||
MODULE_IMPORT_NS("CXL");
|
||||
|
|
|
|||
|
|
@ -1909,4 +1909,5 @@ static struct platform_driver cxl_mock_mem_driver = {
|
|||
|
||||
module_platform_driver(cxl_mock_mem_driver);
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_DESCRIPTION("cxl_test: mem device mock module");
|
||||
MODULE_IMPORT_NS("CXL");
|
||||
|
|
|
|||
|
|
@ -312,5 +312,6 @@ void __wrap_cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device
|
|||
EXPORT_SYMBOL_NS_GPL(__wrap_cxl_dport_init_ras_reporting, "CXL");
|
||||
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_DESCRIPTION("cxl_test: emulation module");
|
||||
MODULE_IMPORT_NS("ACPI");
|
||||
MODULE_IMPORT_NS("CXL");
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue