linux/drivers/infiniband/hw/mlx5/umr.h

/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. */

#ifndef _MLX5_IB_UMR_H
#define _MLX5_IB_UMR_H

#include "mlx5_ib.h"

#define MLX5_MAX_UMR_SHIFT 16
#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT)

#define MLX5_IB_UMR_OCTOWORD 16
#define MLX5_IB_UMR_XLT_ALIGNMENT 64

int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev);
void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev);

int mlx5r_umr_init(struct mlx5_ib_dev *dev);
void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev);

static inline bool mlx5r_umr_can_load_pas(struct mlx5_ib_dev *dev,
					  size_t length)
{
	/*
	 * umr_check_mkey_mask() rejects MLX5_MKEY_MASK_PAGE_SIZE which is
	 * always set if MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (aka
	 * MLX5_IB_UPD_XLT_ADDR and MLX5_IB_UPD_XLT_ENABLE) is set. Thus, a mkey
	 * can never be enabled without this capability. Simplify this weird
	 * quirky hardware by just saying it can't use PAS lists with UMR at
	 * all.
	 */
	if (MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled))
		return false;

	/*
	 * length is the size of the MR in bytes when mlx5_ib_update_xlt() is
	 * used.
	 */
	if (!MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
	    length >= MLX5_MAX_UMR_PAGES * PAGE_SIZE)
		return false;
	return true;
}
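
/*
 * Illustrative sketch (not part of this header): a registration path could
 * consult mlx5r_umr_can_load_pas() to decide whether an MR's page list may be
 * posted through UMR or whether it must fall back to a FW-created mkey. The
 * example_* helpers are hypothetical names used only for this sketch.
 *
 *	static struct mlx5_ib_mr *example_reg_mr(struct mlx5_ib_dev *dev,
 *						 struct ib_umem *umem,
 *						 u64 iova, int access_flags)
 *	{
 *		if (!mlx5r_umr_can_load_pas(dev, umem->length))
 *			return example_create_mkey_in_fw(dev, umem, iova,
 *							 access_flags);
 *		return example_reg_via_umr(dev, umem, iova, access_flags);
 *	}
 */
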
/*
* true if an existing MR can be reconfigured to new access_flags using UMR.
* Older HW cannot use UMR to update certain elements of the MKC. See
* get_umr_update_access_mask() and umr_check_mkey_mask()
*/
static inline bool mlx5r_umr_can_reconfig(struct mlx5_ib_dev *dev,
					  unsigned int current_access_flags,
					  unsigned int target_access_flags)
{
	unsigned int diffs = current_access_flags ^ target_access_flags;

	if ((diffs & IB_ACCESS_REMOTE_ATOMIC) &&
	    MLX5_CAP_GEN(dev->mdev, atomic) &&
	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
		return false;

	if ((diffs & IB_ACCESS_RELAXED_ORDERING) &&
	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
		return false;

	if ((diffs & IB_ACCESS_RELAXED_ORDERING) &&
	    (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
	     MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled)) &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
		return false;

	return true;
}
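
/*
 * Illustrative sketch (not part of this header): rereg-style code could use
 * mlx5r_umr_can_reconfig() to choose between a lightweight UMR update of the
 * access flags and a full re-registration. example_recreate_mr() is a
 * hypothetical fallback used only for this sketch.
 *
 *	static int example_update_access(struct mlx5_ib_dev *dev,
 *					 struct mlx5_ib_mr *mr,
 *					 int new_access_flags)
 *	{
 *		if (!mlx5r_umr_can_reconfig(dev, mr->access_flags,
 *					    new_access_flags))
 *			return example_recreate_mr(dev, mr, new_access_flags);
 *		return mlx5r_umr_rereg_pd_access(mr, mr->ibmr.pd,
 *						 new_access_flags);
 *	}
 */
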
static inline u64 mlx5r_umr_get_xlt_octo(u64 bytes)
{
	return ALIGN(bytes, MLX5_IB_UMR_XLT_ALIGNMENT) /
	       MLX5_IB_UMR_OCTOWORD;
}
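
/*
 * Worked example of the conversion above: XLT data is padded to a 64-byte
 * boundary and then expressed in 16-byte octowords, so for instance
 * mlx5r_umr_get_xlt_octo(200) == ALIGN(200, 64) / 16 == 256 / 16 == 16.
 */
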
struct mlx5r_umr_context {
	struct ib_cqe cqe;
	enum ib_wc_status status;
	struct completion done;
};
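
/*
 * Illustrative sketch (not part of this header): mlx5r_umr_context ties a CQE
 * completion callback to a completion object so that whoever posts a UMR WQE
 * can sleep until it finishes and then check the work-completion status. The
 * callback below is a hypothetical rendering of that pattern, not necessarily
 * the driver's own code.
 *
 *	static void example_umr_done(struct ib_cq *cq, struct ib_wc *wc)
 *	{
 *		struct mlx5r_umr_context *ctx =
 *			container_of(wc->wr_cqe, struct mlx5r_umr_context, cqe);
 *
 *		ctx->status = wc->status;
 *		complete(&ctx->done);
 *	}
 *
 * A poster would set ctx.cqe.done = example_umr_done, call
 * init_completion(&ctx.done), post the UMR WQE with wr_cqe pointing at
 * &ctx.cqe, wait_for_completion(&ctx.done), and finally compare ctx.status
 * against IB_WC_SUCCESS.
 */
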
struct mlx5r_umr_wqe {
	struct mlx5_wqe_umr_ctrl_seg ctrl_seg;
	struct mlx5_mkey_seg mkey_seg;
	struct mlx5_wqe_data_seg data_seg;
};

int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr);
int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
			      int access_flags);
int mlx5r_umr_update_data_direct_ksm_pas_range(struct mlx5_ib_mr *mr,
					       unsigned int flags,
					       size_t start_block,
					       size_t nblocks);
int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags);
int mlx5r_umr_update_mr_pas_range(struct mlx5_ib_mr *mr, unsigned int flags,
				  size_t start_block, size_t nblocks);
int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags);
int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
			 int page_shift, int flags);
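
/*
 * Illustrative sketch (not part of this header): mlx5r_umr_update_xlt()
 * operates on a window of the translation table, so a caller can (re)load just
 * the npages entries starting at index idx. The wrapper below is hypothetical;
 * the flag names are the ones mentioned in the mlx5r_umr_can_load_pas()
 * comment above.
 *
 *	static int example_load_window(struct mlx5_ib_mr *mr, u64 idx,
 *				       int npages, int page_shift)
 *	{
 *		return mlx5r_umr_update_xlt(mr, idx, npages, page_shift,
 *					    MLX5_IB_UPD_XLT_ADDR |
 *					    MLX5_IB_UPD_XLT_ENABLE);
 *	}
 */
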
int mlx5r_umr_update_mr_page_shift(struct mlx5_ib_mr *mr,
				   unsigned int page_shift,
				   bool dd);
int mlx5r_umr_dmabuf_update_pgsz(struct mlx5_ib_mr *mr, u32 xlt_flags,
				 unsigned int page_shift);
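
/*
 * Illustrative sketch (not part of this header): the range-based helpers above
 * make it possible to change an mkey's page size without ever exposing a
 * partially updated translation table. A simplified, hypothetical ordering is:
 * zap a small leading block of entries, switch the mkey to the new page shift,
 * load the remaining entries at the new granularity, then reload the leading
 * block with correct translations. The zap_flag parameter is a placeholder,
 * and the bool argument to mlx5r_umr_update_mr_page_shift() is assumed here to
 * select the data-direct mkey.
 *
 *	static int example_switch_page_shift(struct mlx5_ib_mr *mr,
 *					     unsigned int new_shift,
 *					     size_t lead_blocks,
 *					     size_t total_blocks,
 *					     unsigned int zap_flag)
 *	{
 *		int err;
 *
 *		err = mlx5r_umr_update_mr_pas_range(mr, zap_flag, 0,
 *						    lead_blocks);
 *		if (err)
 *			return err;
 *		err = mlx5r_umr_update_mr_page_shift(mr, new_shift, false);
 *		if (err)
 *			return err;
 *		err = mlx5r_umr_update_mr_pas_range(mr, 0, lead_blocks,
 *						    total_blocks - lead_blocks);
 *		if (err)
 *			return err;
 *		return mlx5r_umr_update_mr_pas_range(mr, 0, 0, lead_blocks);
 *	}
 */
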
#endif /* _MLX5_IB_UMR_H */