2022-04-12 10:23:56 +03:00
|
|
|
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
|
|
|
|
/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. */
|
|
|
|
|
2022-04-12 10:24:06 +03:00
|
|
|
#include <rdma/ib_umem_odp.h>
|
2022-04-12 10:23:56 +03:00
|
|
|
#include "mlx5_ib.h"
|
|
|
|
#include "umr.h"
|
2022-04-12 10:24:01 +03:00
|
|
|
#include "wr.h"
|
2022-04-12 10:23:56 +03:00
|
|
|
|
2022-04-12 10:24:04 +03:00
|
|
|
/*
 * We can't use an array for xlt_emergency_page because dma_map_single doesn't
 * work on kernel modules memory
 */
/* Last-resort single-page XLT buffer used when all allocations fail. */
void *xlt_emergency_page;
/*
 * Serializes use of xlt_emergency_page: taken in mlx5r_umr_alloc_xlt() when
 * falling back to the emergency page and released in mlx5r_umr_free_xlt().
 */
static DEFINE_MUTEX(xlt_emergency_page_mutex);
|
|
|
|
|
2022-04-12 10:23:58 +03:00
|
|
|
static __be64 get_umr_enable_mr_mask(void)
|
|
|
|
{
|
|
|
|
u64 result;
|
|
|
|
|
|
|
|
result = MLX5_MKEY_MASK_KEY |
|
|
|
|
MLX5_MKEY_MASK_FREE;
|
|
|
|
|
|
|
|
return cpu_to_be64(result);
|
|
|
|
}
|
|
|
|
|
|
|
|
static __be64 get_umr_disable_mr_mask(void)
|
|
|
|
{
|
|
|
|
u64 result;
|
|
|
|
|
|
|
|
result = MLX5_MKEY_MASK_FREE;
|
|
|
|
|
|
|
|
return cpu_to_be64(result);
|
|
|
|
}
|
|
|
|
|
2025-07-09 09:42:09 +03:00
|
|
|
static __be64 get_umr_update_translation_mask(struct mlx5_ib_dev *dev)
|
2022-04-12 10:23:58 +03:00
|
|
|
{
|
|
|
|
u64 result;
|
|
|
|
|
|
|
|
result = MLX5_MKEY_MASK_LEN |
|
|
|
|
MLX5_MKEY_MASK_PAGE_SIZE |
|
|
|
|
MLX5_MKEY_MASK_START_ADDR;
|
2025-07-09 09:42:09 +03:00
|
|
|
if (MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5))
|
|
|
|
result |= MLX5_MKEY_MASK_PAGE_SIZE_5;
|
2022-04-12 10:23:58 +03:00
|
|
|
|
|
|
|
return cpu_to_be64(result);
|
|
|
|
}
|
|
|
|
|
2022-04-12 10:23:59 +03:00
|
|
|
static __be64 get_umr_update_access_mask(struct mlx5_ib_dev *dev)
|
2022-04-12 10:23:58 +03:00
|
|
|
{
|
|
|
|
u64 result;
|
|
|
|
|
|
|
|
result = MLX5_MKEY_MASK_LR |
|
|
|
|
MLX5_MKEY_MASK_LW |
|
|
|
|
MLX5_MKEY_MASK_RR |
|
|
|
|
MLX5_MKEY_MASK_RW;
|
|
|
|
|
2022-04-12 10:23:59 +03:00
|
|
|
if (MLX5_CAP_GEN(dev->mdev, atomic))
|
2022-04-12 10:23:58 +03:00
|
|
|
result |= MLX5_MKEY_MASK_A;
|
|
|
|
|
2022-04-12 10:23:59 +03:00
|
|
|
if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
|
2022-04-12 10:23:58 +03:00
|
|
|
result |= MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE;
|
|
|
|
|
2022-04-12 10:23:59 +03:00
|
|
|
if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
|
2022-04-12 10:23:58 +03:00
|
|
|
result |= MLX5_MKEY_MASK_RELAXED_ORDERING_READ;
|
|
|
|
|
|
|
|
return cpu_to_be64(result);
|
|
|
|
}
|
|
|
|
|
|
|
|
static __be64 get_umr_update_pd_mask(void)
|
|
|
|
{
|
|
|
|
u64 result;
|
|
|
|
|
|
|
|
result = MLX5_MKEY_MASK_PD;
|
|
|
|
|
|
|
|
return cpu_to_be64(result);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Validate that every mkey-mask bit requested by a UMR WQE is actually
 * modifiable on this device. Returns 0 if allowed, -EPERM if any requested
 * modification is disabled by device capabilities.
 */
static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask)
{
	struct mlx5_core_dev *mdev = dev->mdev;

	if ((mask & MLX5_MKEY_MASK_PAGE_SIZE) &&
	    MLX5_CAP_GEN(mdev, umr_modify_entity_size_disabled))
		return -EPERM;

	if ((mask & MLX5_MKEY_MASK_A) &&
	    MLX5_CAP_GEN(mdev, umr_modify_atomic_disabled))
		return -EPERM;

	if ((mask & MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE) &&
	    !MLX5_CAP_GEN(mdev, relaxed_ordering_write_umr))
		return -EPERM;

	if ((mask & MLX5_MKEY_MASK_RELAXED_ORDERING_READ) &&
	    !MLX5_CAP_GEN(mdev, relaxed_ordering_read_umr))
		return -EPERM;

	return 0;
}
|
|
|
|
|
2022-04-12 10:23:56 +03:00
|
|
|
/*
 * Depth of the UMR QP send queue; also used to initialize the semaphore that
 * bounds the number of in-flight UMR work requests (see sema_init() in
 * mlx5r_umr_resource_init()).
 */
enum {
	MAX_UMR_WR = 128,
};
|
|
|
|
|
|
|
|
static int mlx5r_umr_qp_rst2rts(struct mlx5_ib_dev *dev, struct ib_qp *qp)
|
|
|
|
{
|
|
|
|
struct ib_qp_attr attr = {};
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
attr.qp_state = IB_QPS_INIT;
|
|
|
|
attr.port_num = 1;
|
|
|
|
ret = ib_modify_qp(qp, &attr,
|
|
|
|
IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT);
|
|
|
|
if (ret) {
|
|
|
|
mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
memset(&attr, 0, sizeof(attr));
|
|
|
|
attr.qp_state = IB_QPS_RTR;
|
|
|
|
|
|
|
|
ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
|
|
|
|
if (ret) {
|
|
|
|
mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
memset(&attr, 0, sizeof(attr));
|
|
|
|
attr.qp_state = IB_QPS_RTS;
|
|
|
|
ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
|
|
|
|
if (ret) {
|
|
|
|
mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Lazily allocate the per-device UMR resources (CQ and QP) on first use.
 *
 * The UMR PD is created earlier in mlx5r_umr_init(); this function creates
 * the CQ and the special MLX5_IB_QPT_REG_UMR QP, brings the QP to RTS and
 * publishes it in dev->umrc. dev->umrc.qp is written last so a non-NULL
 * value means the whole structure is ready.
 *
 * Returns 0 on success (including when already initialized), or a negative
 * errno on failure.
 */
int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
{
	struct ib_qp_init_attr init_attr = {};
	struct ib_cq *cq;
	struct ib_qp *qp;
	int ret = 0;

	/*
	 * UMR qp is set once, never changed until device unload.
	 * Avoid taking the mutex if initialization is already done.
	 */
	if (dev->umrc.qp)
		return 0;

	mutex_lock(&dev->umrc.init_lock);
	/* First user allocates the UMR resources. Skip if already allocated. */
	if (dev->umrc.qp)
		goto unlock;

	cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
	if (IS_ERR(cq)) {
		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
		ret = PTR_ERR(cq);
		goto unlock;
	}

	/* One CQ serves both send and receive sides of the UMR QP. */
	init_attr.send_cq = cq;
	init_attr.recv_cq = cq;
	init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
	init_attr.cap.max_send_wr = MAX_UMR_WR;
	init_attr.cap.max_send_sge = 1;
	init_attr.qp_type = MLX5_IB_QPT_REG_UMR;
	init_attr.port_num = 1;
	qp = ib_create_qp(dev->umrc.pd, &init_attr);
	if (IS_ERR(qp)) {
		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
		ret = PTR_ERR(qp);
		goto destroy_cq;
	}

	ret = mlx5r_umr_qp_rst2rts(dev, qp);
	if (ret)
		goto destroy_qp;

	dev->umrc.cq = cq;

	/* Semaphore bounds the number of concurrently posted UMR WRs. */
	sema_init(&dev->umrc.sem, MAX_UMR_WR);
	mutex_init(&dev->umrc.lock);
	dev->umrc.state = MLX5_UMR_STATE_ACTIVE;
	/* Publish the QP last: non-NULL qp signals full initialization. */
	dev->umrc.qp = qp;

	mutex_unlock(&dev->umrc.init_lock);
	return 0;

destroy_qp:
	ib_destroy_qp(qp);
destroy_cq:
	ib_free_cq(cq);
unlock:
	mutex_unlock(&dev->umrc.init_lock);
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Tear down the UMR CQ/QP created by mlx5r_umr_resource_init().
 * A state of MLX5_UMR_STATE_UNINIT means the lazy init never ran, so there
 * is nothing to release.
 */
void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
{
	if (dev->umrc.state == MLX5_UMR_STATE_UNINIT)
		return;
	mutex_destroy(&dev->umrc.lock);
	/* After device init, UMR cp/qp are not unset during the lifetime. */
	ib_destroy_qp(dev->umrc.qp);
	ib_free_cq(dev->umrc.cq);
}
|
|
|
|
|
|
|
|
int mlx5r_umr_init(struct mlx5_ib_dev *dev)
|
|
|
|
{
|
|
|
|
struct ib_pd *pd;
|
|
|
|
|
|
|
|
pd = ib_alloc_pd(&dev->ib_dev, 0);
|
|
|
|
if (IS_ERR(pd)) {
|
|
|
|
mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
|
|
|
|
return PTR_ERR(pd);
|
|
|
|
}
|
|
|
|
dev->umrc.pd = pd;
|
|
|
|
|
|
|
|
mutex_init(&dev->umrc.init_lock);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Release the UMR PD and init lock set up by mlx5r_umr_init().
 * A NULL PD means mlx5r_umr_init() never succeeded, so skip teardown.
 */
void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev)
{
	if (!dev->umrc.pd)
		return;

	mutex_destroy(&dev->umrc.init_lock);
	ib_dealloc_pd(dev->umrc.pd);
}
|
2022-04-12 10:24:01 +03:00
|
|
|
|
2022-05-15 07:19:53 +03:00
|
|
|
|
2022-04-12 10:24:01 +03:00
|
|
|
/*
 * Post a single UMR WQE on the UMR QP.
 *
 * @ibqp:      the UMR QP to post on
 * @mkey:      mkey the WQE operates on (placed in the ctrl segment)
 * @cqe:       completion cookie; its address doubles as the wr_id so the
 *             CQ handler can recover the posting context
 * @wqe:       pre-built UMR WQE (ctrl + mkey [+ data] segments)
 * @with_data: whether the trailing data segment is part of the WQE
 *
 * Returns 0 on success, -EIO if the device is in internal error, or the
 * mlx5r_begin_wqe() error. Runs with the SQ lock held and IRQs disabled.
 */
static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
			       struct mlx5r_umr_wqe *wqe, bool with_data)
{
	/* Without data, the trailing data segment is not copied to the SQ. */
	unsigned int wqe_size =
		with_data ? sizeof(struct mlx5r_umr_wqe) :
			    sizeof(struct mlx5r_umr_wqe) -
				    sizeof(struct mlx5_wqe_data_seg);
	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
	struct mlx5_core_dev *mdev = dev->mdev;
	struct mlx5_ib_qp *qp = to_mqp(ibqp);
	struct mlx5_wqe_ctrl_seg *ctrl;
	/* Pun the cqe pointer into the 64-bit wr_id carried by the WQE. */
	union {
		struct ib_cqe *ib_cqe;
		u64 wr_id;
	} id;
	void *cur_edge, *seg;
	unsigned long flags;
	unsigned int idx;
	int size, err;

	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR))
		return -EIO;

	spin_lock_irqsave(&qp->sq.lock, flags);

	err = mlx5r_begin_wqe(qp, &seg, &ctrl, &idx, &size, &cur_edge, 0,
			      cpu_to_be32(mkey), false, false);
	if (WARN_ON(err))
		goto out;

	qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;

	/* Copy the prepared WQE segments into the send queue buffer. */
	mlx5r_memcpy_send_wqe(&qp->sq, &cur_edge, &seg, &size, wqe, wqe_size);

	id.ib_cqe = cqe;
	mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0,
			 MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR);

	/* Ring the doorbell to hand the WQE to hardware. */
	mlx5r_ring_db(qp, 1, ctrl);

out:
	spin_unlock_irqrestore(&qp->sq.lock, flags);

	return err;
}
|
|
|
|
|
RDMA/mlx5: Fix the recovery flow of the UMR QP
This patch addresses an issue in the recovery flow of the UMR QP,
ensuring tasks do not get stuck, as highlighted by the call trace [1].
During recovery, before transitioning the QP to the RESET state, the
software must wait for all outstanding WRs to complete.
Failing to do so can cause the firmware to skip sending some flushed
CQEs with errors and simply discard them upon the RESET, as per the IB
specification.
This race condition can result in lost CQEs and tasks becoming stuck.
To resolve this, the patch sends a final WR which serves only as a
barrier before moving the QP state to RESET.
Once a CQE is received for that final WR, it guarantees that no
outstanding WRs remain, making it safe to transition the QP to RESET and
subsequently back to RTS, restoring proper functionality.
Note:
For the barrier WR, we simply reuse the failed and ready WR.
Since the QP is in an error state, it will only receive
IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier we don't
care about its status.
[1]
INFO: task rdma_resource_l:1922 blocked for more than 120 seconds.
Tainted: G W 6.12.0-rc7+ #1626
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:rdma_resource_l state:D stack:0 pid:1922 tgid:1922 ppid:1369
flags:0x00004004
Call Trace:
<TASK>
__schedule+0x420/0xd30
schedule+0x47/0x130
schedule_timeout+0x280/0x300
? mark_held_locks+0x48/0x80
? lockdep_hardirqs_on_prepare+0xe5/0x1a0
wait_for_completion+0x75/0x130
mlx5r_umr_post_send_wait+0x3c2/0x5b0 [mlx5_ib]
? __pfx_mlx5r_umr_done+0x10/0x10 [mlx5_ib]
mlx5r_umr_revoke_mr+0x93/0xc0 [mlx5_ib]
__mlx5_ib_dereg_mr+0x299/0x520 [mlx5_ib]
? _raw_spin_unlock_irq+0x24/0x40
? wait_for_completion+0xfe/0x130
? rdma_restrack_put+0x63/0xe0 [ib_core]
ib_dereg_mr_user+0x5f/0x120 [ib_core]
? lock_release+0xc6/0x280
destroy_hw_idr_uobject+0x1d/0x60 [ib_uverbs]
uverbs_destroy_uobject+0x58/0x1d0 [ib_uverbs]
uobj_destroy+0x3f/0x70 [ib_uverbs]
ib_uverbs_cmd_verbs+0x3e4/0xbb0 [ib_uverbs]
? __pfx_uverbs_destroy_def_handler+0x10/0x10 [ib_uverbs]
? __lock_acquire+0x64e/0x2080
? mark_held_locks+0x48/0x80
? find_held_lock+0x2d/0xa0
? lock_acquire+0xc1/0x2f0
? ib_uverbs_ioctl+0xcb/0x170 [ib_uverbs]
? __fget_files+0xc3/0x1b0
ib_uverbs_ioctl+0xe7/0x170 [ib_uverbs]
? ib_uverbs_ioctl+0xcb/0x170 [ib_uverbs]
__x64_sys_ioctl+0x1b0/0xa70
do_syscall_64+0x6b/0x140
entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7f99c918b17b
RSP: 002b:00007ffc766d0468 EFLAGS: 00000246 ORIG_RAX:
0000000000000010
RAX: ffffffffffffffda RBX: 00007ffc766d0578 RCX:
00007f99c918b17b
RDX: 00007ffc766d0560 RSI: 00000000c0181b01 RDI:
0000000000000003
RBP: 00007ffc766d0540 R08: 00007f99c8f99010 R09:
000000000000bd7e
R10: 00007f99c94c1c70 R11: 0000000000000246 R12:
00007ffc766d0530
R13: 000000000000001c R14: 0000000040246a80 R15:
0000000000000000
</TASK>
Fixes: 158e71bb69e3 ("RDMA/mlx5: Add a umr recovery flow")
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
Link: https://patch.msgid.link/27b51b92ec42dfb09d8096fcbd51878f397ce6ec.1737290141.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
2025-01-19 14:36:13 +02:00
|
|
|
/*
 * Recover the UMR QP after a WR completed with an error.
 *
 * Reposts the failed WR purely as a barrier: once its (flushed) completion
 * arrives, no earlier WR can still be outstanding, so it is safe to cycle
 * the QP through RESET and back to RTS. On success the UMR state returns to
 * ACTIVE; on any failure it is left in MLX5_UMR_STATE_ERR and the error is
 * returned. Caller is expected to hold the umrc->sem slot of the failed WR.
 */
static int mlx5r_umr_recover(struct mlx5_ib_dev *dev, u32 mkey,
			     struct mlx5r_umr_context *umr_context,
			     struct mlx5r_umr_wqe *wqe, bool with_data)
{
	struct umr_common *umrc = &dev->umrc;
	struct ib_qp_attr attr;
	int err;

	mutex_lock(&umrc->lock);
	/* Preventing any further WRs to be sent now */
	if (umrc->state != MLX5_UMR_STATE_RECOVER) {
		mlx5_ib_warn(dev, "UMR recovery encountered an unexpected state=%d\n",
			     umrc->state);
		umrc->state = MLX5_UMR_STATE_RECOVER;
	}
	mutex_unlock(&umrc->lock);

	/* Sending a final/barrier WR (the failed one) and wait for its completion.
	 * This will ensure that all the previous WRs got a completion before
	 * we set the QP state to RESET.
	 */
	err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context->cqe, wqe,
				  with_data);
	if (err) {
		mlx5_ib_warn(dev, "UMR recovery post send failed, err %d\n", err);
		goto err;
	}

	/* Since the QP is in an error state, it will only receive
	 * IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier
	 * we don't care about its status.
	 */
	wait_for_completion(&umr_context->done);

	attr.qp_state = IB_QPS_RESET;
	err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
	if (err) {
		mlx5_ib_warn(dev, "Couldn't modify UMR QP to RESET, err=%d\n", err);
		goto err;
	}

	err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
	if (err) {
		mlx5_ib_warn(dev, "Couldn't modify UMR QP to RTS, err=%d\n", err);
		goto err;
	}

	umrc->state = MLX5_UMR_STATE_ACTIVE;
	return 0;

err:
	umrc->state = MLX5_UMR_STATE_ERR;
	return err;
}
|
|
|
|
|
2022-04-12 10:24:01 +03:00
|
|
|
static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc)
|
|
|
|
{
|
|
|
|
struct mlx5_ib_umr_context *context =
|
|
|
|
container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
|
|
|
|
|
|
|
|
context->status = wc->status;
|
|
|
|
complete(&context->done);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void mlx5r_umr_init_context(struct mlx5r_umr_context *context)
|
|
|
|
{
|
|
|
|
context->cqe.done = mlx5r_umr_done;
|
|
|
|
init_completion(&context->done);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Synchronously execute a UMR WQE: post it and sleep until its completion.
 *
 * Serializes with the UMR state machine: backs off while a recovery is in
 * progress, fails fast with -EFAULT once the QP is in the ERR state, and on
 * a non-flush completion error triggers mlx5r_umr_recover(). A flushed WR
 * (IB_WC_WR_FLUSH_ERR) is simply retried. The umrc->sem slot taken here
 * bounds in-flight WRs to MAX_UMR_WR.
 *
 * Returns 0 on success, -EFAULT on unrecoverable UMR failure, or the post
 * error.
 */
static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
				    struct mlx5r_umr_wqe *wqe, bool with_data)
{
	struct umr_common *umrc = &dev->umrc;
	struct mlx5r_umr_context umr_context;
	int err;

	/* Refuse masks that request modifications the device forbids. */
	err = umr_check_mkey_mask(dev, be64_to_cpu(wqe->ctrl_seg.mkey_mask));
	if (WARN_ON(err))
		return err;

	mlx5r_umr_init_context(&umr_context);

	down(&umrc->sem);
	while (true) {
		mutex_lock(&umrc->lock);
		if (umrc->state == MLX5_UMR_STATE_ERR) {
			mutex_unlock(&umrc->lock);
			err = -EFAULT;
			break;
		}

		/* Recovery in progress: back off and retry shortly. */
		if (umrc->state == MLX5_UMR_STATE_RECOVER) {
			mutex_unlock(&umrc->lock);
			usleep_range(3000, 5000);
			continue;
		}

		err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
					  with_data);
		mutex_unlock(&umrc->lock);
		if (err) {
			mlx5_ib_warn(dev, "UMR post send failed, err %d\n",
				     err);
			break;
		}

		wait_for_completion(&umr_context.done);

		if (umr_context.status == IB_WC_SUCCESS)
			break;

		/* Flushed WRs are benign: resubmit once the QP recovers. */
		if (umr_context.status == IB_WC_WR_FLUSH_ERR)
			continue;

		WARN_ON_ONCE(1);
		mlx5_ib_warn(dev,
			"reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n",
			umr_context.status, mkey);
		err = mlx5r_umr_recover(dev, mkey, &umr_context, wqe, with_data);
		if (err)
			mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
				     err);
		err = -EFAULT;
		break;
	}
	up(&umrc->sem);
	return err;
}
|
2022-04-12 10:24:02 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* mlx5r_umr_revoke_mr - Fence all DMA on the MR
|
|
|
|
* @mr: The MR to fence
|
|
|
|
*
|
|
|
|
* Upon return the NIC will not be doing any DMA to the pages under the MR,
|
|
|
|
* and any DMA in progress will be completed. Failure of this function
|
|
|
|
* indicates the HW has failed catastrophically.
|
|
|
|
*/
|
|
|
|
int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr)
|
|
|
|
{
|
|
|
|
struct mlx5_ib_dev *dev = mr_to_mdev(mr);
|
|
|
|
struct mlx5r_umr_wqe wqe = {};
|
|
|
|
|
|
|
|
if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
|
|
|
|
wqe.ctrl_seg.mkey_mask |= get_umr_disable_mr_mask();
|
|
|
|
wqe.ctrl_seg.flags |= MLX5_UMR_INLINE;
|
|
|
|
|
|
|
|
MLX5_SET(mkc, &wqe.mkey_seg, free, 1);
|
|
|
|
MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(dev->umrc.pd)->pdn);
|
|
|
|
MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
|
|
|
|
MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
|
|
|
|
mlx5_mkey_variant(mr->mmkey.key));
|
|
|
|
|
|
|
|
return mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
|
|
|
|
}
|
2022-04-12 10:24:03 +03:00
|
|
|
|
|
|
|
static void mlx5r_umr_set_access_flags(struct mlx5_ib_dev *dev,
|
|
|
|
struct mlx5_mkey_seg *seg,
|
|
|
|
unsigned int access_flags)
|
|
|
|
{
|
2023-04-10 16:07:51 +03:00
|
|
|
bool ro_read = (access_flags & IB_ACCESS_RELAXED_ORDERING) &&
|
2023-04-10 16:07:53 +03:00
|
|
|
(MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
|
|
|
|
pcie_relaxed_ordering_enabled(dev->mdev->pdev));
|
2023-04-10 16:07:51 +03:00
|
|
|
|
2022-04-12 10:24:03 +03:00
|
|
|
MLX5_SET(mkc, seg, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
|
|
|
|
MLX5_SET(mkc, seg, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
|
|
|
|
MLX5_SET(mkc, seg, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
|
|
|
|
MLX5_SET(mkc, seg, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
|
|
|
|
MLX5_SET(mkc, seg, lr, 1);
|
|
|
|
MLX5_SET(mkc, seg, relaxed_ordering_write,
|
|
|
|
!!(access_flags & IB_ACCESS_RELAXED_ORDERING));
|
2023-04-10 16:07:51 +03:00
|
|
|
MLX5_SET(mkc, seg, relaxed_ordering_read, ro_read);
|
2022-04-12 10:24:03 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
|
|
|
|
int access_flags)
|
|
|
|
{
|
|
|
|
struct mlx5_ib_dev *dev = mr_to_mdev(mr);
|
|
|
|
struct mlx5r_umr_wqe wqe = {};
|
|
|
|
int err;
|
|
|
|
|
|
|
|
wqe.ctrl_seg.mkey_mask = get_umr_update_access_mask(dev);
|
|
|
|
wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
|
|
|
|
wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE;
|
|
|
|
wqe.ctrl_seg.flags |= MLX5_UMR_INLINE;
|
|
|
|
|
|
|
|
mlx5r_umr_set_access_flags(dev, &wqe.mkey_seg, access_flags);
|
|
|
|
MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(pd)->pdn);
|
|
|
|
MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
|
|
|
|
MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
|
|
|
|
mlx5_mkey_variant(mr->mmkey.key));
|
|
|
|
|
|
|
|
err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
mr->access_flags = access_flags;
|
|
|
|
return 0;
|
|
|
|
}
|
2022-04-12 10:24:04 +03:00
|
|
|
|
|
|
|
/* Largest XLT transfer attempted in one allocation (~1M, flex-aligned). */
#define MLX5_MAX_UMR_CHUNK \
	((1 << (MLX5_MAX_UMR_SHIFT + 4)) - MLX5_UMR_FLEX_ALIGNMENT)
/* Smaller fallback chunk tried before dropping to a single page. */
#define MLX5_SPARE_UMR_CHUNK 0x10000
|
|
|
|
|
|
|
|
/*
 * Allocate a temporary buffer to hold the per-page information to transfer to
 * HW. For efficiency this should be as large as it can be, but buffer
 * allocation failure is not allowed, so try smaller sizes.
 *
 * On return *nents holds the number of entries the returned buffer can hold
 * (it may be reduced from the requested count). The fallback order is:
 * MLX5_MAX_UMR_CHUNK -> MLX5_SPARE_UMR_CHUNK -> one page -> the global
 * xlt_emergency_page. When the emergency page is returned, its mutex is
 * held and is released by mlx5r_umr_free_xlt().
 */
static void *mlx5r_umr_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
{
	const size_t xlt_chunk_align = MLX5_UMR_FLEX_ALIGNMENT / ent_size;
	size_t size;
	void *res = NULL;

	static_assert(PAGE_SIZE % MLX5_UMR_FLEX_ALIGNMENT == 0);

	/*
	 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the
	 * allocation can't trigger any kind of reclaim.
	 */
	might_sleep();

	gfp_mask |= __GFP_ZERO | __GFP_NORETRY;

	/*
	 * If the system already has a suitable high order page then just use
	 * that, but don't try hard to create one. This max is about 1M, so a
	 * free x86 huge page will satisfy it.
	 */
	size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
		     MLX5_MAX_UMR_CHUNK);
	*nents = size / ent_size;
	res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
				       get_order(size));
	if (res)
		return res;

	/* Try a smaller spare chunk before giving up on multi-page buffers. */
	if (size > MLX5_SPARE_UMR_CHUNK) {
		size = MLX5_SPARE_UMR_CHUNK;
		*nents = size / ent_size;
		res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
					       get_order(size));
		if (res)
			return res;
	}

	/* Single page fallback. */
	*nents = PAGE_SIZE / ent_size;
	res = (void *)__get_free_page(gfp_mask);
	if (res)
		return res;

	/* Last resort: the shared emergency page, serialized by its mutex. */
	mutex_lock(&xlt_emergency_page_mutex);
	memset(xlt_emergency_page, 0, PAGE_SIZE);
	return xlt_emergency_page;
}
|
|
|
|
|
|
|
|
/*
 * Release an XLT buffer from mlx5r_umr_alloc_xlt(). The emergency page is
 * never freed; releasing its mutex makes it available to the next user.
 */
static void mlx5r_umr_free_xlt(void *xlt, size_t length)
{
	if (xlt != xlt_emergency_page)
		free_pages((unsigned long)xlt, get_order(length));
	else
		mutex_unlock(&xlt_emergency_page_mutex);
}
|
|
|
|
|
2022-04-12 10:24:06 +03:00
|
|
|
static void mlx5r_umr_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
|
|
|
|
struct ib_sge *sg)
|
2022-04-12 10:24:04 +03:00
|
|
|
{
|
|
|
|
struct device *ddev = &dev->mdev->pdev->dev;
|
|
|
|
|
|
|
|
dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
|
|
|
|
mlx5r_umr_free_xlt(xlt, sg->length);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create an XLT buffer ready for submission.
|
|
|
|
*/
|
2022-04-12 10:24:06 +03:00
|
|
|
static void *mlx5r_umr_create_xlt(struct mlx5_ib_dev *dev, struct ib_sge *sg,
|
|
|
|
size_t nents, size_t ent_size,
|
|
|
|
unsigned int flags)
|
2022-04-12 10:24:04 +03:00
|
|
|
{
|
|
|
|
struct device *ddev = &dev->mdev->pdev->dev;
|
|
|
|
dma_addr_t dma;
|
|
|
|
void *xlt;
|
|
|
|
|
|
|
|
xlt = mlx5r_umr_alloc_xlt(&nents, ent_size,
|
|
|
|
flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
|
|
|
|
GFP_KERNEL);
|
|
|
|
sg->length = nents * ent_size;
|
|
|
|
dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
|
|
|
|
if (dma_mapping_error(ddev, dma)) {
|
|
|
|
mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
|
|
|
|
mlx5r_umr_free_xlt(xlt, sg->length);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
sg->addr = dma;
|
|
|
|
sg->lkey = dev->umrc.pd->local_dma_lkey;
|
|
|
|
|
|
|
|
return xlt;
|
|
|
|
}
|
2022-04-12 10:24:05 +03:00
|
|
|
|
|
|
|
static void
|
|
|
|
mlx5r_umr_set_update_xlt_ctrl_seg(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg,
|
|
|
|
unsigned int flags, struct ib_sge *sg)
|
|
|
|
{
|
|
|
|
if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
|
|
|
|
/* fail if free */
|
|
|
|
ctrl_seg->flags = MLX5_UMR_CHECK_FREE;
|
|
|
|
else
|
|
|
|
/* fail if not free */
|
|
|
|
ctrl_seg->flags = MLX5_UMR_CHECK_NOT_FREE;
|
|
|
|
ctrl_seg->xlt_octowords =
|
|
|
|
cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void mlx5r_umr_set_update_xlt_mkey_seg(struct mlx5_ib_dev *dev,
|
|
|
|
struct mlx5_mkey_seg *mkey_seg,
|
|
|
|
struct mlx5_ib_mr *mr,
|
|
|
|
unsigned int page_shift)
|
|
|
|
{
|
|
|
|
mlx5r_umr_set_access_flags(dev, mkey_seg, mr->access_flags);
|
|
|
|
MLX5_SET(mkc, mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn);
|
|
|
|
MLX5_SET64(mkc, mkey_seg, start_addr, mr->ibmr.iova);
|
|
|
|
MLX5_SET64(mkc, mkey_seg, len, mr->ibmr.length);
|
|
|
|
MLX5_SET(mkc, mkey_seg, log_page_size, page_shift);
|
|
|
|
MLX5_SET(mkc, mkey_seg, qpn, 0xffffff);
|
|
|
|
MLX5_SET(mkc, mkey_seg, mkey_7_0, mlx5_mkey_variant(mr->mmkey.key));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
mlx5r_umr_set_update_xlt_data_seg(struct mlx5_wqe_data_seg *data_seg,
|
|
|
|
struct ib_sge *sg)
|
|
|
|
{
|
|
|
|
data_seg->byte_count = cpu_to_be32(sg->length);
|
|
|
|
data_seg->lkey = cpu_to_be32(sg->lkey);
|
|
|
|
data_seg->addr = cpu_to_be64(sg->addr);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void mlx5r_umr_update_offset(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg,
|
|
|
|
u64 offset)
|
|
|
|
{
|
|
|
|
u64 octo_offset = mlx5r_umr_get_xlt_octo(offset);
|
|
|
|
|
|
|
|
ctrl_seg->xlt_offset = cpu_to_be16(octo_offset & 0xffff);
|
|
|
|
ctrl_seg->xlt_offset_47_16 = cpu_to_be32(octo_offset >> 16);
|
|
|
|
ctrl_seg->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Finalize an XLT-update WQE: accumulate the mkey-mask bits implied by
 * @flags (enable, PD/access update, translation update), handle the
 * zero-length (length64) case, and set the final transfer sizes.
 * MLX5_IB_UPD_XLT_KEEP_PGSZ strips the page-size bit back out so the
 * existing page size is preserved.
 */
static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
				       struct mlx5r_umr_wqe *wqe,
				       struct mlx5_ib_mr *mr, struct ib_sge *sg,
				       unsigned int flags)
{
	bool update_pd_access, update_translation;

	if (flags & MLX5_IB_UPD_XLT_ENABLE)
		wqe->ctrl_seg.mkey_mask |= get_umr_enable_mr_mask();

	/* Enabling an mkey also (re)establishes its PD and access rights. */
	update_pd_access = flags & MLX5_IB_UPD_XLT_ENABLE ||
			   flags & MLX5_IB_UPD_XLT_PD ||
			   flags & MLX5_IB_UPD_XLT_ACCESS;

	if (update_pd_access) {
		wqe->ctrl_seg.mkey_mask |= get_umr_update_access_mask(dev);
		wqe->ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
	}

	update_translation =
		flags & MLX5_IB_UPD_XLT_ENABLE || flags & MLX5_IB_UPD_XLT_ADDR;

	if (update_translation) {
		wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask(dev);
		/* Zero-length MR covers the whole address space. */
		if (!mr->ibmr.length)
			MLX5_SET(mkc, &wqe->mkey_seg, length64, 1);
		/* Keep the mkey's current page size: drop the mask bit. */
		if (flags & MLX5_IB_UPD_XLT_KEEP_PGSZ)
			wqe->ctrl_seg.mkey_mask &=
				cpu_to_be64(~MLX5_MKEY_MASK_PAGE_SIZE);
	}

	wqe->ctrl_seg.xlt_octowords =
		cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length));
	wqe->data_seg.byte_count = cpu_to_be32(sg->length);
}
|
|
|
|
|
2025-07-09 09:42:11 +03:00
|
|
|
static void
|
|
|
|
_mlx5r_umr_init_wqe(struct mlx5_ib_mr *mr, struct mlx5r_umr_wqe *wqe,
|
|
|
|
struct ib_sge *sg, unsigned int flags,
|
|
|
|
unsigned int page_shift, bool dd)
|
|
|
|
{
|
|
|
|
struct mlx5_ib_dev *dev = mr_to_mdev(mr);
|
|
|
|
|
|
|
|
mlx5r_umr_set_update_xlt_ctrl_seg(&wqe->ctrl_seg, flags, sg);
|
|
|
|
mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe->mkey_seg, mr, page_shift);
|
|
|
|
if (dd) /* Use the data direct internal kernel PD */
|
|
|
|
MLX5_SET(mkc, &wqe->mkey_seg, pd, dev->ddr.pdn);
|
|
|
|
mlx5r_umr_set_update_xlt_data_seg(&wqe->data_seg, sg);
|
|
|
|
}
|
|
|
|
|
2024-08-01 15:05:16 +03:00
|
|
|
/*
 * _mlx5r_umr_update_mr_pas - push a range of an MR's DMA address list into
 * the HW translation table (XLT) using one or more UMR work requests.
 *
 * @mr: memory region whose translation entries are updated
 * @flags: MLX5_IB_UPD_XLT_* control flags
 * @dd: true for data-direct MRs (KSM entries), false for regular MRs (MTTs)
 * @start_block: index of the first DMA block (at mr->page_shift granularity)
 *               to update
 * @nblocks: number of blocks to update; 0 means "all blocks from start_block"
 *
 * Entries are staged into a bounce buffer obtained from
 * mlx5r_umr_create_xlt(); each time the buffer fills up it is DMA-synced and
 * posted to HW, then reused for the next chunk. Returns 0 on success or a
 * negative errno.
 */
static int
_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd,
			 size_t start_block, size_t nblocks)
{
	/* Entry size depends on the mkey layout: KSM for data-direct, MTT otherwise */
	size_t ent_size = dd ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt);
	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
	struct device *ddev = &dev->mdev->pdev->dev;
	struct mlx5r_umr_wqe wqe = {};
	size_t processed_blocks = 0;
	struct ib_block_iter biter;
	size_t cur_block_idx = 0;
	struct mlx5_ksm *cur_ksm;
	struct mlx5_mtt *cur_mtt;
	size_t orig_sg_length;
	size_t total_blocks;
	size_t final_size;
	void *curr_entry;
	struct ib_sge sg;
	void *entry;
	u64 offset;
	int err = 0;

	total_blocks = ib_umem_num_dma_blocks(mr->umem, 1UL << mr->page_shift);
	if (start_block > total_blocks)
		return -EINVAL;

	/* nblocks 0 means update all blocks starting from start_block */
	if (nblocks)
		total_blocks = nblocks;

	/* Bounce buffer; may be smaller than total_blocks * ent_size */
	entry = mlx5r_umr_create_xlt(dev, &sg, total_blocks, ent_size, flags);
	if (!entry)
		return -ENOMEM;

	/* sg.length is shrunk for the final chunk; remember it for unmap */
	orig_sg_length = sg.length;

	_mlx5r_umr_init_wqe(mr, &wqe, &sg, flags, mr->page_shift, dd);

	/* Set initial translation offset to start_block */
	offset = (u64)start_block * ent_size;
	mlx5r_umr_update_offset(&wqe.ctrl_seg, offset);

	if (dd)
		cur_ksm = entry;
	else
		cur_mtt = entry;

	curr_entry = entry;

	rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) {
		/* Skip blocks before the requested range */
		if (cur_block_idx < start_block) {
			cur_block_idx++;
			continue;
		}

		if (nblocks && processed_blocks >= nblocks)
			break;

		/* Bounce buffer full: flush this chunk to HW and rewind */
		if (curr_entry == entry + sg.length) {
			dma_sync_single_for_device(ddev, sg.addr, sg.length,
						   DMA_TO_DEVICE);

			err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe,
						       true);
			if (err)
				goto err;
			dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
						DMA_TO_DEVICE);
			offset += sg.length;
			mlx5r_umr_update_offset(&wqe.ctrl_seg, offset);
			if (dd)
				cur_ksm = entry;
			else
				cur_mtt = entry;
		}

		if (dd) {
			cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter));
			cur_ksm->key = cpu_to_be32(dev->ddr.mkey);
			/* ZAP on a dmabuf MR writes non-present entries */
			if (mr->umem->is_dmabuf &&
			    (flags & MLX5_IB_UPD_XLT_ZAP)) {
				cur_ksm->va = 0;
				cur_ksm->key = 0;
			}
			cur_ksm++;
			curr_entry = cur_ksm;
		} else {
			cur_mtt->ptag =
				cpu_to_be64(rdma_block_iter_dma_address(&biter) |
					    MLX5_IB_MTT_PRESENT);
			if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
				cur_mtt->ptag = 0;
			cur_mtt++;
			curr_entry = cur_mtt;
		}

		processed_blocks++;
	}

	/* Flush the last (possibly partial) chunk, zero-padding to alignment */
	final_size = curr_entry - entry;
	sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT);
	memset(curr_entry, 0, sg.length - final_size);
	mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);

	dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true);

err:
	/* Restore the full mapping length before unmapping/freeing */
	sg.length = orig_sg_length;
	mlx5r_umr_unmap_free_xlt(dev, entry, &sg);
	return err;
}
|
2022-04-12 10:24:06 +03:00
|
|
|
|
2025-07-09 09:42:11 +03:00
|
|
|
int mlx5r_umr_update_data_direct_ksm_pas_range(struct mlx5_ib_mr *mr,
|
|
|
|
unsigned int flags,
|
|
|
|
size_t start_block,
|
|
|
|
size_t nblocks)
|
2024-08-01 15:05:16 +03:00
|
|
|
{
|
|
|
|
/* No invalidation flow is expected */
|
2025-07-09 09:42:11 +03:00
|
|
|
if (WARN_ON(!mr->umem->is_dmabuf) || ((flags & MLX5_IB_UPD_XLT_ZAP) &&
|
|
|
|
!(flags & MLX5_IB_UPD_XLT_KEEP_PGSZ)))
|
2024-08-01 15:05:16 +03:00
|
|
|
return -EINVAL;
|
|
|
|
|
2025-07-09 09:42:11 +03:00
|
|
|
return _mlx5r_umr_update_mr_pas(mr, flags, true, start_block, nblocks);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Update every KSM entry of a data-direct MR (whole-range convenience wrapper). */
int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr,
					 unsigned int flags)
{
	return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags, 0, 0);
}
|
|
|
|
|
|
|
|
/* Update a range of MTT entries of a regular (non-ODP) MR. */
int mlx5r_umr_update_mr_pas_range(struct mlx5_ib_mr *mr, unsigned int flags,
				  size_t start_block, size_t nblocks)
{
	/* ODP MRs go through mlx5r_umr_update_xlt() instead */
	if (WARN_ON(mr->umem->is_odp))
		return -EINVAL;

	return _mlx5r_umr_update_mr_pas(mr, flags, false, start_block,
					nblocks);
}
|
|
|
|
|
|
|
|
/*
 * Send the DMA list to the HW for a normal MR using UMR.
 * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
 * flag may be used.
 */
int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
{
	/* Whole-range update: start at block 0, nblocks 0 == all blocks */
	return mlx5r_umr_update_mr_pas_range(mr, flags, 0, 0);
}
|
|
|
|
|
2022-04-12 10:24:06 +03:00
|
|
|
static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
|
|
|
|
{
|
|
|
|
return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * mlx5r_umr_update_xlt - update a window of an ODP MR's translation table.
 *
 * @mr: memory region to update (must be ODP backed)
 * @idx: index of the first translation entry to update
 * @npages: number of entries to update
 * @page_shift: page shift programmed into the mkey
 * @flags: MLX5_IB_UPD_XLT_* control flags
 *
 * Entries are staged chunk by chunk into a DMA-mapped bounce buffer and
 * posted to HW with UMR work requests. Returns 0 on success or a negative
 * errno.
 */
int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
			 int page_shift, int flags)
{
	int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
			       ? sizeof(struct mlx5_klm)
			       : sizeof(struct mlx5_mtt);
	const int page_align = MLX5_UMR_FLEX_ALIGNMENT / desc_size;
	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
	struct device *ddev = &dev->mdev->pdev->dev;
	const int page_mask = page_align - 1;
	struct mlx5r_umr_wqe wqe = {};
	size_t pages_mapped = 0;
	size_t pages_to_map = 0;
	size_t size_to_map = 0;
	size_t orig_sg_length;
	size_t pages_iter;
	struct ib_sge sg;
	int err = 0;
	void *xlt;

	if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
	    !umr_can_use_indirect_mkey(dev))
		return -EPERM;

	if (WARN_ON(!mr->umem->is_odp))
		return -EINVAL;

	/* UMR copies MTTs in units of MLX5_UMR_FLEX_ALIGNMENT bytes,
	 * so we need to align the offset and length accordingly
	 */
	if (idx & page_mask) {
		npages += idx & page_mask;
		idx &= ~page_mask;
	}
	pages_to_map = ALIGN(npages, page_align);

	xlt = mlx5r_umr_create_xlt(dev, &sg, npages, desc_size, flags);
	if (!xlt)
		return -ENOMEM;

	pages_iter = sg.length / desc_size;
	orig_sg_length = sg.length;

	if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
		struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
		size_t max_pages = ib_umem_odp_num_pages(odp) - idx;

		/* Never walk past the end of the ODP umem */
		pages_to_map = min_t(size_t, pages_to_map, max_pages);
	}

	mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
	mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, page_shift);
	mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);

	for (pages_mapped = 0;
	     pages_mapped < pages_to_map && !err;
	     pages_mapped += pages_iter, idx += pages_iter) {
		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
		size_to_map = npages * desc_size;
		dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
					DMA_TO_DEVICE);
		/*
		 * npages is the maximum number of pages to map, but we
		 * can't guarantee that all pages are actually mapped.
		 *
		 * For example, if page is p2p of type which is not supported
		 * for mapping, the number of pages mapped will be less than
		 * requested.
		 */
		err = mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
		if (err)
			/*
			 * Break instead of returning so the XLT bounce
			 * buffer below is unmapped and freed; returning
			 * here would leak the DMA mapping and pages.
			 */
			break;
		dma_sync_single_for_device(ddev, sg.addr, sg.length,
					   DMA_TO_DEVICE);
		sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT);

		/* Last chunk: apply the "final" WQE adjustments */
		if (pages_mapped + pages_iter >= pages_to_map)
			mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);
		mlx5r_umr_update_offset(&wqe.ctrl_seg, idx * desc_size);
		err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true);
	}
	sg.length = orig_sg_length;
	mlx5r_umr_unmap_free_xlt(dev, xlt, &sg);
	return err;
}
|
2025-07-09 09:42:11 +03:00
|
|
|
|
|
|
|
/*
 * Update only the page-size (log_page_size) field of an existing memory key
 * using UMR. This is useful when the MR's physical layout stays the same
 * but the optimal page shift has changed (e.g. dmabuf after pages are
 * pinned and the HW can switch from 4K to huge-page alignment).
 */
int mlx5r_umr_update_mr_page_shift(struct mlx5_ib_mr *mr,
				   unsigned int page_shift,
				   bool dd)
{
	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
	struct mlx5r_umr_wqe wqe = {};
	int err;

	/*
	 * Build UMR wqe: use the translation mask, which covers page size
	 * together with start address and length (all re-set below).
	 */
	wqe.ctrl_seg.mkey_mask = get_umr_update_translation_mask(dev);

	/* MR must be free while page size is modified */
	wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE | MLX5_UMR_INLINE;

	/* Fill mkey segment with the new page size, keep the rest unchanged */
	MLX5_SET(mkc, &wqe.mkey_seg, log_page_size, page_shift);

	/* Data-direct MRs use the internal kernel PD; others keep their own */
	if (dd)
		MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn);
	else
		MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn);

	/* Re-assert the MR's existing addressing and identity fields */
	MLX5_SET64(mkc, &wqe.mkey_seg, start_addr, mr->ibmr.iova);
	MLX5_SET64(mkc, &wqe.mkey_seg, len, mr->ibmr.length);
	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
		 mlx5_mkey_variant(mr->mmkey.key));

	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
	/* Track the new shift in SW only once HW accepted it */
	if (!err)
		mr->page_shift = page_shift;

	return err;
}
|
|
|
|
|
|
|
|
static inline int
|
|
|
|
_mlx5r_dmabuf_umr_update_pas(struct mlx5_ib_mr *mr, unsigned int flags,
|
|
|
|
size_t start_block, size_t nblocks, bool dd)
|
|
|
|
{
|
|
|
|
if (dd)
|
|
|
|
return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags,
|
|
|
|
start_block,
|
|
|
|
nblocks);
|
|
|
|
else
|
|
|
|
return mlx5r_umr_update_mr_pas_range(mr, flags, start_block,
|
|
|
|
nblocks);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * _mlx5r_umr_zap_mkey - make an mkey non-present ahead of a page-size change
 * @mr: the memory region to zap
 * @flags: translation table update flags (ZAP/KEEP_PGSZ/ATOMIC are forced on)
 * @page_shift: the new (optimized) page shift the caller intends to use
 * @nblocks: out parameter; number of entries that were zapped, or 0 when the
 *           caller should instead rewrite the whole mkey
 * @dd: true for data-direct (KSM) MRs, false for regular (MTT) MRs
 *
 * This function makes an mkey non-present by zapping the translation entries
 * of the mkey by zapping (zeroing out) the first N entries, where N is
 * determined by the largest page size supported by the device and the MR
 * length.
 * It then updates the mkey's page size to the largest possible value,
 * ensuring the MR is completely non-present and safe for further updates.
 * It is useful to update the page size of a dmabuf MR on a page fault.
 *
 * Return: 0 on success (with the zapped-entry count in *@nblocks), or a
 * negative error code. mr->page_shift is restored on failure.
 */
static int _mlx5r_umr_zap_mkey(struct mlx5_ib_mr *mr,
			       unsigned int flags,
			       unsigned int page_shift,
			       size_t *nblocks,
			       bool dd)
{
	unsigned int old_page_shift = mr->page_shift;
	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
	unsigned int max_page_shift;
	size_t page_shift_nblocks;
	unsigned int max_log_size;
	int access_mode;
	int err;

	access_mode = dd ? MLX5_MKC_ACCESS_MODE_KSM : MLX5_MKC_ACCESS_MODE_MTT;
	/* Zap entries in place while keeping the currently programmed pgsz */
	flags |= MLX5_IB_UPD_XLT_KEEP_PGSZ | MLX5_IB_UPD_XLT_ZAP |
		 MLX5_IB_UPD_XLT_ATOMIC;
	max_log_size = get_max_log_entity_size_cap(dev, access_mode);
	max_page_shift = order_base_2(mr->ibmr.length);
	/* Clamp between the requested shift and the device capability */
	max_page_shift = min(max(max_page_shift, page_shift), max_log_size);
	/* Count blocks in units of max_page_shift, we will zap exactly this
	 * many to make the whole MR non-present.
	 * Block size must be aligned to MLX5_UMR_FLEX_ALIGNMENT since it may
	 * be used as offset into the XLT later on.
	 */
	*nblocks = ib_umem_num_dma_blocks(mr->umem, 1UL << max_page_shift);
	if (dd)
		*nblocks = ALIGN(*nblocks, MLX5_UMR_KSM_NUM_ENTRIES_ALIGNMENT);
	else
		*nblocks = ALIGN(*nblocks, MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT);
	page_shift_nblocks = ib_umem_num_dma_blocks(mr->umem,
						    1UL << page_shift);
	/* If the number of blocks at max possible page shift is greater than
	 * the number of blocks at the new page size, we should just go over the
	 * whole mkey entries.
	 */
	if (*nblocks >= page_shift_nblocks)
		*nblocks = 0;

	/* Make the first nblocks entries non-present without changing
	 * page size yet.
	 */
	if (*nblocks)
		mr->page_shift = max_page_shift;
	err = _mlx5r_dmabuf_umr_update_pas(mr, flags, 0, *nblocks, dd);
	if (err) {
		mr->page_shift = old_page_shift;
		return err;
	}

	/* Change page size to the max page size now that the MR is completely
	 * non-present.
	 */
	if (*nblocks) {
		err = mlx5r_umr_update_mr_page_shift(mr, max_page_shift, dd);
		if (err) {
			mr->page_shift = old_page_shift;
			return err;
		}
	}

	return 0;
}
|
|
|
|
|
|
|
|
/**
 * mlx5r_umr_dmabuf_update_pgsz - Safely update DMABUF MR page size and its
 * entries accordingly
 * @mr: The memory region to update
 * @xlt_flags: Translation table update flags
 * @page_shift: The new (optimized) page shift to use
 *
 * This function updates the page size and mkey translation entries for a DMABUF
 * MR in a safe, multi-step process to avoid exposing partially updated mappings
 * The update is performed in 5 steps:
 * 1. Make the first X entries non-present, while X is calculated to be
 *    minimal according to a large page shift that can be used to cover the
 *    MR length.
 * 2. Update the page size to the large supported page size
 * 3. Load the remaining N-X entries according to the (optimized) page_shift
 * 4. Update the page size according to the (optimized) page_shift
 * 5. Load the first X entries with the correct translations
 *
 * This ensures that at no point is the MR accessible with a partially updated
 * translation table, maintaining correctness and preventing access to stale or
 * inconsistent mappings.
 *
 * Returns 0 on success or a negative error code on failure.
 */
int mlx5r_umr_dmabuf_update_pgsz(struct mlx5_ib_mr *mr, u32 xlt_flags,
				 unsigned int page_shift)
{
	unsigned int old_page_shift = mr->page_shift;
	size_t zapped_blocks;
	size_t total_blocks;
	int err;

	/* Steps 1+2: zap leading entries and raise pgsz to the max value */
	err = _mlx5r_umr_zap_mkey(mr, xlt_flags, page_shift, &zapped_blocks,
				  mr->data_direct);
	if (err)
		return err;

	/* _mlx5r_umr_zap_mkey already enables the mkey */
	xlt_flags &= ~MLX5_IB_UPD_XLT_ENABLE;
	mr->page_shift = page_shift;
	total_blocks = ib_umem_num_dma_blocks(mr->umem, 1UL << mr->page_shift);
	/* Step 3: load the tail entries (skipped when the zap covered all,
	 * i.e. zapped_blocks == 0)
	 */
	if (zapped_blocks && zapped_blocks < total_blocks) {
		/* Update PAS according to the new page size but don't update
		 * the page size in the mkey yet.
		 */
		err = _mlx5r_dmabuf_umr_update_pas(
			mr,
			xlt_flags | MLX5_IB_UPD_XLT_KEEP_PGSZ,
			zapped_blocks,
			total_blocks - zapped_blocks,
			mr->data_direct);
		if (err)
			goto err;
	}

	/* Step 4: commit the optimized page shift to the mkey */
	err = mlx5r_umr_update_mr_page_shift(mr, mr->page_shift,
					     mr->data_direct);
	if (err)
		goto err;
	/* Step 5: restore the first zapped_blocks entries (nblocks == 0
	 * rewrites the whole range when everything was zapped)
	 */
	err = _mlx5r_dmabuf_umr_update_pas(mr, xlt_flags, 0, zapped_blocks,
					   mr->data_direct);
	if (err)
		goto err;

	return 0;

err:
	/* Roll back the SW view of the page shift on any failure */
	mr->page_shift = old_page_shift;
	return err;
}
|