RDMA/rxe: Allow registering MRs for On-Demand Paging

Allow userspace to register an ODP-enabled MR, in which case the
IB_ACCESS_ON_DEMAND flag is passed to rxe_reg_user_mr(). However, no RDMA
operations are enabled on ODP MRs yet; they will be added in the subsequent
two patches.
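
From userspace this corresponds to passing IBV_ACCESS_ON_DEMAND to
ibv_reg_mr(). A minimal libibverbs sketch (the pd, buf, and len values are
assumed to come from the usual setup path and are not part of this patch):

    #include <infiniband/verbs.h>

    /* Sketch only: pd, buf, and len are assumed to exist already. */
    struct ibv_mr *reg_odp_mr(struct ibv_pd *pd, void *buf, size_t len)
    {
            /* IBV_ACCESS_ON_DEMAND becomes IB_ACCESS_ON_DEMAND in the kernel,
             * which steers rxe_reg_user_mr() into rxe_odp_mr_init_user(). */
            return ibv_reg_mr(pd, buf, len,
                              IBV_ACCESS_LOCAL_WRITE |
                              IBV_ACCESS_REMOTE_READ |
                              IBV_ACCESS_REMOTE_WRITE |
                              IBV_ACCESS_ON_DEMAND);
    }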

rxe_odp_do_pagefault_and_lock() is called to initialize an ODP-enabled MR.
When invoked with the RXE_PAGEFAULT_SNAPSHOT flag, it syncs the process
address space from the CPU page table to the driver page table
(dma_list/pfn_list in umem_odp). Without that flag, it can be used to trigger
a page fault when the pages being accessed are not present or lack the
required read/write permissions, and possibly to prefetch pages in the future.
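
As a rough illustration of the intended call pattern (this is not code from
the series; the helper name and signature are invented for the example), a
data-path caller would omit RXE_PAGEFAULT_SNAPSHOT so missing or
write-protected pages are faulted in, and would drop umem_mutex once it is
done with the page lists:

    /* Hypothetical sketch, not part of this patch: fault pages for an access,
     * use them under umem_mutex, then let the invalidation handler run again. */
    static int rxe_odp_fault_and_use(struct rxe_mr *mr, u64 iova, int length,
                                     bool read_only)
    {
            struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
            u32 flags = read_only ? RXE_PAGEFAULT_RDONLY : 0;
            int np;

            /* Without RXE_PAGEFAULT_SNAPSHOT this faults pages in as needed;
             * on success umem_mutex is held by ib_umem_odp_map_dma_and_lock(). */
            np = rxe_odp_do_pagefault_and_lock(mr, iova, length, flags);
            if (np < 0)
                    return np;

            /* ... access dma_list/pfn_list in umem_odp here ... */

            mutex_unlock(&umem_odp->umem_mutex);
            return 0;
    }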

Link: https://patch.msgid.link/r/20241220100936.2193541-4-matsuda-daisuke@fujitsu.com
Signed-off-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Authored by Daisuke Matsuda on 2024-12-20 19:09:34 +09:00; committed by Jason Gunthorpe
commit d03fb5c659 (parent b601792392)
6 changed files with 128 additions and 6 deletions

drivers/infiniband/sw/rxe/rxe.c

@@ -92,6 +92,13 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
        dev_put(ndev);

        rxe->max_ucontext = RXE_MAX_UCONTEXT;

        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
                rxe->attr.kernel_cap_flags |= IBK_ON_DEMAND_PAGING;

                /* IB_ODP_SUPPORT_IMPLICIT is not supported right now. */
                rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT;
        }
}

/* initialize port attributes */
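
With IB_ODP_SUPPORT advertised in odp_caps, userspace can probe for it before
attempting an ODP registration. A hedged sketch using ibv_query_device_ex()
(device and context setup are assumed and omitted):

    #include <stdbool.h>
    #include <infiniband/verbs.h>

    /* Sketch only: returns true if the device reports explicit ODP support. */
    static bool device_supports_odp(struct ibv_context *ctx)
    {
            struct ibv_device_attr_ex attr = {};

            if (ibv_query_device_ex(ctx, NULL, &attr))
                    return false;

            /* This patch sets IB_ODP_SUPPORT only; implicit ODP is not advertised. */
            return attr.odp_caps.general_caps & IBV_ODP_SUPPORT;
    }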

drivers/infiniband/sw/rxe/rxe_loc.h

@@ -184,4 +184,16 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)

/* rxe_odp.c */
extern const struct mmu_interval_notifier_ops rxe_mn_ops;

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
                         u64 iova, int access_flags, struct rxe_mr *mr);
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
static inline int
rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
                     int access_flags, struct rxe_mr *mr)
{
        return -EOPNOTSUPP;
}
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */

#endif /* RXE_LOC_H */

drivers/infiniband/sw/rxe/rxe_mr.c

@@ -323,6 +323,9 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
                return err;
        }

        if (mr->umem->is_odp)
                return -EOPNOTSUPP;
        else
                return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
}

@@ -532,6 +535,10 @@ int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
        struct page *page;
        u64 *va;

        /* ODP is not supported right now. WIP. */
        if (mr->umem->is_odp)
                return RESPST_ERR_UNSUPPORTED_OPCODE;

        /* See IBA oA19-28 */
        if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
                rxe_dbg_mr(mr, "mr not in valid state\n");

drivers/infiniband/sw/rxe/rxe_odp.c

@@ -36,3 +36,89 @@ static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni,
const struct mmu_interval_notifier_ops rxe_mn_ops = {
        .invalidate = rxe_ib_invalidate_range,
};

#define RXE_PAGEFAULT_RDONLY BIT(1)
#define RXE_PAGEFAULT_SNAPSHOT BIT(2)

static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcnt, u32 flags)
{
        struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
        bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT);
        u64 access_mask;
        int np;

        access_mask = ODP_READ_ALLOWED_BIT;
        if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY))
                access_mask |= ODP_WRITE_ALLOWED_BIT;

        /*
         * ib_umem_odp_map_dma_and_lock() locks umem_mutex on success.
         * Callers must release the lock later to let invalidation handler
         * do its work again.
         */
        np = ib_umem_odp_map_dma_and_lock(umem_odp, user_va, bcnt,
                                          access_mask, fault);
        return np;
}

static int rxe_odp_init_pages(struct rxe_mr *mr)
{
        struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
        int ret;

        ret = rxe_odp_do_pagefault_and_lock(mr, mr->umem->address,
                                            mr->umem->length,
                                            RXE_PAGEFAULT_SNAPSHOT);
        if (ret >= 0)
                mutex_unlock(&umem_odp->umem_mutex);

        return ret >= 0 ? 0 : ret;
}

int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
                         u64 iova, int access_flags, struct rxe_mr *mr)
{
        struct ib_umem_odp *umem_odp;
        int err;

        if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
                return -EOPNOTSUPP;

        rxe_mr_init(access_flags, mr);

        if (!start && length == U64_MAX) {
                if (iova != 0)
                        return -EINVAL;
                if (!(rxe->attr.odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
                        return -EINVAL;

                /* Never reach here, for implicit ODP is not implemented. */
        }

        umem_odp = ib_umem_odp_get(&rxe->ib_dev, start, length, access_flags,
                                   &rxe_mn_ops);
        if (IS_ERR(umem_odp)) {
                rxe_dbg_mr(mr, "Unable to create umem_odp err = %d\n",
                           (int)PTR_ERR(umem_odp));
                return PTR_ERR(umem_odp);
        }

        umem_odp->private = mr;

        mr->umem = &umem_odp->umem;
        mr->access = access_flags;
        mr->ibmr.length = length;
        mr->ibmr.iova = iova;
        mr->page_offset = ib_umem_offset(&umem_odp->umem);

        err = rxe_odp_init_pages(mr);
        if (err) {
                ib_umem_odp_release(umem_odp);
                return err;
        }

        mr->state = RXE_MR_STATE_VALID;
        mr->ibmr.type = IB_MR_TYPE_USER;

        return err;
}

drivers/infiniband/sw/rxe/rxe_resp.c

@@ -649,6 +649,10 @@ static enum resp_states process_flush(struct rxe_qp *qp,
        struct rxe_mr *mr = qp->resp.mr;
        struct resp_res *res = qp->resp.res;

        /* ODP is not supported right now. WIP. */
        if (mr->umem->is_odp)
                return RESPST_ERR_UNSUPPORTED_OPCODE;

        /* oA19-14, oA19-15 */
        if (res && res->replay)
                return RESPST_ACKNOWLEDGE;

@@ -702,6 +706,9 @@ static enum resp_states atomic_reply(struct rxe_qp *qp,
        if (!res->replay) {
                u64 iova = qp->resp.va + qp->resp.offset;

                if (mr->umem->is_odp)
                        err = RESPST_ERR_UNSUPPORTED_OPCODE;
                else
                        err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode,
                                                  atmeth_comp(pkt),
                                                  atmeth_swap_add(pkt),

drivers/infiniband/sw/rxe/rxe_verbs.c

@@ -1298,6 +1298,9 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start,
        mr->ibmr.pd = ibpd;
        mr->ibmr.device = ibpd->device;

        if (access & IB_ACCESS_ON_DEMAND)
                err = rxe_odp_mr_init_user(rxe, start, length, iova, access, mr);
        else
                err = rxe_mr_init_user(rxe, start, length, access, mr);

        if (err) {
                rxe_dbg_mr(mr, "reg_user_mr failed, err = %d\n", err);