RDMA/umem: Store ODP access mask information in PFN

In preparation for removing dma_list, store the access mask in the PFN
entry instead of in the dma_addr_t.

Tested-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
This commit is contained in:
Leon Romanovsky 2025-04-28 12:22:18 +03:00 committed by Leon Romanovsky
parent 8cad471305
commit eedd5b1276
5 changed files with 69 additions and 98 deletions

View file

@ -296,22 +296,11 @@ EXPORT_SYMBOL(ib_umem_odp_release);
static int ib_umem_odp_map_dma_single_page(
struct ib_umem_odp *umem_odp,
unsigned int dma_index,
struct page *page,
u64 access_mask)
struct page *page)
{
struct ib_device *dev = umem_odp->umem.ibdev;
dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
if (*dma_addr) {
/*
* If the page is already dma mapped it means it went through
* a non-invalidating transition, like read-only to writable.
* Resync the flags.
*/
*dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
return 0;
}
*dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
DMA_BIDIRECTIONAL);
if (ib_dma_mapping_error(dev, *dma_addr)) {
@ -319,7 +308,6 @@ static int ib_umem_odp_map_dma_single_page(
return -EFAULT;
}
umem_odp->npages++;
*dma_addr |= access_mask;
return 0;
}
@ -355,9 +343,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
struct hmm_range range = {};
unsigned long timeout;
if (access_mask == 0)
return -EINVAL;
if (user_virt < ib_umem_start(umem_odp) ||
user_virt + bcnt > ib_umem_end(umem_odp))
return -EFAULT;
@ -383,7 +368,7 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
if (fault) {
range.default_flags = HMM_PFN_REQ_FAULT;
if (access_mask & ODP_WRITE_ALLOWED_BIT)
if (access_mask & HMM_PFN_WRITE)
range.default_flags |= HMM_PFN_REQ_WRITE;
}
@ -415,22 +400,17 @@ retry:
for (pfn_index = 0; pfn_index < num_pfns;
pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {
if (fault) {
/*
* Since we asked for hmm_range_fault() to populate
* pages it shouldn't return an error entry on success.
*/
WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
} else {
if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
WARN_ON(umem_odp->dma_list[dma_index]);
continue;
}
access_mask = ODP_READ_ALLOWED_BIT;
if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
access_mask |= ODP_WRITE_ALLOWED_BIT;
}
/*
* Since we asked for hmm_range_fault() to populate
* pages it shouldn't return an error entry on success.
*/
WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
continue;
if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
continue;
hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
/* If a hugepage was detected and ODP wasn't set for, the umem
@ -445,13 +425,14 @@ retry:
}
ret = ib_umem_odp_map_dma_single_page(
umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
access_mask);
umem_odp, dma_index,
hmm_pfn_to_page(range.hmm_pfns[pfn_index]));
if (ret < 0) {
ibdev_dbg(umem_odp->umem.ibdev,
"ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
break;
}
range.hmm_pfns[pfn_index] |= HMM_PFN_DMA_MAPPED;
}
/* upon success lock should stay on hold for the callee */
if (!ret)
@ -471,7 +452,6 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
u64 bound)
{
dma_addr_t dma_addr;
dma_addr_t dma;
int idx;
u64 addr;
@ -482,34 +462,37 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
virt = max_t(u64, virt, ib_umem_start(umem_odp));
bound = min_t(u64, bound, ib_umem_end(umem_odp));
for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >>
PAGE_SHIFT;
struct page *page =
hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
dma = umem_odp->dma_list[idx];
/* A mapped entry always has access flags set, so it is non-zero even if the DMA address itself is 0 */
if (dma) {
unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID))
goto clear;
if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_DMA_MAPPED))
goto clear;
dma_addr = dma & ODP_DMA_ADDR_MASK;
ib_dma_unmap_page(dev, dma_addr,
BIT(umem_odp->page_shift),
DMA_BIDIRECTIONAL);
if (dma & ODP_WRITE_ALLOWED_BIT) {
struct page *head_page = compound_head(page);
/*
* set_page_dirty prefers being called with
* the page lock. However, MMU notifiers are
* called sometimes with and sometimes without
* the lock. We rely on the umem_mutex instead
* to prevent other mmu notifiers from
* continuing and allowing the page mapping to
* be removed.
*/
set_page_dirty(head_page);
}
umem_odp->dma_list[idx] = 0;
umem_odp->npages--;
ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift),
DMA_BIDIRECTIONAL);
if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) {
struct page *head_page = compound_head(page);
/*
* set_page_dirty prefers being called with
* the page lock. However, MMU notifiers are
* called sometimes with and sometimes without
* the lock. We rely on the umem_mutex instead
* to prevent other mmu notifiers from
* continuing and allowing the page mapping to
* be removed.
*/
set_page_dirty(head_page);
}
umem_odp->npages--;
clear:
umem_odp->pfn_list[pfn_idx] &= ~HMM_PFN_FLAGS;
}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);

View file

@ -351,6 +351,7 @@ struct mlx5_ib_flow_db {
#define MLX5_IB_UPD_XLT_PD BIT(4)
#define MLX5_IB_UPD_XLT_ACCESS BIT(5)
#define MLX5_IB_UPD_XLT_INDIRECT BIT(6)
#define MLX5_IB_UPD_XLT_DOWNGRADE BIT(7)
/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
*

View file

@ -34,6 +34,7 @@
#include <linux/kernel.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <linux/hmm.h>
#include "mlx5_ib.h"
#include "cmd.h"
@ -158,22 +159,12 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
}
}
/*
 * Convert an ODP dma_list entry into an MTT entry.
 *
 * The address part of the entry is kept (entry & ODP_DMA_ADDR_MASK) and the
 * ODP_READ_ALLOWED_BIT / ODP_WRITE_ALLOWED_BIT flags carried in the entry are
 * translated into MLX5_IB_MTT_READ / MLX5_IB_MTT_WRITE respectively.
 *
 * Returns the MTT entry in CPU byte order; the caller converts it with
 * cpu_to_be64() before storing it.
 */
static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
{
/* Strip the permission flag bits, leaving only the address bits. */
u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
if (umem_dma & ODP_READ_ALLOWED_BIT)
mtt_entry |= MLX5_IB_MTT_READ;
if (umem_dma & ODP_WRITE_ALLOWED_BIT)
mtt_entry |= MLX5_IB_MTT_WRITE;
return mtt_entry;
}
static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
struct mlx5_ib_mr *mr, int flags)
{
struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE;
unsigned long pfn;
dma_addr_t pa;
size_t i;
@ -181,8 +172,17 @@ static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
return;
for (i = 0; i < nentries; i++) {
pfn = odp->pfn_list[idx + i];
if (!(pfn & HMM_PFN_VALID))
/* ODP initialization */
continue;
pa = odp->dma_list[idx + i];
pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
pa |= MLX5_IB_MTT_READ;
if ((pfn & HMM_PFN_WRITE) && !downgrade)
pa |= MLX5_IB_MTT_WRITE;
pas[i] = cpu_to_be64(pa);
}
}
@ -303,8 +303,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
* estimate the cost of another UMR vs. the cost of bigger
* UMR.
*/
if (umem_odp->dma_list[idx] &
(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
if (umem_odp->pfn_list[idx] & HMM_PFN_VALID) {
if (!in_block) {
blk_start_idx = idx;
in_block = 1;
@ -687,7 +686,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
{
int page_shift, ret, np;
bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
u64 access_mask;
u64 access_mask = 0;
u64 start_idx;
bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
@ -695,12 +694,14 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
if (flags & MLX5_PF_FLAGS_ENABLE)
xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
if (flags & MLX5_PF_FLAGS_DOWNGRADE)
xlt_flags |= MLX5_IB_UPD_XLT_DOWNGRADE;
page_shift = odp->page_shift;
start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
access_mask = ODP_READ_ALLOWED_BIT;
if (odp->umem.writable && !downgrade)
access_mask |= ODP_WRITE_ALLOWED_BIT;
access_mask |= HMM_PFN_WRITE;
np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
if (np < 0)

View file

@ -27,7 +27,7 @@ static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni,
start = max_t(u64, ib_umem_start(umem_odp), range->start);
end = min_t(u64, ib_umem_end(umem_odp), range->end);
/* update umem_odp->dma_list */
/* update umem_odp->map.pfn_list */
ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
mutex_unlock(&umem_odp->umem_mutex);
@ -45,12 +45,11 @@ static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcn
{
struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT);
u64 access_mask;
u64 access_mask = 0;
int np;
access_mask = ODP_READ_ALLOWED_BIT;
if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY))
access_mask |= ODP_WRITE_ALLOWED_BIT;
access_mask |= HMM_PFN_WRITE;
/*
* ib_umem_odp_map_dma_and_lock() locks umem_mutex on success.
@ -138,7 +137,7 @@ static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp,
while (addr < iova + length) {
idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
if (!(umem_odp->dma_list[idx] & perm)) {
if (!(umem_odp->map.pfn_list[idx] & perm)) {
need_fault = true;
break;
}
@ -162,15 +161,14 @@ static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u
{
struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
bool need_fault;
u64 perm;
u64 perm = 0;
int err;
if (unlikely(length < 1))
return -EINVAL;
perm = ODP_READ_ALLOWED_BIT;
if (!(flags & RXE_PAGEFAULT_RDONLY))
perm |= ODP_WRITE_ALLOWED_BIT;
perm |= HMM_PFN_WRITE;
mutex_lock(&umem_odp->umem_mutex);

View file

@ -8,6 +8,7 @@
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>
#include <linux/hmm.h>
struct ib_umem_odp {
struct ib_umem umem;
@ -67,19 +68,6 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
umem_odp->page_shift;
}
/*
* The lower 2 bits of the DMA address signal the R/W permissions for
* the entry. To upgrade the permissions, provide the appropriate
* bitmask to the map_dma_pages function.
*
* Be aware that upgrading a mapped address might result in change of
* the DMA address for the page.
*/
#define ODP_READ_ALLOWED_BIT (1<<0ULL)
#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)
#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_umem_odp *