mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-21 06:50:25 +00:00

RDMA/umem: Store ODP access mask information in PFN

In preparation for removing dma_list, store the access mask in the PFN
entry rather than in the low bits of dma_addr_t.

Tested-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>

parent 8cad471305
commit eedd5b1276

5 changed files with 69 additions and 98 deletions
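The core of the change in one standalone sketch: permission and mapping state move out of the low bits of each dma_list entry and into flag bits of the pfn_list entry. The DEMO_* masks below are invented stand-ins for the kernel's ODP_*/HMM_PFN_* bits, chosen only for illustration:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins only; the real flags are the HMM_PFN_* bits
 * from <linux/hmm.h>, which occupy the high bits of a pfn entry. */
#define DEMO_PFN_VALID      (1ULL << 63)
#define DEMO_PFN_WRITE      (1ULL << 62)
#define DEMO_PFN_DMA_MAPPED (1ULL << 61)

/* Old scheme: R/W bits occupied the two low bits of dma_addr_t. */
#define DEMO_ODP_READ_BIT   1ULL
#define DEMO_ODP_WRITE_BIT  2ULL
#define DEMO_ODP_ADDR_MASK  (~(DEMO_ODP_READ_BIT | DEMO_ODP_WRITE_BIT))

int main(void)
{
	uint64_t old_entry = 0x100000ULL | DEMO_ODP_READ_BIT | DEMO_ODP_WRITE_BIT;
	uint64_t new_pfn   = 0x1234ULL | DEMO_PFN_VALID | DEMO_PFN_WRITE |
			     DEMO_PFN_DMA_MAPPED;

	/* Before: the DMA address had to be masked before every use. */
	printf("old: dma=%#llx writable=%d\n",
	       (unsigned long long)(old_entry & DEMO_ODP_ADDR_MASK),
	       !!(old_entry & DEMO_ODP_WRITE_BIT));

	/* After: dma_list holds a bare address; state lives in the pfn. */
	printf("new: writable=%d mapped=%d\n",
	       !!(new_pfn & DEMO_PFN_WRITE),
	       !!(new_pfn & DEMO_PFN_DMA_MAPPED));
	return 0;
}

The payoff is that dma_list entries no longer need masking before use, which is what clears the way for the dma_list removal the commit message points toward.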
drivers/infiniband/core/umem_odp.c

@@ -296,22 +296,11 @@ EXPORT_SYMBOL(ib_umem_odp_release);
 static int ib_umem_odp_map_dma_single_page(
 		struct ib_umem_odp *umem_odp,
 		unsigned int dma_index,
-		struct page *page,
-		u64 access_mask)
+		struct page *page)
 {
 	struct ib_device *dev = umem_odp->umem.ibdev;
 	dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
 
-	if (*dma_addr) {
-		/*
-		 * If the page is already dma mapped it means it went through
-		 * a non-invalidating trasition, like read-only to writable.
-		 * Resync the flags.
-		 */
-		*dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
-		return 0;
-	}
-
 	*dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
 				    DMA_BIDIRECTIONAL);
 	if (ib_dma_mapping_error(dev, *dma_addr)) {
@@ -319,7 +308,6 @@ static int ib_umem_odp_map_dma_single_page(
 		return -EFAULT;
 	}
 	umem_odp->npages++;
-	*dma_addr |= access_mask;
 	return 0;
 }
@@ -355,9 +343,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
 	struct hmm_range range = {};
 	unsigned long timeout;
 
-	if (access_mask == 0)
-		return -EINVAL;
-
 	if (user_virt < ib_umem_start(umem_odp) ||
 	    user_virt + bcnt > ib_umem_end(umem_odp))
 		return -EFAULT;
@@ -383,7 +368,7 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
 	if (fault) {
 		range.default_flags = HMM_PFN_REQ_FAULT;
 
-		if (access_mask & ODP_WRITE_ALLOWED_BIT)
+		if (access_mask & HMM_PFN_WRITE)
 			range.default_flags |= HMM_PFN_REQ_WRITE;
 	}
@@ -415,22 +400,17 @@ retry:
 	for (pfn_index = 0; pfn_index < num_pfns;
 		pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {
 
-		if (fault) {
-			/*
-			 * Since we asked for hmm_range_fault() to populate
-			 * pages it shouldn't return an error entry on success.
-			 */
-			WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
-			WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
-		} else {
-			if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
-				WARN_ON(umem_odp->dma_list[dma_index]);
-				continue;
-			}
-			access_mask = ODP_READ_ALLOWED_BIT;
-			if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
-				access_mask |= ODP_WRITE_ALLOWED_BIT;
-		}
+		/*
+		 * Since we asked for hmm_range_fault() to populate
+		 * pages it shouldn't return an error entry on success.
+		 */
+		WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
+		WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
+		if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
+			continue;
+
+		if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
+			continue;
 
 		hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
 		/* If a hugepage was detected and ODP wasn't set for, the umem
@@ -445,13 +425,14 @@ retry:
 		}
 
 		ret = ib_umem_odp_map_dma_single_page(
-				umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
-				access_mask);
+				umem_odp, dma_index,
+				hmm_pfn_to_page(range.hmm_pfns[pfn_index]));
 		if (ret < 0) {
 			ibdev_dbg(umem_odp->umem.ibdev,
 				  "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
 			break;
 		}
+		range.hmm_pfns[pfn_index] |= HMM_PFN_DMA_MAPPED;
 	}
 	/* upon success lock should stay on hold for the callee */
 	if (!ret)
@@ -471,7 +452,6 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 				 u64 bound)
 {
-	dma_addr_t dma_addr;
 	dma_addr_t dma;
 	int idx;
 	u64 addr;
@@ -482,34 +462,37 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 	virt = max_t(u64, virt, ib_umem_start(umem_odp));
 	bound = min_t(u64, bound, ib_umem_end(umem_odp));
 	for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
-		unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >>
-					PAGE_SHIFT;
-		struct page *page =
-			hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
-
 		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
 		dma = umem_odp->dma_list[idx];
 
-		/* The access flags guaranteed a valid DMA address in case was NULL */
-		if (dma) {
-			dma_addr = dma & ODP_DMA_ADDR_MASK;
-			ib_dma_unmap_page(dev, dma_addr,
-					  BIT(umem_odp->page_shift),
-					  DMA_BIDIRECTIONAL);
-			if (dma & ODP_WRITE_ALLOWED_BIT) {
-				struct page *head_page = compound_head(page);
-				/*
-				 * set_page_dirty prefers being called with
-				 * the page lock. However, MMU notifiers are
-				 * called sometimes with and sometimes without
-				 * the lock. We rely on the umem_mutex instead
-				 * to prevent other mmu notifiers from
-				 * continuing and allowing the page mapping to
-				 * be removed.
-				 */
-				set_page_dirty(head_page);
-			}
-			umem_odp->dma_list[idx] = 0;
-			umem_odp->npages--;
-		}
+		unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
+		struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
+
+		if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID))
+			goto clear;
+		if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_DMA_MAPPED))
+			goto clear;
+
+		ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift),
+				  DMA_BIDIRECTIONAL);
+		if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) {
+			struct page *head_page = compound_head(page);
+
+			/*
+			 * set_page_dirty prefers being called with
+			 * the page lock. However, MMU notifiers are
+			 * called sometimes with and sometimes without
+			 * the lock. We rely on the umem_mutex instead
+			 * to prevent other mmu notifiers from
+			 * continuing and allowing the page mapping to
+			 * be removed.
+			 */
+			set_page_dirty(head_page);
+		}
+		umem_odp->npages--;
+clear:
+		umem_odp->pfn_list[pfn_idx] &= ~HMM_PFN_FLAGS;
 	}
 }
 EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
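A minimal sketch of the new unmap bookkeeping, under the same DEMO_* stand-in flags (not the kernel's values): an entry is DMA-unmapped only when its pfn is both valid and marked mapped, and the flag bits are cleared either way, mirroring the goto clear path above.

#include <stdbool.h>
#include <stdint.h>

#define DEMO_PFN_VALID      (1ULL << 63) /* stand-in for HMM_PFN_VALID */
#define DEMO_PFN_DMA_MAPPED (1ULL << 61) /* stand-in for HMM_PFN_DMA_MAPPED */
#define DEMO_PFN_FLAGS      (DEMO_PFN_VALID | DEMO_PFN_DMA_MAPPED)

/* Mirrors the two goto-clear checks in ib_umem_odp_unmap_dma_pages():
 * only a valid, previously mapped entry needs ib_dma_unmap_page(). */
static bool demo_needs_unmap(uint64_t pfn)
{
	return (pfn & DEMO_PFN_VALID) && (pfn & DEMO_PFN_DMA_MAPPED);
}

/* Loop epilogue: flag bits are cleared whether or not we unmapped. */
static uint64_t demo_clear_flags(uint64_t pfn)
{
	return pfn & ~DEMO_PFN_FLAGS;
}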
drivers/infiniband/hw/mlx5/mlx5_ib.h

@@ -351,6 +351,7 @@ struct mlx5_ib_flow_db {
 #define MLX5_IB_UPD_XLT_PD	      BIT(4)
 #define MLX5_IB_UPD_XLT_ACCESS	      BIT(5)
 #define MLX5_IB_UPD_XLT_INDIRECT      BIT(6)
+#define MLX5_IB_UPD_XLT_DOWNGRADE     BIT(7)
 
 /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
  *
drivers/infiniband/hw/mlx5/odp.c

@@ -34,6 +34,7 @@
 #include <linux/kernel.h>
 #include <linux/dma-buf.h>
 #include <linux/dma-resv.h>
+#include <linux/hmm.h>
 
 #include "mlx5_ib.h"
 #include "cmd.h"
@@ -158,22 +159,12 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
 	}
 }
 
-static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
-{
-	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
-
-	if (umem_dma & ODP_READ_ALLOWED_BIT)
-		mtt_entry |= MLX5_IB_MTT_READ;
-	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
-		mtt_entry |= MLX5_IB_MTT_WRITE;
-
-	return mtt_entry;
-}
-
 static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
 			 struct mlx5_ib_mr *mr, int flags)
 {
 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+	bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE;
+	unsigned long pfn;
 	dma_addr_t pa;
 	size_t i;
 
@@ -181,8 +172,17 @@ static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
 		return;
 
 	for (i = 0; i < nentries; i++) {
+		pfn = odp->pfn_list[idx + i];
+		if (!(pfn & HMM_PFN_VALID))
+			/* ODP initialization */
+			continue;
+
 		pa = odp->dma_list[idx + i];
-		pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
+		pa |= MLX5_IB_MTT_READ;
+		if ((pfn & HMM_PFN_WRITE) && !downgrade)
+			pa |= MLX5_IB_MTT_WRITE;
+
+		pas[i] = cpu_to_be64(pa);
 	}
 }
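For clarity, the replacement for umem_dma_to_mtt() distilled into a hedged sketch (the DEMO_* names are stand-ins for MLX5_IB_MTT_READ/WRITE and HMM_PFN_WRITE): read is granted unconditionally, write only when the pfn carries the write flag and the XLT update is not a downgrade.

#include <stdbool.h>
#include <stdint.h>

#define DEMO_PFN_WRITE (1ULL << 62) /* stand-in for HMM_PFN_WRITE */
#define DEMO_MTT_READ  1ULL         /* stand-in for MLX5_IB_MTT_READ */
#define DEMO_MTT_WRITE 2ULL         /* stand-in for MLX5_IB_MTT_WRITE */

/* Unlike the old umem_dma_to_mtt(), dma is already a bare address, so
 * no ODP_DMA_ADDR_MASK step is needed; permissions come from the pfn. */
static uint64_t demo_mtt_entry(uint64_t dma, uint64_t pfn, bool downgrade)
{
	uint64_t pa = dma | DEMO_MTT_READ;

	if ((pfn & DEMO_PFN_WRITE) && !downgrade)
		pa |= DEMO_MTT_WRITE;
	return pa;
}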
@@ -303,8 +303,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
 		 * estimate the cost of another UMR vs. the cost of bigger
 		 * UMR.
 		 */
-		if (umem_odp->dma_list[idx] &
-		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+		if (umem_odp->pfn_list[idx] & HMM_PFN_VALID) {
 			if (!in_block) {
 				blk_start_idx = idx;
 				in_block = 1;
@@ -687,7 +686,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
 {
 	int page_shift, ret, np;
 	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
-	u64 access_mask;
+	u64 access_mask = 0;
 	u64 start_idx;
 	bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
 	u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
@@ -695,12 +694,14 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
 	if (flags & MLX5_PF_FLAGS_ENABLE)
 		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
 
+	if (flags & MLX5_PF_FLAGS_DOWNGRADE)
+		xlt_flags |= MLX5_IB_UPD_XLT_DOWNGRADE;
+
 	page_shift = odp->page_shift;
 	start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
-	access_mask = ODP_READ_ALLOWED_BIT;
 
 	if (odp->umem.writable && !downgrade)
-		access_mask |= ODP_WRITE_ALLOWED_BIT;
+		access_mask |= HMM_PFN_WRITE;
 
 	np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
 	if (np < 0)
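The mask-building convention that both the mlx5 and rxe fault paths now follow, sketched with a stand-in flag: read access is implicit, so the mask starts empty and can only ever gain the write bit.

#include <stdint.h>

#define DEMO_PFN_WRITE (1ULL << 62) /* stand-in for HMM_PFN_WRITE */

/* Shared pattern of pagefault_real_mr() and the rxe fault helpers:
 * no more ODP_READ_ALLOWED_BIT seeding; only write is ever requested. */
static uint64_t demo_access_mask(int writable, int downgrade)
{
	uint64_t mask = 0;

	if (writable && !downgrade)
		mask |= DEMO_PFN_WRITE;
	return mask;
}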
drivers/infiniband/sw/rxe/rxe_odp.c

@@ -27,7 +27,7 @@ static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni,
 	start = max_t(u64, ib_umem_start(umem_odp), range->start);
 	end = min_t(u64, ib_umem_end(umem_odp), range->end);
 
-	/* update umem_odp->dma_list */
+	/* update umem_odp->map.pfn_list */
 	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 
 	mutex_unlock(&umem_odp->umem_mutex);
@@ -45,12 +45,11 @@ static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcnt,
 {
 	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
 	bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT);
-	u64 access_mask;
+	u64 access_mask = 0;
 	int np;
 
-	access_mask = ODP_READ_ALLOWED_BIT;
 	if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY))
-		access_mask |= ODP_WRITE_ALLOWED_BIT;
+		access_mask |= HMM_PFN_WRITE;
 
 	/*
 	 * ib_umem_odp_map_dma_and_lock() locks umem_mutex on success.
@@ -138,7 +137,7 @@ static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp,
 	while (addr < iova + length) {
 		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
 
-		if (!(umem_odp->dma_list[idx] & perm)) {
+		if (!(umem_odp->map.pfn_list[idx] & perm)) {
 			need_fault = true;
 			break;
 		}
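The permission test itself reduces to a flag check on the pfn entry; a small sketch under the same stand-in masks:

#include <stdbool.h>
#include <stdint.h>

#define DEMO_PFN_WRITE (1ULL << 62) /* stand-in for HMM_PFN_WRITE */

/* Mirrors rxe_check_pagefault(): if none of the required permission
 * bits are present in the pfn entry, the range must be faulted in. */
static bool demo_need_fault(uint64_t pfn, uint64_t perm)
{
	return !(pfn & perm);
}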
@@ -162,15 +161,14 @@ static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u32 flags)
 {
 	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
 	bool need_fault;
-	u64 perm;
+	u64 perm = 0;
 	int err;
 
 	if (unlikely(length < 1))
 		return -EINVAL;
 
-	perm = ODP_READ_ALLOWED_BIT;
 	if (!(flags & RXE_PAGEFAULT_RDONLY))
-		perm |= ODP_WRITE_ALLOWED_BIT;
+		perm |= HMM_PFN_WRITE;
 
 	mutex_lock(&umem_odp->umem_mutex);
include/rdma/ib_umem_odp.h

@@ -8,6 +8,7 @@
 
 #include <rdma/ib_umem.h>
 #include <rdma/ib_verbs.h>
+#include <linux/hmm.h>
 
 struct ib_umem_odp {
 	struct ib_umem umem;
@@ -67,19 +68,6 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
 		umem_odp->page_shift;
 }
 
-/*
- * The lower 2 bits of the DMA address signal the R/W permissions for
- * the entry. To upgrade the permissions, provide the appropriate
- * bitmask to the map_dma_pages function.
- *
- * Be aware that upgrading a mapped address might result in change of
- * the DMA address for the page.
- */
-#define ODP_READ_ALLOWED_BIT  (1<<0ULL)
-#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)
-
-#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
-
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 
 struct ib_umem_odp *