RDMA/core: Convert UMEM ODP DMA mapping to caching IOVA and page linkage
Reuse newly added DMA API to cache IOVA and only link/unlink pages in fast path for UMEM ODP flow.

Tested-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
parent eedd5b1276
commit 1efe8c0670

6 changed files with 74 additions and 116 deletions
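At a high level, the conversion replaces the per-page ib_dma_map_page()/ib_dma_unmap_page() calls and the separate dma_list array with the hmm-dma helpers: the IOVA is cached once per umem, and individual pages are only linked and unlinked in the fault and invalidation fast paths. A minimal sketch of that lifecycle, using only the calls that appear in this diff (the odp_sketch_* wrappers are hypothetical, kernel context is assumed, and error handling is trimmed):

#include <linux/device.h>
#include <linux/hmm.h>
#include <linux/hmm-dma.h>
#include <linux/pci-p2pdma.h>

/* One-time setup: allocate the pfn list and cache the IOVA for the whole range. */
static int odp_sketch_init(struct device *dma_dev, struct hmm_dma_map *map,
                           size_t nr_pages, size_t page_size)
{
        /* Replaces the old pfn_list/dma_list kvcalloc() pair. */
        return hmm_dma_map_alloc(dma_dev, map, nr_pages, page_size);
}

/* Fault fast path: link one faulted pfn into the cached IOVA. */
static dma_addr_t odp_sketch_link(struct device *dma_dev, struct hmm_dma_map *map,
                                  size_t idx, struct pci_p2pdma_map_state *p2pdma_state)
{
        return hmm_dma_map_pfn(dma_dev, map, idx, p2pdma_state);
}

/* Invalidation fast path: unlink one page; returns true if it was DMA mapped. */
static bool odp_sketch_unlink(struct device *dma_dev, struct hmm_dma_map *map,
                              size_t idx)
{
        return hmm_dma_unmap_pfn(dma_dev, map, idx);
}

/* Teardown: release the pfn list and the cached IOVA. */
static void odp_sketch_fini(struct device *dma_dev, struct hmm_dma_map *map)
{
        hmm_dma_map_free(dma_dev, map);
}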
@@ -41,6 +41,7 @@
 #include <linux/hugetlb.h>
 #include <linux/interval_tree.h>
 #include <linux/hmm.h>
+#include <linux/hmm-dma.h>
 #include <linux/pagemap.h>
 
 #include <rdma/ib_umem_odp.h>
@@ -50,6 +51,7 @@
 static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
                                    const struct mmu_interval_notifier_ops *ops)
 {
+       struct ib_device *dev = umem_odp->umem.ibdev;
        int ret;
 
        umem_odp->umem.is_odp = 1;
@@ -59,7 +61,6 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
                size_t page_size = 1UL << umem_odp->page_shift;
                unsigned long start;
                unsigned long end;
-               size_t ndmas, npfns;
 
                start = ALIGN_DOWN(umem_odp->umem.address, page_size);
                if (check_add_overflow(umem_odp->umem.address,
@@ -70,36 +71,23 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
                if (unlikely(end < page_size))
                        return -EOVERFLOW;
 
-               ndmas = (end - start) >> umem_odp->page_shift;
-               if (!ndmas)
-                       return -EINVAL;
-
-               npfns = (end - start) >> PAGE_SHIFT;
-               umem_odp->pfn_list = kvcalloc(
-                       npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL);
-               if (!umem_odp->pfn_list)
-                       return -ENOMEM;
-
-               umem_odp->dma_list = kvcalloc(
-                       ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL);
-               if (!umem_odp->dma_list) {
-                       ret = -ENOMEM;
-                       goto out_pfn_list;
-               }
+               ret = hmm_dma_map_alloc(dev->dma_device, &umem_odp->map,
+                                       (end - start) >> PAGE_SHIFT,
+                                       1 << umem_odp->page_shift);
+               if (ret)
+                       return ret;
 
                ret = mmu_interval_notifier_insert(&umem_odp->notifier,
                                                   umem_odp->umem.owning_mm,
                                                   start, end - start, ops);
                if (ret)
-                       goto out_dma_list;
+                       goto out_free_map;
        }
 
        return 0;
 
-out_dma_list:
-       kvfree(umem_odp->dma_list);
-out_pfn_list:
-       kvfree(umem_odp->pfn_list);
+out_free_map:
+       hmm_dma_map_free(dev->dma_device, &umem_odp->map);
        return ret;
 }
 
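Note the sizing change in ib_init_umem_odp() above: the old code allocated npfns PAGE_SIZE-granular pfn slots and ndmas device-page-granular DMA slots separately, while hmm_dma_map_alloc() takes the PAGE_SIZE-granular entry count together with the device page size and manages the DMA side internally. A worked example with hypothetical numbers (4 KiB PAGE_SIZE, a 2 MiB ODP range, page_shift of 21); the helper below is not part of the patch and only illustrates the arguments:

#include <linux/hmm-dma.h>
#include <linux/mm.h>
#include <linux/sizes.h>

static int odp_sketch_alloc_example(struct device *dma_dev, struct hmm_dma_map *map)
{
        size_t length = SZ_2M;                          /* ib_umem length           */
        unsigned int page_shift = 21;                   /* umem_odp->page_shift     */
        size_t nr_entries = length >> PAGE_SHIFT;       /* 512 pfn_list entries     */
        size_t entry_size = 1UL << page_shift;          /* one 2 MiB device "page"  */

        /* Old: kvcalloc(512, ...) for pfn_list plus kvcalloc(1, ...) for dma_list. */
        return hmm_dma_map_alloc(dma_dev, map, nr_entries, entry_size);
}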
@@ -262,6 +250,8 @@ EXPORT_SYMBOL(ib_umem_odp_get);
 
 void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
 {
+       struct ib_device *dev = umem_odp->umem.ibdev;
+
        /*
         * Ensure that no more pages are mapped in the umem.
         *
@@ -274,48 +264,17 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
                                            ib_umem_end(umem_odp));
                mutex_unlock(&umem_odp->umem_mutex);
                mmu_interval_notifier_remove(&umem_odp->notifier);
-               kvfree(umem_odp->dma_list);
-               kvfree(umem_odp->pfn_list);
+               hmm_dma_map_free(dev->dma_device, &umem_odp->map);
        }
        put_pid(umem_odp->tgid);
        kfree(umem_odp);
 }
 EXPORT_SYMBOL(ib_umem_odp_release);
 
-/*
- * Map for DMA and insert a single page into the on-demand paging page tables.
- *
- * @umem: the umem to insert the page to.
- * @dma_index: index in the umem to add the dma to.
- * @page: the page struct to map and add.
- * @access_mask: access permissions needed for this page.
- *
- * The function returns -EFAULT if the DMA mapping operation fails.
- *
- */
-static int ib_umem_odp_map_dma_single_page(
-               struct ib_umem_odp *umem_odp,
-               unsigned int dma_index,
-               struct page *page)
-{
-       struct ib_device *dev = umem_odp->umem.ibdev;
-       dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
-
-       *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
-                                   DMA_BIDIRECTIONAL);
-       if (ib_dma_mapping_error(dev, *dma_addr)) {
-               *dma_addr = 0;
-               return -EFAULT;
-       }
-       umem_odp->npages++;
-       return 0;
-}
-
 /**
  * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
  *
  * Maps the range passed in the argument to DMA addresses.
- * The DMA addresses of the mapped pages is updated in umem_odp->dma_list.
  * Upon success the ODP MR will be locked to let caller complete its device
  * page table update.
  *
@@ -372,7 +331,7 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
                        range.default_flags |= HMM_PFN_REQ_WRITE;
        }
 
-       range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]);
+       range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]);
        timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
 
 retry:
@@ -423,16 +382,6 @@ retry:
                                  __func__, hmm_order, page_shift);
                        break;
                }
-
-               ret = ib_umem_odp_map_dma_single_page(
-                               umem_odp, dma_index,
-                               hmm_pfn_to_page(range.hmm_pfns[pfn_index]));
-               if (ret < 0) {
-                       ibdev_dbg(umem_odp->umem.ibdev,
-                                 "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
-                       break;
-               }
-               range.hmm_pfns[pfn_index] |= HMM_PFN_DMA_MAPPED;
        }
        /* upon success lock should stay on hold for the callee */
        if (!ret)
@@ -452,32 +401,23 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
                                  u64 bound)
 {
-       dma_addr_t dma;
-       int idx;
-       u64 addr;
        struct ib_device *dev = umem_odp->umem.ibdev;
+       u64 addr;
 
        lockdep_assert_held(&umem_odp->umem_mutex);
 
        virt = max_t(u64, virt, ib_umem_start(umem_odp));
        bound = min_t(u64, bound, ib_umem_end(umem_odp));
        for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
-               unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >>
-                                       PAGE_SHIFT;
-               struct page *page =
-                       hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
+               u64 offset = addr - ib_umem_start(umem_odp);
+               size_t idx = offset >> umem_odp->page_shift;
+               unsigned long pfn = umem_odp->map.pfn_list[idx];
 
-               idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
-               dma = umem_odp->dma_list[idx];
-
-               if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID))
-                       goto clear;
-               if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_DMA_MAPPED))
+               if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx))
                        goto clear;
 
-               ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift),
-                                 DMA_BIDIRECTIONAL);
-               if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) {
+               if (pfn & HMM_PFN_WRITE) {
+                       struct page *page = hmm_pfn_to_page(pfn);
                        struct page *head_page = compound_head(page);
                        /*
                         * set_page_dirty prefers being called with
@@ -492,7 +432,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
                }
                umem_odp->npages--;
 clear:
-               umem_odp->pfn_list[pfn_idx] &= ~HMM_PFN_FLAGS;
+               umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS;
        }
 }
 EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
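The invalidation path in ib_umem_odp_unmap_dma_pages() now derives everything from the pfn entry: hmm_dma_unmap_pfn() both unlinks the page from the cached IOVA and reports whether it had actually been mapped, so dirtying and the npages accounting only happen for mapped entries. A condensed sketch of the per-entry step (hypothetical wrapper; the real code above additionally documents the set_page_dirty() locking rules):

#include <linux/hmm.h>
#include <linux/hmm-dma.h>
#include <linux/mm.h>
#include <rdma/ib_umem_odp.h>

static void odp_sketch_invalidate_one(struct ib_umem_odp *umem_odp, size_t idx)
{
        struct ib_device *dev = umem_odp->umem.ibdev;
        unsigned long pfn = umem_odp->map.pfn_list[idx];

        /* Unlink from the cached IOVA; false means the entry was never DMA mapped. */
        if (hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx)) {
                if (pfn & HMM_PFN_WRITE)
                        set_page_dirty(compound_head(hmm_pfn_to_page(pfn)));
                umem_odp->npages--;
        }

        /* Always clear the HMM flag bits for this entry. */
        umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS;
}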
@@ -1474,8 +1474,8 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
 int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev);
-void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
-                          struct mlx5_ib_mr *mr, int flags);
+int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+                         struct mlx5_ib_mr *mr, int flags);
 
 int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
                               enum ib_uverbs_advise_mr_advice advice,
@@ -1496,8 +1496,11 @@ static inline int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
 {
        return 0;
 }
-static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
-                                        struct mlx5_ib_mr *mr, int flags) {}
+static inline int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+                                       struct mlx5_ib_mr *mr, int flags)
+{
+       return -EOPNOTSUPP;
+}
 
 static inline int
 mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
@@ -35,6 +35,8 @@
 #include <linux/dma-buf.h>
 #include <linux/dma-resv.h>
 #include <linux/hmm.h>
+#include <linux/hmm-dma.h>
+#include <linux/pci-p2pdma.h>
 
 #include "mlx5_ib.h"
 #include "cmd.h"
@@ -159,40 +161,50 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
        }
 }
 
-static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
-                        struct mlx5_ib_mr *mr, int flags)
+static int populate_mtt(__be64 *pas, size_t start, size_t nentries,
+                       struct mlx5_ib_mr *mr, int flags)
 {
        struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
        bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE;
-       unsigned long pfn;
-       dma_addr_t pa;
+       struct pci_p2pdma_map_state p2pdma_state = {};
+       struct ib_device *dev = odp->umem.ibdev;
        size_t i;
 
        if (flags & MLX5_IB_UPD_XLT_ZAP)
-               return;
+               return 0;
 
        for (i = 0; i < nentries; i++) {
-               pfn = odp->pfn_list[idx + i];
+               unsigned long pfn = odp->map.pfn_list[start + i];
+               dma_addr_t dma_addr;
+
+               pfn = odp->map.pfn_list[start + i];
                if (!(pfn & HMM_PFN_VALID))
                        /* ODP initialization */
                        continue;
 
-               pa = odp->dma_list[idx + i];
-               pa |= MLX5_IB_MTT_READ;
-               if ((pfn & HMM_PFN_WRITE) && !downgrade)
-                       pa |= MLX5_IB_MTT_WRITE;
+               dma_addr = hmm_dma_map_pfn(dev->dma_device, &odp->map,
+                                          start + i, &p2pdma_state);
+               if (ib_dma_mapping_error(dev, dma_addr))
+                       return -EFAULT;
 
-               pas[i] = cpu_to_be64(pa);
+               dma_addr |= MLX5_IB_MTT_READ;
+               if ((pfn & HMM_PFN_WRITE) && !downgrade)
+                       dma_addr |= MLX5_IB_MTT_WRITE;
+
+               pas[i] = cpu_to_be64(dma_addr);
+               odp->npages++;
        }
+       return 0;
 }
 
-void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
-                          struct mlx5_ib_mr *mr, int flags)
+int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+                         struct mlx5_ib_mr *mr, int flags)
 {
        if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
                populate_klm(xlt, idx, nentries, mr, flags);
+               return 0;
        } else {
-               populate_mtt(xlt, idx, nentries, mr, flags);
+               return populate_mtt(xlt, idx, nentries, mr, flags);
        }
 }
 
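With the mlx5 side converted, populate_mtt() both links the page and builds the XLT entry, so a mapping failure (for example an unsupported P2P page) can now be reported instead of silently producing a stale entry. A condensed sketch of the new per-entry fast path (hypothetical helper that mirrors the loop body above; not the literal driver code):

#include <linux/hmm.h>
#include <linux/hmm-dma.h>
#include <linux/pci-p2pdma.h>
#include <rdma/ib_umem_odp.h>
#include <rdma/ib_verbs.h>
#include "mlx5_ib.h"        /* MLX5_IB_MTT_READ/WRITE, driver-local header */

static int odp_sketch_populate_one(struct ib_umem_odp *odp, __be64 *pas,
                                   size_t start, size_t i, bool downgrade,
                                   struct pci_p2pdma_map_state *p2pdma_state)
{
        struct ib_device *dev = odp->umem.ibdev;
        unsigned long pfn = odp->map.pfn_list[start + i];
        dma_addr_t dma_addr;

        if (!(pfn & HMM_PFN_VALID))
                return 0;       /* not faulted in yet, leave the entry alone */

        /* Link the page into the IOVA range cached at MR creation time. */
        dma_addr = hmm_dma_map_pfn(dev->dma_device, &odp->map, start + i,
                                   p2pdma_state);
        if (ib_dma_mapping_error(dev, dma_addr))
                return -EFAULT;

        /* The low bits of the MTT entry carry the access permissions. */
        dma_addr |= MLX5_IB_MTT_READ;
        if ((pfn & HMM_PFN_WRITE) && !downgrade)
                dma_addr |= MLX5_IB_MTT_WRITE;

        pas[i] = cpu_to_be64(dma_addr);
        odp->npages++;
        return 0;
}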
@@ -303,7 +315,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
                 * estimate the cost of another UMR vs. the cost of bigger
                 * UMR.
                 */
-               if (umem_odp->pfn_list[idx] & HMM_PFN_VALID) {
+               if (umem_odp->map.pfn_list[idx] & HMM_PFN_VALID) {
                        if (!in_block) {
                                blk_start_idx = idx;
                                in_block = 1;
@@ -840,7 +840,17 @@ int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
                size_to_map = npages * desc_size;
                dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
                                        DMA_TO_DEVICE);
-               mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
+               /*
+                * npages is the maximum number of pages to map, but we
+                * can't guarantee that all pages are actually mapped.
+                *
+                * For example, if page is p2p of type which is not supported
+                * for mapping, the number of pages mapped will be less than
+                * requested.
+                */
+               err = mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
+               if (err)
+                       return err;
                dma_sync_single_for_device(ddev, sg.addr, sg.length,
                                           DMA_TO_DEVICE);
                sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT);
@@ -205,7 +205,7 @@ static int __rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
        while (length > 0) {
                u8 *src, *dest;
 
-               page = hmm_pfn_to_page(umem_odp->pfn_list[idx]);
+               page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]);
                user_va = kmap_local_page(page);
                if (!user_va)
                        return -EFAULT;
@@ -289,7 +289,7 @@ static enum resp_states rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova,
 
        idx = rxe_odp_iova_to_index(umem_odp, iova);
        page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
-       page = hmm_pfn_to_page(umem_odp->pfn_list[idx]);
+       page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]);
        if (!page)
                return RESPST_ERR_RKEY_VIOLATION;
 
@@ -355,7 +355,7 @@ int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
                index = rxe_odp_iova_to_index(umem_odp, iova);
                page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
 
-               page = hmm_pfn_to_page(umem_odp->pfn_list[index]);
+               page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]);
                if (!page) {
                        mutex_unlock(&umem_odp->umem_mutex);
                        return -EFAULT;
@@ -401,7 +401,7 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
 
        page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
        index = rxe_odp_iova_to_index(umem_odp, iova);
-       page = hmm_pfn_to_page(umem_odp->pfn_list[index]);
+       page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]);
        if (!page) {
                mutex_unlock(&umem_odp->umem_mutex);
                return RESPST_ERR_RKEY_VIOLATION;
@@ -8,24 +8,17 @@
 
 #include <rdma/ib_umem.h>
 #include <rdma/ib_verbs.h>
-#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
 
 struct ib_umem_odp {
        struct ib_umem umem;
        struct mmu_interval_notifier notifier;
        struct pid *tgid;
 
-       /* An array of the pfns included in the on-demand paging umem. */
-       unsigned long *pfn_list;
+       struct hmm_dma_map map;
 
-       /*
-        * An array with DMA addresses mapped for pfns in pfn_list.
-        * The lower two bits designate access permissions.
-        * See ODP_READ_ALLOWED_BIT and ODP_WRITE_ALLOWED_BIT.
-        */
-       dma_addr_t *dma_list;
        /*
-        * The umem_mutex protects the page_list and dma_list fields of an ODP
+        * The umem_mutex protects the page_list field of an ODP
         * umem, allowing only a single thread to map/unmap pages. The mutex
         * also protects access to the mmu notifier counters.
         */
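After this change, drivers reach the pfn array through the embedded mapping state (umem_odp->map.pfn_list) and no longer see DMA addresses directly; the dma_list array is gone. A hypothetical accessor just to show where the data moved:

#include <rdma/ib_umem_odp.h>

/* Hypothetical helper, not part of the patch. */
static inline unsigned long odp_sketch_pfn(struct ib_umem_odp *umem_odp, size_t idx)
{
        /* Was: umem_odp->pfn_list[idx], with DMA addresses in umem_odp->dma_list. */
        return umem_odp->map.pfn_list[idx];
}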