linux/drivers/iommu/io-pgtable-arm.c

1471 lines
39 KiB
C
Raw Permalink Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* CPU-agnostic ARM page table allocator.
*
* Copyright (C) 2014 ARM Limited
*
* Author: Will Deacon <will.deacon@arm.com>
*/
#define pr_fmt(fmt) "arm-lpae io-pgtable: " fmt
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
#include <linux/kernel.h>
#include <linux/device/faux.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>
#include <asm/barrier.h>
#include "io-pgtable-arm.h"
#include "iommu-pages.h"
#define ARM_LPAE_MAX_ADDR_BITS 52
#define ARM_LPAE_S2_MAX_CONCAT_PAGES 16
#define ARM_LPAE_MAX_LEVELS 4
/* Struct accessors */
#define io_pgtable_to_data(x) \
container_of((x), struct arm_lpae_io_pgtable, iop)
#define io_pgtable_ops_to_data(x) \
io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
/*
* Calculate the right shift amount to get to the portion describing level l
* in a virtual address mapped by the pagetable in d.
*/
#define ARM_LPAE_LVL_SHIFT(l,d) \
(((ARM_LPAE_MAX_LEVELS - (l)) * (d)->bits_per_level) + \
ilog2(sizeof(arm_lpae_iopte)))
#define ARM_LPAE_GRANULE(d) \
(sizeof(arm_lpae_iopte) << (d)->bits_per_level)
#define ARM_LPAE_PGD_SIZE(d) \
(sizeof(arm_lpae_iopte) << (d)->pgd_bits)
#define ARM_LPAE_PTES_PER_TABLE(d) \
(ARM_LPAE_GRANULE(d) >> ilog2(sizeof(arm_lpae_iopte)))
/*
* Calculate the index at level l used to map virtual address a using the
* pagetable in d.
*/
#define ARM_LPAE_PGD_IDX(l,d) \
((l) == (d)->start_level ? (d)->pgd_bits - (d)->bits_per_level : 0)
#define ARM_LPAE_LVL_IDX(a,l,d) \
(((u64)(a) >> ARM_LPAE_LVL_SHIFT(l,d)) & \
((1 << ((d)->bits_per_level + ARM_LPAE_PGD_IDX(l,d))) - 1))
/* Calculate the block/page mapping size at level l for pagetable in d. */
#define ARM_LPAE_BLOCK_SIZE(l,d) (1ULL << ARM_LPAE_LVL_SHIFT(l,d))
/* Page table bits */
#define ARM_LPAE_PTE_TYPE_SHIFT 0
#define ARM_LPAE_PTE_TYPE_MASK 0x3
#define ARM_LPAE_PTE_TYPE_BLOCK 1
#define ARM_LPAE_PTE_TYPE_TABLE 3
#define ARM_LPAE_PTE_TYPE_PAGE 3
#define ARM_LPAE_PTE_ADDR_MASK GENMASK_ULL(47,12)
#define ARM_LPAE_PTE_NSTABLE (((arm_lpae_iopte)1) << 63)
#define ARM_LPAE_PTE_XN (((arm_lpae_iopte)3) << 53)
#define ARM_LPAE_PTE_DBM (((arm_lpae_iopte)1) << 51)
#define ARM_LPAE_PTE_AF (((arm_lpae_iopte)1) << 10)
#define ARM_LPAE_PTE_SH_NS (((arm_lpae_iopte)0) << 8)
#define ARM_LPAE_PTE_SH_OS (((arm_lpae_iopte)2) << 8)
#define ARM_LPAE_PTE_SH_IS (((arm_lpae_iopte)3) << 8)
#define ARM_LPAE_PTE_NS (((arm_lpae_iopte)1) << 5)
#define ARM_LPAE_PTE_VALID (((arm_lpae_iopte)1) << 0)
/* Software bit for solving coherency races */
#define ARM_LPAE_PTE_SW_SYNC (((arm_lpae_iopte)1) << 55)
/* Stage-1 PTE */
#define ARM_LPAE_PTE_AP_UNPRIV (((arm_lpae_iopte)1) << 6)
#define ARM_LPAE_PTE_AP_RDONLY_BIT 7
#define ARM_LPAE_PTE_AP_RDONLY (((arm_lpae_iopte)1) << \
ARM_LPAE_PTE_AP_RDONLY_BIT)
#define ARM_LPAE_PTE_AP_WR_CLEAN_MASK (ARM_LPAE_PTE_AP_RDONLY | \
ARM_LPAE_PTE_DBM)
#define ARM_LPAE_PTE_ATTRINDX_SHIFT 2
#define ARM_LPAE_PTE_nG (((arm_lpae_iopte)1) << 11)
/* Stage-2 PTE */
#define ARM_LPAE_PTE_HAP_FAULT (((arm_lpae_iopte)0) << 6)
#define ARM_LPAE_PTE_HAP_READ (((arm_lpae_iopte)1) << 6)
#define ARM_LPAE_PTE_HAP_WRITE (((arm_lpae_iopte)2) << 6)
/*
* For !FWB these code to:
* 1111 = Normal outer write back cachable / Inner Write Back Cachable
* Permit S1 to override
* 0101 = Normal Non-cachable / Inner Non-cachable
* 0001 = Device / Device-nGnRE
* For S2FWB these code:
* 0110 Force Normal Write Back
* 0101 Normal* is forced Normal-NC, Device unchanged
* 0001 Force Device-nGnRE
*/
#define ARM_LPAE_PTE_MEMATTR_FWB_WB (((arm_lpae_iopte)0x6) << 2)
#define ARM_LPAE_PTE_MEMATTR_OIWB (((arm_lpae_iopte)0xf) << 2)
#define ARM_LPAE_PTE_MEMATTR_NC (((arm_lpae_iopte)0x5) << 2)
#define ARM_LPAE_PTE_MEMATTR_DEV (((arm_lpae_iopte)0x1) << 2)
/* Register bits */
#define ARM_LPAE_VTCR_SL0_MASK 0x3
#define ARM_LPAE_TCR_T0SZ_SHIFT 0
#define ARM_LPAE_VTCR_PS_SHIFT 16
#define ARM_LPAE_VTCR_PS_MASK 0x7
#define ARM_LPAE_MAIR_ATTR_SHIFT(n) ((n) << 3)
#define ARM_LPAE_MAIR_ATTR_MASK 0xff
#define ARM_LPAE_MAIR_ATTR_DEVICE 0x04
#define ARM_LPAE_MAIR_ATTR_NC 0x44
iommu/io-pgtable-arm: Add support to use system cache Few Qualcomm platforms such as, sdm845 have an additional outer cache called as System cache, aka. Last level cache (LLC) that allows non-coherent devices to upgrade to using caching. This cache sits right before the DDR, and is tightly coupled with the memory controller. The clients using this cache request their slices from this system cache, make it active, and can then start using it. There is a fundamental assumption that non-coherent devices can't access caches. This change adds an exception where they *can* use some level of cache despite still being non-coherent overall. The coherent devices that use cacheable memory, and CPU make use of this system cache by default. Looking at memory types, we have following - a) Normal uncached :- MAIR 0x44, inner non-cacheable, outer non-cacheable; b) Normal cached :- MAIR 0xff, inner read write-back non-transient, outer read write-back non-transient; attribute setting for coherenet I/O devices. and, for non-coherent i/o devices that can allocate in system cache another type gets added - c) Normal sys-cached :- MAIR 0xf4, inner non-cacheable, outer read write-back non-transient Coherent I/O devices use system cache by marking the memory as normal cached. Non-coherent I/O devices should mark the memory as normal sys-cached in page tables to use system cache. Acked-by: Robin Murphy <robin.murphy@arm.com> Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org> Signed-off-by: Will Deacon <will.deacon@arm.com>
2019-05-16 15:00:20 +05:30
#define ARM_LPAE_MAIR_ATTR_INC_OWBRWA 0xf4
#define ARM_LPAE_MAIR_ATTR_WBRWA 0xff
#define ARM_LPAE_MAIR_ATTR_IDX_NC 0
#define ARM_LPAE_MAIR_ATTR_IDX_CACHE 1
#define ARM_LPAE_MAIR_ATTR_IDX_DEV 2
iommu/io-pgtable-arm: Add support to use system cache Few Qualcomm platforms such as, sdm845 have an additional outer cache called as System cache, aka. Last level cache (LLC) that allows non-coherent devices to upgrade to using caching. This cache sits right before the DDR, and is tightly coupled with the memory controller. The clients using this cache request their slices from this system cache, make it active, and can then start using it. There is a fundamental assumption that non-coherent devices can't access caches. This change adds an exception where they *can* use some level of cache despite still being non-coherent overall. The coherent devices that use cacheable memory, and CPU make use of this system cache by default. Looking at memory types, we have following - a) Normal uncached :- MAIR 0x44, inner non-cacheable, outer non-cacheable; b) Normal cached :- MAIR 0xff, inner read write-back non-transient, outer read write-back non-transient; attribute setting for coherenet I/O devices. and, for non-coherent i/o devices that can allocate in system cache another type gets added - c) Normal sys-cached :- MAIR 0xf4, inner non-cacheable, outer read write-back non-transient Coherent I/O devices use system cache by marking the memory as normal cached. Non-coherent I/O devices should mark the memory as normal sys-cached in page tables to use system cache. Acked-by: Robin Murphy <robin.murphy@arm.com> Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org> Signed-off-by: Will Deacon <will.deacon@arm.com>
2019-05-16 15:00:20 +05:30
#define ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE 3
#define ARM_MALI_LPAE_TTBR_ADRMODE_TABLE (3u << 0)
#define ARM_MALI_LPAE_TTBR_READ_INNER BIT(2)
#define ARM_MALI_LPAE_TTBR_SHARE_OUTER BIT(4)
#define ARM_MALI_LPAE_MEMATTR_IMP_DEF 0x88ULL
#define ARM_MALI_LPAE_MEMATTR_WRITE_ALLOC 0x8DULL
/* IOPTE accessors */
#define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d))
#define iopte_type(pte) \
(((pte) >> ARM_LPAE_PTE_TYPE_SHIFT) & ARM_LPAE_PTE_TYPE_MASK)
#define iopte_writeable_dirty(pte) \
(((pte) & ARM_LPAE_PTE_AP_WR_CLEAN_MASK) == ARM_LPAE_PTE_DBM)
#define iopte_set_writeable_clean(ptep) \
set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT, (unsigned long *)(ptep))
struct arm_lpae_io_pgtable {
struct io_pgtable iop;
int pgd_bits;
int start_level;
int bits_per_level;
void *pgd;
};
typedef u64 arm_lpae_iopte;
static inline bool iopte_leaf(arm_lpae_iopte pte, int lvl,
enum io_pgtable_fmt fmt)
{
if (lvl == (ARM_LPAE_MAX_LEVELS - 1) && fmt != ARM_MALI_LPAE)
return iopte_type(pte) == ARM_LPAE_PTE_TYPE_PAGE;
return iopte_type(pte) == ARM_LPAE_PTE_TYPE_BLOCK;
}
static inline bool iopte_table(arm_lpae_iopte pte, int lvl)
{
if (lvl == (ARM_LPAE_MAX_LEVELS - 1))
return false;
return iopte_type(pte) == ARM_LPAE_PTE_TYPE_TABLE;
}
static arm_lpae_iopte paddr_to_iopte(phys_addr_t paddr,
struct arm_lpae_io_pgtable *data)
{
arm_lpae_iopte pte = paddr;
/* Of the bits which overlap, either 51:48 or 15:12 are always RES0 */
return (pte | (pte >> (48 - 12))) & ARM_LPAE_PTE_ADDR_MASK;
}
static phys_addr_t iopte_to_paddr(arm_lpae_iopte pte,
struct arm_lpae_io_pgtable *data)
{
u64 paddr = pte & ARM_LPAE_PTE_ADDR_MASK;
if (ARM_LPAE_GRANULE(data) < SZ_64K)
return paddr;
/* Rotate the packed high-order bits back to the top */
return (paddr | (paddr << (48 - 12))) & (ARM_LPAE_PTE_ADDR_MASK << 4);
}
/*
* Convert an index returned by ARM_LPAE_PGD_IDX(), which can point into
* a concatenated PGD, into the maximum number of entries that can be
* mapped in the same table page.
*/
static inline int arm_lpae_max_entries(int i, struct arm_lpae_io_pgtable *data)
{
int ptes_per_table = ARM_LPAE_PTES_PER_TABLE(data);
return ptes_per_table - (i & (ptes_per_table - 1));
}
/*
* Check if concatenated PGDs are mandatory according to Arm DDI0487 (K.a)
* 1) R_DXBSH: For 16KB, and 48-bit input size, use level 1 instead of 0.
* 2) R_SRKBC: After de-ciphering the table for PA size and valid initial lookup
* a) 40 bits PA size with 4K: use level 1 instead of level 0 (2 tables for ias = oas)
* b) 40 bits PA size with 16K: use level 2 instead of level 1 (16 tables for ias = oas)
* c) 42 bits PA size with 4K: use level 1 instead of level 0 (8 tables for ias = oas)
* d) 48 bits PA size with 16K: use level 1 instead of level 0 (2 tables for ias = oas)
*/
static inline bool arm_lpae_concat_mandatory(struct io_pgtable_cfg *cfg,
struct arm_lpae_io_pgtable *data)
{
unsigned int ias = cfg->ias;
unsigned int oas = cfg->oas;
/* Covers 1 and 2.d */
if ((ARM_LPAE_GRANULE(data) == SZ_16K) && (data->start_level == 0))
return (oas == 48) || (ias == 48);
/* Covers 2.a and 2.c */
if ((ARM_LPAE_GRANULE(data) == SZ_4K) && (data->start_level == 0))
return (oas == 40) || (oas == 42);
/* Case 2.b */
return (ARM_LPAE_GRANULE(data) == SZ_16K) &&
(data->start_level == 1) && (oas == 40);
}
static dma_addr_t __arm_lpae_dma_addr(void *pages)
{
return (dma_addr_t)virt_to_phys(pages);
}
static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp,
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
struct io_pgtable_cfg *cfg,
void *cookie)
{
struct device *dev = cfg->iommu_dev;
size_t alloc_size;
dma_addr_t dma;
void *pages;
/*
* For very small starting-level translation tables the HW requires a
* minimum alignment of at least 64 to cover all cases.
*/
alloc_size = max(size, 64);
if (cfg->alloc)
pages = cfg->alloc(cookie, alloc_size, gfp);
else
pages = iommu_alloc_pages_node_sz(dev_to_node(dev), gfp,
alloc_size);
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
if (!pages)
return NULL;
if (!cfg->coherent_walk) {
dma = dma_map_single(dev, pages, size, DMA_TO_DEVICE);
if (dma_mapping_error(dev, dma))
goto out_free;
/*
* We depend on the IOMMU being able to work with any physical
* address directly, so if the DMA layer suggests otherwise by
* translating or truncating them, that bodes very badly...
*/
if (dma != virt_to_phys(pages))
goto out_unmap;
}
return pages;
out_unmap:
dev_err(dev, "Cannot accommodate DMA translation for IOMMU page tables\n");
dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
out_free:
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
if (cfg->free)
cfg->free(cookie, pages, size);
else
iommu_free_pages(pages);
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
return NULL;
}
static void __arm_lpae_free_pages(void *pages, size_t size,
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
struct io_pgtable_cfg *cfg,
void *cookie)
{
if (!cfg->coherent_walk)
dma_unmap_single(cfg->iommu_dev, __arm_lpae_dma_addr(pages),
size, DMA_TO_DEVICE);
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
if (cfg->free)
cfg->free(cookie, pages, size);
else
iommu_free_pages(pages);
}
static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries,
struct io_pgtable_cfg *cfg)
{
dma_sync_single_for_device(cfg->iommu_dev, __arm_lpae_dma_addr(ptep),
sizeof(*ptep) * num_entries, DMA_TO_DEVICE);
}
static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int num_entries)
{
for (int i = 0; i < num_entries; i++)
ptep[i] = 0;
if (!cfg->coherent_walk && num_entries)
__arm_lpae_sync_pte(ptep, num_entries, cfg);
}
static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
struct iommu_iotlb_gather *gather,
unsigned long iova, size_t size, size_t pgcount,
int lvl, arm_lpae_iopte *ptep);
iommu/io-pgtable-arm: Unmap and free table when overwriting with block When installing a block mapping, we unconditionally overwrite a non-leaf PTE if we find one. However, this can cause a problem if the following sequence of events occur: (1) iommu_map called for a 4k (i.e. PAGE_SIZE) mapping at some address - We initialise the page table all the way down to a leaf entry - No TLB maintenance is required, because we're going from invalid to valid. (2) iommu_unmap is called on the mapping installed in (1) - We walk the page table to the final (leaf) entry and zero it - We only changed a valid leaf entry, so we invalidate leaf-only (3) iommu_map is called on the same address as (1), but this time for a 2MB (i.e. BLOCK_SIZE) mapping) - We walk the page table down to the penultimate level, where we find a table entry - We overwrite the table entry with a block mapping and return without any TLB maintenance and without freeing the memory used by the now-orphaned table. This last step can lead to a walk-cache caching the overwritten table entry, causing unexpected faults when the new mapping is accessed by a device. One way to fix this would be to collapse the page table when freeing the last page at a given level, but this would require expensive iteration on every map call. Instead, this patch detects the case when we are overwriting a table entry and explicitly unmaps the table first, which takes care of both freeing and TLB invalidation. Cc: <stable@vger.kernel.org> Reported-by: Brian Starkey <brian.starkey@arm.com> Tested-by: Brian Starkey <brian.starkey@arm.com> Signed-off-by: Will Deacon <will.deacon@arm.com> Signed-off-by: Joerg Roedel <jroedel@suse.de>
2015-08-11 16:48:32 +01:00
static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
phys_addr_t paddr, arm_lpae_iopte prot,
int lvl, int num_entries, arm_lpae_iopte *ptep)
{
arm_lpae_iopte pte = prot;
struct io_pgtable_cfg *cfg = &data->iop.cfg;
size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
int i;
if (data->iop.fmt != ARM_MALI_LPAE && lvl == ARM_LPAE_MAX_LEVELS - 1)
pte |= ARM_LPAE_PTE_TYPE_PAGE;
else
pte |= ARM_LPAE_PTE_TYPE_BLOCK;
for (i = 0; i < num_entries; i++)
ptep[i] = pte | paddr_to_iopte(paddr + i * sz, data);
if (!cfg->coherent_walk)
__arm_lpae_sync_pte(ptep, num_entries, cfg);
}
static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
unsigned long iova, phys_addr_t paddr,
arm_lpae_iopte prot, int lvl, int num_entries,
arm_lpae_iopte *ptep)
{
int i;
for (i = 0; i < num_entries; i++)
if (iopte_leaf(ptep[i], lvl, data->iop.fmt)) {
/* We require an unmap first */
WARN_ON(!(data->iop.cfg.quirks & IO_PGTABLE_QUIRK_NO_WARN));
return -EEXIST;
} else if (iopte_type(ptep[i]) == ARM_LPAE_PTE_TYPE_TABLE) {
/*
* We need to unmap and free the old table before
* overwriting it with a block entry.
*/
arm_lpae_iopte *tblp;
size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
if (__arm_lpae_unmap(data, NULL, iova + i * sz, sz, 1,
lvl, tblp) != sz) {
WARN_ON(1);
return -EINVAL;
}
}
__arm_lpae_init_pte(data, paddr, prot, lvl, num_entries, ptep);
return 0;
}
static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table,
arm_lpae_iopte *ptep,
arm_lpae_iopte curr,
struct arm_lpae_io_pgtable *data)
{
arm_lpae_iopte old, new;
struct io_pgtable_cfg *cfg = &data->iop.cfg;
new = paddr_to_iopte(__pa(table), data) | ARM_LPAE_PTE_TYPE_TABLE;
if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
new |= ARM_LPAE_PTE_NSTABLE;
/*
* Ensure the table itself is visible before its PTE can be.
* Whilst we could get away with cmpxchg64_release below, this
* doesn't have any ordering semantics when !CONFIG_SMP.
*/
dma_wmb();
old = cmpxchg64_relaxed(ptep, curr, new);
if (cfg->coherent_walk || (old & ARM_LPAE_PTE_SW_SYNC))
return old;
/* Even if it's not ours, there's no point waiting; just kick it */
__arm_lpae_sync_pte(ptep, 1, cfg);
if (old == curr)
WRITE_ONCE(*ptep, new | ARM_LPAE_PTE_SW_SYNC);
return old;
}
static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
phys_addr_t paddr, size_t size, size_t pgcount,
arm_lpae_iopte prot, int lvl, arm_lpae_iopte *ptep,
gfp_t gfp, size_t *mapped)
{
arm_lpae_iopte *cptep, pte;
size_t block_size = ARM_LPAE_BLOCK_SIZE(lvl, data);
size_t tblsz = ARM_LPAE_GRANULE(data);
struct io_pgtable_cfg *cfg = &data->iop.cfg;
int ret = 0, num_entries, max_entries, map_idx_start;
/* Find our entry at the current level */
map_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data);
ptep += map_idx_start;
/* If we can install a leaf entry at this level, then do so */
if (size == block_size) {
max_entries = arm_lpae_max_entries(map_idx_start, data);
num_entries = min_t(int, pgcount, max_entries);
ret = arm_lpae_init_pte(data, iova, paddr, prot, lvl, num_entries, ptep);
if (!ret)
*mapped += num_entries * size;
return ret;
}
/* We can't allocate tables at the final level */
if (WARN_ON(lvl >= ARM_LPAE_MAX_LEVELS - 1))
return -EINVAL;
/* Grab a pointer to the next level */
pte = READ_ONCE(*ptep);
if (!pte) {
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
cptep = __arm_lpae_alloc_pages(tblsz, gfp, cfg, data->iop.cookie);
if (!cptep)
return -ENOMEM;
pte = arm_lpae_install_table(cptep, ptep, 0, data);
if (pte)
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
__arm_lpae_free_pages(cptep, tblsz, cfg, data->iop.cookie);
} else if (!cfg->coherent_walk && !(pte & ARM_LPAE_PTE_SW_SYNC)) {
__arm_lpae_sync_pte(ptep, 1, cfg);
}
if (pte && !iopte_leaf(pte, lvl, data->iop.fmt)) {
cptep = iopte_deref(pte, data);
} else if (pte) {
/* We require an unmap first */
WARN_ON(!(cfg->quirks & IO_PGTABLE_QUIRK_NO_WARN));
return -EEXIST;
}
/* Rinse, repeat */
return __arm_lpae_map(data, iova, paddr, size, pgcount, prot, lvl + 1,
cptep, gfp, mapped);
}
static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
int prot)
{
arm_lpae_iopte pte;
if (data->iop.fmt == ARM_64_LPAE_S1 ||
data->iop.fmt == ARM_32_LPAE_S1) {
pte = ARM_LPAE_PTE_nG;
if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
pte |= ARM_LPAE_PTE_AP_RDONLY;
else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_HD)
pte |= ARM_LPAE_PTE_DBM;
if (!(prot & IOMMU_PRIV))
pte |= ARM_LPAE_PTE_AP_UNPRIV;
} else {
pte = ARM_LPAE_PTE_HAP_FAULT;
if (prot & IOMMU_READ)
pte |= ARM_LPAE_PTE_HAP_READ;
if (prot & IOMMU_WRITE)
pte |= ARM_LPAE_PTE_HAP_WRITE;
}
/*
* Note that this logic is structured to accommodate Mali LPAE
* having stage-1-like attributes but stage-2-like permissions.
*/
if (data->iop.fmt == ARM_64_LPAE_S2 ||
data->iop.fmt == ARM_32_LPAE_S2) {
if (prot & IOMMU_MMIO) {
pte |= ARM_LPAE_PTE_MEMATTR_DEV;
} else if (prot & IOMMU_CACHE) {
if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_S2FWB)
pte |= ARM_LPAE_PTE_MEMATTR_FWB_WB;
else
pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
} else {
pte |= ARM_LPAE_PTE_MEMATTR_NC;
}
} else {
if (prot & IOMMU_MMIO)
pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV
<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
else if (prot & IOMMU_CACHE)
pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
}
/*
* Also Mali has its own notions of shareability wherein its Inner
* domain covers the cores within the GPU, and its Outer domain is
* "outside the GPU" (i.e. either the Inner or System domain in CPU
* terms, depending on coherency).
*/
if (prot & IOMMU_CACHE && data->iop.fmt != ARM_MALI_LPAE)
pte |= ARM_LPAE_PTE_SH_IS;
else
pte |= ARM_LPAE_PTE_SH_OS;
if (prot & IOMMU_NOEXEC)
pte |= ARM_LPAE_PTE_XN;
if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
pte |= ARM_LPAE_PTE_NS;
if (data->iop.fmt != ARM_MALI_LPAE)
pte |= ARM_LPAE_PTE_AF;
return pte;
}
static int arm_lpae_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
phys_addr_t paddr, size_t pgsize, size_t pgcount,
int iommu_prot, gfp_t gfp, size_t *mapped)
{
struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
struct io_pgtable_cfg *cfg = &data->iop.cfg;
arm_lpae_iopte *ptep = data->pgd;
int ret, lvl = data->start_level;
arm_lpae_iopte prot;
long iaext = (s64)iova >> cfg->ias;
if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize))
return -EINVAL;
if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1)
iaext = ~iaext;
if (WARN_ON(iaext || paddr >> cfg->oas))
return -ERANGE;
if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
return -EINVAL;
prot = arm_lpae_prot_to_pte(data, iommu_prot);
ret = __arm_lpae_map(data, iova, paddr, pgsize, pgcount, prot, lvl,
ptep, gfp, mapped);
/*
* Synchronise all PTE updates for the new mapping before there's
* a chance for anything to kick off a table walk for the new iova.
*/
wmb();
return ret;
}
static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl,
arm_lpae_iopte *ptep)
{
arm_lpae_iopte *start, *end;
unsigned long table_size;
if (lvl == data->start_level)
table_size = ARM_LPAE_PGD_SIZE(data);
else
table_size = ARM_LPAE_GRANULE(data);
start = ptep;
/* Only leaf entries at the last level */
if (lvl == ARM_LPAE_MAX_LEVELS - 1)
end = ptep;
else
end = (void *)ptep + table_size;
while (ptep != end) {
arm_lpae_iopte pte = *ptep++;
if (!pte || iopte_leaf(pte, lvl, data->iop.fmt))
continue;
__arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
}
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
__arm_lpae_free_pages(start, table_size, &data->iop.cfg, data->iop.cookie);
}
static void arm_lpae_free_pgtable(struct io_pgtable *iop)
{
struct arm_lpae_io_pgtable *data = io_pgtable_to_data(iop);
__arm_lpae_free_pgtable(data, data->start_level, data->pgd);
kfree(data);
}
static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
struct iommu_iotlb_gather *gather,
unsigned long iova, size_t size, size_t pgcount,
int lvl, arm_lpae_iopte *ptep)
{
arm_lpae_iopte pte;
struct io_pgtable *iop = &data->iop;
int i = 0, num_entries, max_entries, unmap_idx_start;
/* Something went horribly wrong and we ran out of page table */
if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
return 0;
unmap_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data);
ptep += unmap_idx_start;
pte = READ_ONCE(*ptep);
if (!pte) {
WARN_ON(!(data->iop.cfg.quirks & IO_PGTABLE_QUIRK_NO_WARN));
return -ENOENT;
}
/* If the size matches this level, we're in the right place */
if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) {
max_entries = arm_lpae_max_entries(unmap_idx_start, data);
num_entries = min_t(int, pgcount, max_entries);
/* Find and handle non-leaf entries */
for (i = 0; i < num_entries; i++) {
pte = READ_ONCE(ptep[i]);
if (!pte) {
WARN_ON(!(data->iop.cfg.quirks & IO_PGTABLE_QUIRK_NO_WARN));
break;
}
if (!iopte_leaf(pte, lvl, iop->fmt)) {
__arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1);
/* Also flush any partial walks */
io_pgtable_tlb_flush_walk(iop, iova + i * size, size,
ARM_LPAE_GRANULE(data));
__arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
}
}
/* Clear the remaining entries */
__arm_lpae_clear_pte(ptep, &iop->cfg, i);
if (gather && !iommu_iotlb_gather_queued(gather))
for (int j = 0; j < i; j++)
io_pgtable_tlb_add_page(iop, gather, iova + j * size, size);
return i * size;
} else if (iopte_leaf(pte, lvl, iop->fmt)) {
iommu/io-pgtable-arm: Remove split on unmap behavior A minority of page table implementations (arm_lpae, armv7) are unique in how they handle partial unmap of large IOPTEs. Other implementations will unmap the large IOPTE and return it's length. For example if a 2M IOPTE is present and the first 4K is requested to be unmapped then unmap will remove the whole 2M and report 2M as the result. arm_lpae instead replaces the IOPTE with a table of smaller IOPTEs, unmaps the 4K and returns 4k. This is actually an illegal/non-hitless operation on at least SMMUv3 because of the BBM level 0 rules. Will says this was done to support VFIO, but upon deeper analysis this was never strictly necessary: https://lore.kernel.org/all/20241024134411.GA6956@nvidia.com/ In summary, historical VFIO supported the AMD behavior of unmapping the whole large IOPTE and returning the size, even if asked to unmap a portion. The driver would see this as a request to split a large IOPTE. Modern VFIO always unmaps entire large IOPTEs (except on AMD) and drivers don't see an IOPTE split. Given it doesn't work fully correctly on SMMUv3 and relying on ARM unique behavior would create portability problems across IOMMU drivers, retire this functionality. Outside the iommu users, this will potentially effect io_pgtable users of ARM_32_LPAE_S1, ARM_32_LPAE_S2, ARM_64_LPAE_S1, ARM_64_LPAE_S2, and ARM_MALI_LPAE formats. Cc: Boris Brezillon <boris.brezillon@collabora.com> Cc: Steven Price <steven.price@arm.com> Cc: Liviu Dudau <liviu.dudau@arm.com> Cc: dri-devel@lists.freedesktop.org Reviewed-by: Liviu Dudau <liviu.dudau@arm.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/1-v3-b3a5b5937f56+7bb-arm_no_split_jgg@nvidia.com Signed-off-by: Will Deacon <will@kernel.org>
2024-11-05 14:14:24 -04:00
WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed");
return 0;
}
/* Keep on walkin' */
ptep = iopte_deref(pte, data);
return __arm_lpae_unmap(data, gather, iova, size, pgcount, lvl + 1, ptep);
}
static size_t arm_lpae_unmap_pages(struct io_pgtable_ops *ops, unsigned long iova,
size_t pgsize, size_t pgcount,
struct iommu_iotlb_gather *gather)
{
struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
struct io_pgtable_cfg *cfg = &data->iop.cfg;
arm_lpae_iopte *ptep = data->pgd;
long iaext = (s64)iova >> cfg->ias;
if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize || !pgcount))
return 0;
if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1)
iaext = ~iaext;
if (WARN_ON(iaext))
return 0;
return __arm_lpae_unmap(data, gather, iova, pgsize, pgcount,
data->start_level, ptep);
}
struct io_pgtable_walk_data {
struct io_pgtable *iop;
void *data;
int (*visit)(struct io_pgtable_walk_data *walk_data, int lvl,
arm_lpae_iopte *ptep, size_t size);
unsigned long flags;
u64 addr;
const u64 end;
};
static int __arm_lpae_iopte_walk(struct arm_lpae_io_pgtable *data,
struct io_pgtable_walk_data *walk_data,
arm_lpae_iopte *ptep,
int lvl);
struct iova_to_phys_data {
arm_lpae_iopte pte;
int lvl;
};
static int visit_iova_to_phys(struct io_pgtable_walk_data *walk_data, int lvl,
arm_lpae_iopte *ptep, size_t size)
{
struct iova_to_phys_data *data = walk_data->data;
data->pte = *ptep;
data->lvl = lvl;
return 0;
}
static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,
unsigned long iova)
{
struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
struct iova_to_phys_data d;
struct io_pgtable_walk_data walk_data = {
.data = &d,
.visit = visit_iova_to_phys,
.addr = iova,
.end = iova + 1,
};
int ret;
ret = __arm_lpae_iopte_walk(data, &walk_data, data->pgd, data->start_level);
if (ret)
return 0;
iova &= (ARM_LPAE_BLOCK_SIZE(d.lvl, data) - 1);
return iopte_to_paddr(d.pte, data) | iova;
}
static int visit_pgtable_walk(struct io_pgtable_walk_data *walk_data, int lvl,
arm_lpae_iopte *ptep, size_t size)
{
struct arm_lpae_io_pgtable_walk_data *data = walk_data->data;
data->ptes[lvl] = *ptep;
return 0;
}
static int arm_lpae_pgtable_walk(struct io_pgtable_ops *ops, unsigned long iova,
void *wd)
{
struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
struct io_pgtable_walk_data walk_data = {
.data = wd,
.visit = visit_pgtable_walk,
.addr = iova,
.end = iova + 1,
};
return __arm_lpae_iopte_walk(data, &walk_data, data->pgd, data->start_level);
}
static int io_pgtable_visit(struct arm_lpae_io_pgtable *data,
struct io_pgtable_walk_data *walk_data,
arm_lpae_iopte *ptep, int lvl)
{
struct io_pgtable *iop = &data->iop;
arm_lpae_iopte pte = READ_ONCE(*ptep);
size_t size = ARM_LPAE_BLOCK_SIZE(lvl, data);
int ret = walk_data->visit(walk_data, lvl, ptep, size);
if (ret)
return ret;
if (iopte_leaf(pte, lvl, iop->fmt)) {
walk_data->addr += size;
return 0;
}
if (!iopte_table(pte, lvl)) {
return -EINVAL;
}
ptep = iopte_deref(pte, data);
return __arm_lpae_iopte_walk(data, walk_data, ptep, lvl + 1);
}
static int __arm_lpae_iopte_walk(struct arm_lpae_io_pgtable *data,
struct io_pgtable_walk_data *walk_data,
arm_lpae_iopte *ptep,
int lvl)
{
u32 idx;
int max_entries, ret;
if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
return -EINVAL;
if (lvl == data->start_level)
max_entries = ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte);
else
max_entries = ARM_LPAE_PTES_PER_TABLE(data);
for (idx = ARM_LPAE_LVL_IDX(walk_data->addr, lvl, data);
(idx < max_entries) && (walk_data->addr < walk_data->end); ++idx) {
ret = io_pgtable_visit(data, walk_data, ptep + idx, lvl);
if (ret)
return ret;
}
return 0;
}
static int visit_dirty(struct io_pgtable_walk_data *walk_data, int lvl,
arm_lpae_iopte *ptep, size_t size)
{
struct iommu_dirty_bitmap *dirty = walk_data->data;
if (!iopte_leaf(*ptep, lvl, walk_data->iop->fmt))
return 0;
if (iopte_writeable_dirty(*ptep)) {
iommu_dirty_bitmap_record(dirty, walk_data->addr, size);
if (!(walk_data->flags & IOMMU_DIRTY_NO_CLEAR))
iopte_set_writeable_clean(ptep);
}
return 0;
}
static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops,
unsigned long iova, size_t size,
unsigned long flags,
struct iommu_dirty_bitmap *dirty)
{
struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
struct io_pgtable_cfg *cfg = &data->iop.cfg;
struct io_pgtable_walk_data walk_data = {
.iop = &data->iop,
.data = dirty,
.visit = visit_dirty,
.flags = flags,
.addr = iova,
.end = iova + size,
};
arm_lpae_iopte *ptep = data->pgd;
int lvl = data->start_level;
if (WARN_ON(!size))
return -EINVAL;
if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1)))
return -EINVAL;
if (data->iop.fmt != ARM_64_LPAE_S1)
return -EINVAL;
return __arm_lpae_iopte_walk(data, &walk_data, ptep, lvl);
}
static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg)
{
unsigned long granule, page_sizes;
unsigned int max_addr_bits = 48;
/*
* We need to restrict the supported page sizes to match the
* translation regime for a particular granule. Aim to match
* the CPU page size if possible, otherwise prefer smaller sizes.
* While we're at it, restrict the block sizes to match the
* chosen granule.
*/
if (cfg->pgsize_bitmap & PAGE_SIZE)
granule = PAGE_SIZE;
else if (cfg->pgsize_bitmap & ~PAGE_MASK)
granule = 1UL << __fls(cfg->pgsize_bitmap & ~PAGE_MASK);
else if (cfg->pgsize_bitmap & PAGE_MASK)
granule = 1UL << __ffs(cfg->pgsize_bitmap & PAGE_MASK);
else
granule = 0;
switch (granule) {
case SZ_4K:
page_sizes = (SZ_4K | SZ_2M | SZ_1G);
break;
case SZ_16K:
page_sizes = (SZ_16K | SZ_32M);
break;
case SZ_64K:
max_addr_bits = 52;
page_sizes = (SZ_64K | SZ_512M);
if (cfg->oas > 48)
page_sizes |= 1ULL << 42; /* 4TB */
break;
default:
page_sizes = 0;
}
cfg->pgsize_bitmap &= page_sizes;
cfg->ias = min(cfg->ias, max_addr_bits);
cfg->oas = min(cfg->oas, max_addr_bits);
}
static struct arm_lpae_io_pgtable *
arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg)
{
struct arm_lpae_io_pgtable *data;
int levels, va_bits, pg_shift;
arm_lpae_restrict_pgsizes(cfg);
if (!(cfg->pgsize_bitmap & (SZ_4K | SZ_16K | SZ_64K)))
return NULL;
if (cfg->ias > ARM_LPAE_MAX_ADDR_BITS)
return NULL;
if (cfg->oas > ARM_LPAE_MAX_ADDR_BITS)
return NULL;
data = kmalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return NULL;
pg_shift = __ffs(cfg->pgsize_bitmap);
data->bits_per_level = pg_shift - ilog2(sizeof(arm_lpae_iopte));
va_bits = cfg->ias - pg_shift;
levels = DIV_ROUND_UP(va_bits, data->bits_per_level);
data->start_level = ARM_LPAE_MAX_LEVELS - levels;
/* Calculate the actual size of our pgd (without concatenation) */
data->pgd_bits = va_bits - (data->bits_per_level * (levels - 1));
data->iop.ops = (struct io_pgtable_ops) {
.map_pages = arm_lpae_map_pages,
.unmap_pages = arm_lpae_unmap_pages,
.iova_to_phys = arm_lpae_iova_to_phys,
.read_and_clear_dirty = arm_lpae_read_and_clear_dirty,
.pgtable_walk = arm_lpae_pgtable_walk,
};
return data;
}
static struct io_pgtable *
arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
{
u64 reg;
struct arm_lpae_io_pgtable *data;
typeof(&cfg->arm_lpae_s1_cfg.tcr) tcr = &cfg->arm_lpae_s1_cfg.tcr;
bool tg1;
if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
IO_PGTABLE_QUIRK_ARM_TTBR1 |
IO_PGTABLE_QUIRK_ARM_OUTER_WBWA |
IO_PGTABLE_QUIRK_ARM_HD |
IO_PGTABLE_QUIRK_NO_WARN))
return NULL;
data = arm_lpae_alloc_pgtable(cfg);
if (!data)
return NULL;
/* TCR */
if (cfg->coherent_walk) {
tcr->sh = ARM_LPAE_TCR_SH_IS;
tcr->irgn = ARM_LPAE_TCR_RGN_WBWA;
tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)
goto out_free_data;
} else {
tcr->sh = ARM_LPAE_TCR_SH_OS;
tcr->irgn = ARM_LPAE_TCR_RGN_NC;
if (!(cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
tcr->orgn = ARM_LPAE_TCR_RGN_NC;
else
tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
}
tg1 = cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1;
switch (ARM_LPAE_GRANULE(data)) {
case SZ_4K:
tcr->tg = tg1 ? ARM_LPAE_TCR_TG1_4K : ARM_LPAE_TCR_TG0_4K;
break;
case SZ_16K:
tcr->tg = tg1 ? ARM_LPAE_TCR_TG1_16K : ARM_LPAE_TCR_TG0_16K;
break;
case SZ_64K:
tcr->tg = tg1 ? ARM_LPAE_TCR_TG1_64K : ARM_LPAE_TCR_TG0_64K;
break;
}
switch (cfg->oas) {
case 32:
tcr->ips = ARM_LPAE_TCR_PS_32_BIT;
break;
case 36:
tcr->ips = ARM_LPAE_TCR_PS_36_BIT;
break;
case 40:
tcr->ips = ARM_LPAE_TCR_PS_40_BIT;
break;
case 42:
tcr->ips = ARM_LPAE_TCR_PS_42_BIT;
break;
case 44:
tcr->ips = ARM_LPAE_TCR_PS_44_BIT;
break;
case 48:
tcr->ips = ARM_LPAE_TCR_PS_48_BIT;
break;
case 52:
tcr->ips = ARM_LPAE_TCR_PS_52_BIT;
break;
default:
goto out_free_data;
}
tcr->tsz = 64ULL - cfg->ias;
/* MAIRs */
reg = (ARM_LPAE_MAIR_ATTR_NC
<< ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_NC)) |
(ARM_LPAE_MAIR_ATTR_WBRWA
<< ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_CACHE)) |
(ARM_LPAE_MAIR_ATTR_DEVICE
iommu/io-pgtable-arm: Add support to use system cache Few Qualcomm platforms such as, sdm845 have an additional outer cache called as System cache, aka. Last level cache (LLC) that allows non-coherent devices to upgrade to using caching. This cache sits right before the DDR, and is tightly coupled with the memory controller. The clients using this cache request their slices from this system cache, make it active, and can then start using it. There is a fundamental assumption that non-coherent devices can't access caches. This change adds an exception where they *can* use some level of cache despite still being non-coherent overall. The coherent devices that use cacheable memory, and CPU make use of this system cache by default. Looking at memory types, we have following - a) Normal uncached :- MAIR 0x44, inner non-cacheable, outer non-cacheable; b) Normal cached :- MAIR 0xff, inner read write-back non-transient, outer read write-back non-transient; attribute setting for coherenet I/O devices. and, for non-coherent i/o devices that can allocate in system cache another type gets added - c) Normal sys-cached :- MAIR 0xf4, inner non-cacheable, outer read write-back non-transient Coherent I/O devices use system cache by marking the memory as normal cached. Non-coherent I/O devices should mark the memory as normal sys-cached in page tables to use system cache. Acked-by: Robin Murphy <robin.murphy@arm.com> Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org> Signed-off-by: Will Deacon <will.deacon@arm.com>
2019-05-16 15:00:20 +05:30
<< ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)) |
(ARM_LPAE_MAIR_ATTR_INC_OWBRWA
<< ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE));
cfg->arm_lpae_s1_cfg.mair = reg;
/* Looking good; allocate a pgd */
data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data),
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
GFP_KERNEL, cfg, cookie);
if (!data->pgd)
goto out_free_data;
/* Ensure the empty pgd is visible before any actual TTBR write */
wmb();
/* TTBR */
cfg->arm_lpae_s1_cfg.ttbr = virt_to_phys(data->pgd);
return &data->iop;
out_free_data:
kfree(data);
return NULL;
}
static struct io_pgtable *
arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
{
u64 sl;
struct arm_lpae_io_pgtable *data;
typeof(&cfg->arm_lpae_s2_cfg.vtcr) vtcr = &cfg->arm_lpae_s2_cfg.vtcr;
if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_S2FWB |
IO_PGTABLE_QUIRK_NO_WARN))
return NULL;
data = arm_lpae_alloc_pgtable(cfg);
if (!data)
return NULL;
if (arm_lpae_concat_mandatory(cfg, data)) {
if (WARN_ON((ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte)) >
ARM_LPAE_S2_MAX_CONCAT_PAGES))
return NULL;
data->pgd_bits += data->bits_per_level;
data->start_level++;
}
/* VTCR */
if (cfg->coherent_walk) {
vtcr->sh = ARM_LPAE_TCR_SH_IS;
vtcr->irgn = ARM_LPAE_TCR_RGN_WBWA;
vtcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
} else {
vtcr->sh = ARM_LPAE_TCR_SH_OS;
vtcr->irgn = ARM_LPAE_TCR_RGN_NC;
vtcr->orgn = ARM_LPAE_TCR_RGN_NC;
}
sl = data->start_level;
switch (ARM_LPAE_GRANULE(data)) {
case SZ_4K:
vtcr->tg = ARM_LPAE_TCR_TG0_4K;
sl++; /* SL0 format is different for 4K granule size */
break;
case SZ_16K:
vtcr->tg = ARM_LPAE_TCR_TG0_16K;
break;
case SZ_64K:
vtcr->tg = ARM_LPAE_TCR_TG0_64K;
break;
}
switch (cfg->oas) {
case 32:
vtcr->ps = ARM_LPAE_TCR_PS_32_BIT;
break;
case 36:
vtcr->ps = ARM_LPAE_TCR_PS_36_BIT;
break;
case 40:
vtcr->ps = ARM_LPAE_TCR_PS_40_BIT;
break;
case 42:
vtcr->ps = ARM_LPAE_TCR_PS_42_BIT;
break;
case 44:
vtcr->ps = ARM_LPAE_TCR_PS_44_BIT;
break;
case 48:
vtcr->ps = ARM_LPAE_TCR_PS_48_BIT;
break;
case 52:
vtcr->ps = ARM_LPAE_TCR_PS_52_BIT;
break;
default:
goto out_free_data;
}
vtcr->tsz = 64ULL - cfg->ias;
vtcr->sl = ~sl & ARM_LPAE_VTCR_SL0_MASK;
/* Allocate pgd pages */
data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data),
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
GFP_KERNEL, cfg, cookie);
if (!data->pgd)
goto out_free_data;
/* Ensure the empty pgd is visible before any actual TTBR write */
wmb();
/* VTTBR */
cfg->arm_lpae_s2_cfg.vttbr = virt_to_phys(data->pgd);
return &data->iop;
out_free_data:
kfree(data);
return NULL;
}
static struct io_pgtable *
arm_32_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
{
if (cfg->ias > 32 || cfg->oas > 40)
return NULL;
cfg->pgsize_bitmap &= (SZ_4K | SZ_2M | SZ_1G);
return arm_64_lpae_alloc_pgtable_s1(cfg, cookie);
}
static struct io_pgtable *
arm_32_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
{
if (cfg->ias > 40 || cfg->oas > 40)
return NULL;
cfg->pgsize_bitmap &= (SZ_4K | SZ_2M | SZ_1G);
return arm_64_lpae_alloc_pgtable_s2(cfg, cookie);
}
static struct io_pgtable *
arm_mali_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
struct arm_lpae_io_pgtable *data;
/* No quirks for Mali (hopefully) */
if (cfg->quirks)
return NULL;
if (cfg->ias > 48 || cfg->oas > 40)
return NULL;
cfg->pgsize_bitmap &= (SZ_4K | SZ_2M | SZ_1G);
data = arm_lpae_alloc_pgtable(cfg);
if (!data)
return NULL;
/* Mali seems to need a full 4-level table regardless of IAS */
if (data->start_level > 0) {
data->start_level = 0;
data->pgd_bits = 0;
}
/*
* MEMATTR: Mali has no actual notion of a non-cacheable type, so the
* best we can do is mimic the out-of-tree driver and hope that the
* "implementation-defined caching policy" is good enough. Similarly,
* we'll use it for the sake of a valid attribute for our 'device'
* index, although callers should never request that in practice.
*/
cfg->arm_mali_lpae_cfg.memattr =
(ARM_MALI_LPAE_MEMATTR_IMP_DEF
<< ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_NC)) |
(ARM_MALI_LPAE_MEMATTR_WRITE_ALLOC
<< ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_CACHE)) |
(ARM_MALI_LPAE_MEMATTR_IMP_DEF
<< ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV));
data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data), GFP_KERNEL,
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
cfg, cookie);
if (!data->pgd)
goto out_free_data;
/* Ensure the empty pgd is visible before TRANSTAB can be written */
wmb();
cfg->arm_mali_lpae_cfg.transtab = virt_to_phys(data->pgd) |
ARM_MALI_LPAE_TTBR_READ_INNER |
ARM_MALI_LPAE_TTBR_ADRMODE_TABLE;
if (cfg->coherent_walk)
cfg->arm_mali_lpae_cfg.transtab |= ARM_MALI_LPAE_TTBR_SHARE_OUTER;
return &data->iop;
out_free_data:
kfree(data);
return NULL;
}
struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns = {
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
.caps = IO_PGTABLE_CAP_CUSTOM_ALLOCATOR,
.alloc = arm_64_lpae_alloc_pgtable_s1,
.free = arm_lpae_free_pgtable,
};
struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns = {
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
.caps = IO_PGTABLE_CAP_CUSTOM_ALLOCATOR,
.alloc = arm_64_lpae_alloc_pgtable_s2,
.free = arm_lpae_free_pgtable,
};
struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns = {
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
.caps = IO_PGTABLE_CAP_CUSTOM_ALLOCATOR,
.alloc = arm_32_lpae_alloc_pgtable_s1,
.free = arm_lpae_free_pgtable,
};
struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns = {
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
.caps = IO_PGTABLE_CAP_CUSTOM_ALLOCATOR,
.alloc = arm_32_lpae_alloc_pgtable_s2,
.free = arm_lpae_free_pgtable,
};
struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns = {
iommu: Extend LPAE page table format to support custom allocators We need that in order to implement the VM_BIND ioctl in the GPU driver targeting new Mali GPUs. VM_BIND is about executing MMU map/unmap requests asynchronously, possibly after waiting for external dependencies encoded as dma_fences. We intend to use the drm_sched framework to automate the dependency tracking and VM job dequeuing logic, but this comes with its own set of constraints, one of them being the fact we are not allowed to allocate memory in the drm_gpu_scheduler_ops::run_job() to avoid this sort of deadlocks: - VM_BIND map job needs to allocate a page table to map some memory to the VM. No memory available, so kswapd is kicked - GPU driver shrinker backend ends up waiting on the fence attached to the VM map job or any other job fence depending on this VM operation. With custom allocators, we will be able to pre-reserve enough pages to guarantee the map/unmap operations we queued will take place without going through the system allocator. But we can also optimize allocation/reservation by not free-ing pages immediately, so any upcoming page table allocation requests can be serviced by some free page table pool kept at the driver level. I might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com> Reviewed-by: Steven Price <steven.price@arm.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/20231124142434.1577550-3-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel <jroedel@suse.de>
2023-11-24 15:24:34 +01:00
.caps = IO_PGTABLE_CAP_CUSTOM_ALLOCATOR,
.alloc = arm_mali_lpae_alloc_pgtable,
.free = arm_lpae_free_pgtable,
};
#ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE_SELFTEST
static struct io_pgtable_cfg *cfg_cookie __initdata;
static void __init dummy_tlb_flush_all(void *cookie)
{
WARN_ON(cookie != cfg_cookie);
}
static void __init dummy_tlb_flush(unsigned long iova, size_t size,
size_t granule, void *cookie)
{
WARN_ON(cookie != cfg_cookie);
WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
}
static void __init dummy_tlb_add_page(struct iommu_iotlb_gather *gather,
unsigned long iova, size_t granule,
void *cookie)
{
dummy_tlb_flush(iova, granule, granule, cookie);
}
static const struct iommu_flush_ops dummy_tlb_ops __initconst = {
.tlb_flush_all = dummy_tlb_flush_all,
.tlb_flush_walk = dummy_tlb_flush,
.tlb_add_page = dummy_tlb_add_page,
};
static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops)
{
struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
struct io_pgtable_cfg *cfg = &data->iop.cfg;
pr_err("cfg: pgsize_bitmap 0x%lx, ias %u-bit\n",
cfg->pgsize_bitmap, cfg->ias);
pr_err("data: %d levels, 0x%zx pgd_size, %u pg_shift, %u bits_per_level, pgd @ %p\n",
ARM_LPAE_MAX_LEVELS - data->start_level, ARM_LPAE_PGD_SIZE(data),
ilog2(ARM_LPAE_GRANULE(data)), data->bits_per_level, data->pgd);
}
#define __FAIL(ops, i) ({ \
WARN(1, "selftest: test failed for fmt idx %d\n", (i)); \
arm_lpae_dump_ops(ops); \
-EFAULT; \
})
static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
{
static const enum io_pgtable_fmt fmts[] __initconst = {
ARM_64_LPAE_S1,
ARM_64_LPAE_S2,
};
int i, j;
unsigned long iova;
size_t size, mapped;
struct io_pgtable_ops *ops;
for (i = 0; i < ARRAY_SIZE(fmts); ++i) {
cfg_cookie = cfg;
ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg);
if (!ops) {
pr_err("selftest: failed to allocate io pgtable ops\n");
return -ENOMEM;
}
/*
* Initial sanity checks.
* Empty page tables shouldn't provide any translations.
*/
if (ops->iova_to_phys(ops, 42))
return __FAIL(ops, i);
if (ops->iova_to_phys(ops, SZ_1G + 42))
return __FAIL(ops, i);
if (ops->iova_to_phys(ops, SZ_2G + 42))
return __FAIL(ops, i);
/*
* Distinct mappings of different granule sizes.
*/
iova = 0;
for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
size = 1UL << j;
if (ops->map_pages(ops, iova, iova, size, 1,
IOMMU_READ | IOMMU_WRITE |
IOMMU_NOEXEC | IOMMU_CACHE,
GFP_KERNEL, &mapped))
return __FAIL(ops, i);
/* Overlapping mappings */
if (!ops->map_pages(ops, iova, iova + size, size, 1,
IOMMU_READ | IOMMU_NOEXEC,
GFP_KERNEL, &mapped))
return __FAIL(ops, i);
if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
return __FAIL(ops, i);
iova += SZ_1G;
}
/* Full unmap */
iova = 0;
for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) {
size = 1UL << j;
if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
return __FAIL(ops, i);
if (ops->iova_to_phys(ops, iova + 42))
return __FAIL(ops, i);
/* Remap full block */
if (ops->map_pages(ops, iova, iova, size, 1,
IOMMU_WRITE, GFP_KERNEL, &mapped))
return __FAIL(ops, i);
if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
return __FAIL(ops, i);
iova += SZ_1G;
}
/*
* Map/unmap the last largest supported page of the IAS, this can
* trigger corner cases in the concatednated page tables.
*/
mapped = 0;
size = 1UL << __fls(cfg->pgsize_bitmap);
iova = (1UL << cfg->ias) - size;
if (ops->map_pages(ops, iova, iova, size, 1,
IOMMU_READ | IOMMU_WRITE |
IOMMU_NOEXEC | IOMMU_CACHE,
GFP_KERNEL, &mapped))
return __FAIL(ops, i);
if (mapped != size)
return __FAIL(ops, i);
if (ops->unmap_pages(ops, iova, size, 1, NULL) != size)
return __FAIL(ops, i);
free_io_pgtable_ops(ops);
}
return 0;
}
static int __init arm_lpae_do_selftests(void)
{
static const unsigned long pgsize[] __initconst = {
SZ_4K | SZ_2M | SZ_1G,
SZ_16K | SZ_32M,
SZ_64K | SZ_512M,
};
static const unsigned int address_size[] __initconst = {
32, 36, 40, 42, 44, 48,
};
int i, j, k, pass = 0, fail = 0;
struct faux_device *dev;
struct io_pgtable_cfg cfg = {
.tlb = &dummy_tlb_ops,
.coherent_walk = true,
.quirks = IO_PGTABLE_QUIRK_NO_WARN,
};
dev = faux_device_create("io-pgtable-test", NULL, 0);
if (!dev)
return -ENOMEM;
cfg.iommu_dev = &dev->dev;
for (i = 0; i < ARRAY_SIZE(pgsize); ++i) {
for (j = 0; j < ARRAY_SIZE(address_size); ++j) {
/* Don't use ias > oas as it is not valid for stage-2. */
for (k = 0; k <= j; ++k) {
cfg.pgsize_bitmap = pgsize[i];
cfg.ias = address_size[k];
cfg.oas = address_size[j];
pr_info("selftest: pgsize_bitmap 0x%08lx, IAS %u OAS %u\n",
pgsize[i], cfg.ias, cfg.oas);
if (arm_lpae_run_tests(&cfg))
fail++;
else
pass++;
}
}
}
pr_info("selftest: completed with %d PASS %d FAIL\n", pass, fail);
faux_device_destroy(dev);
return fail ? -EFAULT : 0;
}
subsys_initcall(arm_lpae_do_selftests);
#endif