2024-03-27 14:59:09 -06:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/mman.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/vmalloc.h>
|
|
|
|
#include <linux/io_uring.h>
|
|
|
|
#include <linux/io_uring_types.h>
|
|
|
|
#include <asm/shmparam.h>
|
|
|
|
|
|
|
|
#include "memmap.h"
|
|
|
|
#include "kbuf.h"
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
#include "rsrc.h"
|
2024-03-27 14:59:09 -06:00
|
|
|
|
|
|
|
static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
|
|
|
|
size_t size, gfp_t gfp)
|
|
|
|
{
|
|
|
|
struct page *page;
|
|
|
|
int i, order;
|
|
|
|
|
|
|
|
order = get_order(size);
|
|
|
|
if (order > MAX_PAGE_ORDER)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
else if (order)
|
|
|
|
gfp |= __GFP_COMP;
|
|
|
|
|
|
|
|
page = alloc_pages(gfp, order);
|
|
|
|
if (!page)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
for (i = 0; i < nr_pages; i++)
|
|
|
|
pages[i] = page + i;
|
|
|
|
|
|
|
|
return page_address(page);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
|
|
|
|
{
|
|
|
|
unsigned long start, end, nr_pages;
|
|
|
|
struct page **pages;
|
|
|
|
int ret;
|
|
|
|
|
2024-11-26 00:34:18 +00:00
|
|
|
if (check_add_overflow(uaddr, len, &end))
|
|
|
|
return ERR_PTR(-EOVERFLOW);
|
|
|
|
if (check_add_overflow(end, PAGE_SIZE - 1, &end))
|
|
|
|
return ERR_PTR(-EOVERFLOW);
|
|
|
|
|
|
|
|
end = end >> PAGE_SHIFT;
|
2024-03-27 14:59:09 -06:00
|
|
|
start = uaddr >> PAGE_SHIFT;
|
|
|
|
nr_pages = end - start;
|
|
|
|
if (WARN_ON_ONCE(!nr_pages))
|
|
|
|
return ERR_PTR(-EINVAL);
|
2024-11-15 16:54:38 +00:00
|
|
|
if (WARN_ON_ONCE(nr_pages > INT_MAX))
|
|
|
|
return ERR_PTR(-EOVERFLOW);
|
2024-03-27 14:59:09 -06:00
|
|
|
|
|
|
|
pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
|
|
|
|
if (!pages)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
|
|
|
|
pages);
|
|
|
|
/* success, mapped all pages */
|
|
|
|
if (ret == nr_pages) {
|
|
|
|
*npages = nr_pages;
|
|
|
|
return pages;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* partial map, or didn't map anything */
|
|
|
|
if (ret >= 0) {
|
|
|
|
/* if we did partial map, release any pages we did get */
|
|
|
|
if (ret)
|
|
|
|
unpin_user_pages(pages, ret);
|
|
|
|
ret = -EFAULT;
|
|
|
|
}
|
|
|
|
kvfree(pages);
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
|
2024-11-29 13:34:24 +00:00
|
|
|
enum {
|
|
|
|
/* memory was vmap'ed for the kernel, freeing the region vunmap's it */
|
|
|
|
IO_REGION_F_VMAP = 1,
|
2024-11-29 13:34:25 +00:00
|
|
|
/* memory is provided by user and pinned by the kernel */
|
|
|
|
IO_REGION_F_USER_PROVIDED = 2,
|
2024-11-29 13:34:30 +00:00
|
|
|
/* only the first page in the array is ref'ed */
|
|
|
|
IO_REGION_F_SINGLE_REF = 4,
|
2024-11-29 13:34:24 +00:00
|
|
|
};
|
|
|
|
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
|
|
|
|
{
|
|
|
|
if (mr->pages) {
|
2024-11-29 13:34:30 +00:00
|
|
|
long nr_refs = mr->nr_pages;
|
|
|
|
|
|
|
|
if (mr->flags & IO_REGION_F_SINGLE_REF)
|
|
|
|
nr_refs = 1;
|
|
|
|
|
2024-11-29 13:34:25 +00:00
|
|
|
if (mr->flags & IO_REGION_F_USER_PROVIDED)
|
2024-11-29 13:34:30 +00:00
|
|
|
unpin_user_pages(mr->pages, nr_refs);
|
2024-11-29 13:34:25 +00:00
|
|
|
else
|
2024-11-29 13:34:30 +00:00
|
|
|
release_pages(mr->pages, nr_refs);
|
|
|
|
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
kvfree(mr->pages);
|
|
|
|
}
|
2024-11-29 13:34:24 +00:00
|
|
|
if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr)
|
|
|
|
vunmap(mr->ptr);
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
if (mr->nr_pages && ctx->user)
|
|
|
|
__io_unaccount_mem(ctx->user, mr->nr_pages);
|
|
|
|
|
|
|
|
memset(mr, 0, sizeof(*mr));
|
|
|
|
}
|
|
|
|
|
2024-11-29 13:34:28 +00:00
|
|
|
static int io_region_init_ptr(struct io_mapped_region *mr)
|
|
|
|
{
|
|
|
|
struct io_imu_folio_data ifd;
|
|
|
|
void *ptr;
|
|
|
|
|
|
|
|
if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) {
|
|
|
|
if (ifd.nr_folios == 1) {
|
|
|
|
mr->ptr = page_address(mr->pages[0]);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ptr = vmap(mr->pages, mr->nr_pages, VM_MAP, PAGE_KERNEL);
|
|
|
|
if (!ptr)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
mr->ptr = ptr;
|
|
|
|
mr->flags |= IO_REGION_F_VMAP;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2024-11-29 13:34:29 +00:00
|
|
|
static int io_region_pin_pages(struct io_ring_ctx *ctx,
|
|
|
|
struct io_mapped_region *mr,
|
|
|
|
struct io_uring_region_desc *reg)
|
|
|
|
{
|
|
|
|
unsigned long size = mr->nr_pages << PAGE_SHIFT;
|
|
|
|
struct page **pages;
|
|
|
|
int nr_pages;
|
|
|
|
|
|
|
|
pages = io_pin_pages(reg->user_addr, size, &nr_pages);
|
|
|
|
if (IS_ERR(pages))
|
|
|
|
return PTR_ERR(pages);
|
|
|
|
if (WARN_ON_ONCE(nr_pages != mr->nr_pages))
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
mr->pages = pages;
|
|
|
|
mr->flags |= IO_REGION_F_USER_PROVIDED;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2024-11-29 13:34:31 +00:00
|
|
|
static int io_region_allocate_pages(struct io_ring_ctx *ctx,
|
|
|
|
struct io_mapped_region *mr,
|
2024-11-29 13:34:32 +00:00
|
|
|
struct io_uring_region_desc *reg,
|
|
|
|
unsigned long mmap_offset)
|
2024-11-29 13:34:31 +00:00
|
|
|
{
|
|
|
|
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
|
|
|
|
unsigned long size = mr->nr_pages << PAGE_SHIFT;
|
|
|
|
unsigned long nr_allocated;
|
|
|
|
struct page **pages;
|
|
|
|
void *p;
|
|
|
|
|
|
|
|
pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp);
|
|
|
|
if (!pages)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp);
|
|
|
|
if (!IS_ERR(p)) {
|
|
|
|
mr->flags |= IO_REGION_F_SINGLE_REF;
|
2024-11-29 13:34:32 +00:00
|
|
|
goto done;
|
2024-11-29 13:34:31 +00:00
|
|
|
}
|
|
|
|
|
The various patchsets are summarized below. Plus of course many
indivudual patches which are described in their changelogs.
- "Allocate and free frozen pages" from Matthew Wilcox reorganizes the
page allocator so we end up with the ability to allocate and free
zero-refcount pages. So that callers (ie, slab) can avoid a refcount
inc & dec.
- "Support large folios for tmpfs" from Baolin Wang teaches tmpfs to use
large folios other than PMD-sized ones.
- "Fix mm/rodata_test" from Petr Tesarik performs some maintenance and
fixes for this small built-in kernel selftest.
- "mas_anode_descend() related cleanup" from Wei Yang tidies up part of
the mapletree code.
- "mm: fix format issues and param types" from Keren Sun implements a
few minor code cleanups.
- "simplify split calculation" from Wei Yang provides a few fixes and a
test for the mapletree code.
- "mm/vma: make more mmap logic userland testable" from Lorenzo Stoakes
continues the work of moving vma-related code into the (relatively) new
mm/vma.c.
- "mm/page_alloc: gfp flags cleanups for alloc_contig_*()" from David
Hildenbrand cleans up and rationalizes handling of gfp flags in the page
allocator.
- "readahead: Reintroduce fix for improper RA window sizing" from Jan
Kara is a second attempt at fixing a readahead window sizing issue. It
should reduce the amount of unnecessary reading.
- "synchronously scan and reclaim empty user PTE pages" from Qi Zheng
addresses an issue where "huge" amounts of pte pagetables are
accumulated
(https://lore.kernel.org/lkml/cover.1718267194.git.zhengqi.arch@bytedance.com/).
Qi's series addresses this windup by synchronously freeing PTE memory
within the context of madvise(MADV_DONTNEED).
- "selftest/mm: Remove warnings found by adding compiler flags" from
Muhammad Usama Anjum fixes some build warnings in the selftests code
when optional compiler warnings are enabled.
- "mm: don't use __GFP_HARDWALL when migrating remote pages" from David
Hildenbrand tightens the allocator's observance of __GFP_HARDWALL.
- "pkeys kselftests improvements" from Kevin Brodsky implements various
fixes and cleanups in the MM selftests code, mainly pertaining to the
pkeys tests.
- "mm/damon: add sample modules" from SeongJae Park enhances DAMON to
estimate application working set size.
- "memcg/hugetlb: Rework memcg hugetlb charging" from Joshua Hahn
provides some cleanups to memcg's hugetlb charging logic.
- "mm/swap_cgroup: remove global swap cgroup lock" from Kairui Song
removes the global swap cgroup lock. A speedup of 10% for a tmpfs-based
kernel build was demonstrated.
- "zram: split page type read/write handling" from Sergey Senozhatsky
has several fixes and cleaups for zram in the area of zram_write_page().
A watchdog softlockup warning was eliminated.
- "move pagetable_*_dtor() to __tlb_remove_table()" from Kevin Brodsky
cleans up the pagetable destructor implementations. A rare
use-after-free race is fixed.
- "mm/debug: introduce and use VM_WARN_ON_VMG()" from Lorenzo Stoakes
simplifies and cleans up the debugging code in the VMA merging logic.
- "Account page tables at all levels" from Kevin Brodsky cleans up and
regularizes the pagetable ctor/dtor handling. This results in
improvements in accounting accuracy.
- "mm/damon: replace most damon_callback usages in sysfs with new core
functions" from SeongJae Park cleans up and generalizes DAMON's sysfs
file interface logic.
- "mm/damon: enable page level properties based monitoring" from
SeongJae Park increases the amount of information which is presented in
response to DAMOS actions.
- "mm/damon: remove DAMON debugfs interface" from SeongJae Park removes
DAMON's long-deprecated debugfs interfaces. Thus the migration to sysfs
is completed.
- "mm/hugetlb: Refactor hugetlb allocation resv accounting" from Peter
Xu cleans up and generalizes the hugetlb reservation accounting.
- "mm: alloc_pages_bulk: small API refactor" from Luiz Capitulino
removes a never-used feature of the alloc_pages_bulk() interface.
- "mm/damon: extend DAMOS filters for inclusion" from SeongJae Park
extends DAMOS filters to support not only exclusion (rejecting), but
also inclusion (allowing) behavior.
- "Add zpdesc memory descriptor for zswap.zpool" from Alex Shi
"introduces a new memory descriptor for zswap.zpool that currently
overlaps with struct page for now. This is part of the effort to reduce
the size of struct page and to enable dynamic allocation of memory
descriptors."
- "mm, swap: rework of swap allocator locks" from Kairui Song redoes and
simplifies the swap allocator locking. A speedup of 400% was
demonstrated for one workload. As was a 35% reduction for kernel build
time with swap-on-zram.
- "mm: update mips to use do_mmap(), make mmap_region() internal" from
Lorenzo Stoakes reworks MIPS's use of mmap_region() so that
mmap_region() can be made MM-internal.
- "mm/mglru: performance optimizations" from Yu Zhao fixes a few MGLRU
regressions and otherwise improves MGLRU performance.
- "Docs/mm/damon: add tuning guide and misc updates" from SeongJae Park
updates DAMON documentation.
- "Cleanup for memfd_create()" from Isaac Manjarres does that thing.
- "mm: hugetlb+THP folio and migration cleanups" from David Hildenbrand
provides various cleanups in the areas of hugetlb folios, THP folios and
migration.
- "Uncached buffered IO" from Jens Axboe implements the new
RWF_DONTCACHE flag which provides synchronous dropbehind for pagecache
reading and writing. To permite userspace to address issues with
massive buildup of useless pagecache when reading/writing fast devices.
- "selftests/mm: virtual_address_range: Reduce memory" from Thomas
Weißschuh fixes and optimizes some of the MM selftests.
-----BEGIN PGP SIGNATURE-----
iHUEABYKAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZ5a+cwAKCRDdBJ7gKXxA
jtoyAP9R58oaOKPJuTizEKKXvh/RpMyD6sYcz/uPpnf+cKTZxQEAqfVznfWlw/Lz
uC3KRZYhmd5YrxU4o+qjbzp9XWX/xAE=
=Ib2s
-----END PGP SIGNATURE-----
Merge tag 'mm-stable-2025-01-26-14-59' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
"The various patchsets are summarized below. Plus of course many
indivudual patches which are described in their changelogs.
- "Allocate and free frozen pages" from Matthew Wilcox reorganizes
the page allocator so we end up with the ability to allocate and
free zero-refcount pages. So that callers (ie, slab) can avoid a
refcount inc & dec
- "Support large folios for tmpfs" from Baolin Wang teaches tmpfs to
use large folios other than PMD-sized ones
- "Fix mm/rodata_test" from Petr Tesarik performs some maintenance
and fixes for this small built-in kernel selftest
- "mas_anode_descend() related cleanup" from Wei Yang tidies up part
of the mapletree code
- "mm: fix format issues and param types" from Keren Sun implements a
few minor code cleanups
- "simplify split calculation" from Wei Yang provides a few fixes and
a test for the mapletree code
- "mm/vma: make more mmap logic userland testable" from Lorenzo
Stoakes continues the work of moving vma-related code into the
(relatively) new mm/vma.c
- "mm/page_alloc: gfp flags cleanups for alloc_contig_*()" from David
Hildenbrand cleans up and rationalizes handling of gfp flags in the
page allocator
- "readahead: Reintroduce fix for improper RA window sizing" from Jan
Kara is a second attempt at fixing a readahead window sizing issue.
It should reduce the amount of unnecessary reading
- "synchronously scan and reclaim empty user PTE pages" from Qi Zheng
addresses an issue where "huge" amounts of pte pagetables are
accumulated:
https://lore.kernel.org/lkml/cover.1718267194.git.zhengqi.arch@bytedance.com/
Qi's series addresses this windup by synchronously freeing PTE
memory within the context of madvise(MADV_DONTNEED)
- "selftest/mm: Remove warnings found by adding compiler flags" from
Muhammad Usama Anjum fixes some build warnings in the selftests
code when optional compiler warnings are enabled
- "mm: don't use __GFP_HARDWALL when migrating remote pages" from
David Hildenbrand tightens the allocator's observance of
__GFP_HARDWALL
- "pkeys kselftests improvements" from Kevin Brodsky implements
various fixes and cleanups in the MM selftests code, mainly
pertaining to the pkeys tests
- "mm/damon: add sample modules" from SeongJae Park enhances DAMON to
estimate application working set size
- "memcg/hugetlb: Rework memcg hugetlb charging" from Joshua Hahn
provides some cleanups to memcg's hugetlb charging logic
- "mm/swap_cgroup: remove global swap cgroup lock" from Kairui Song
removes the global swap cgroup lock. A speedup of 10% for a
tmpfs-based kernel build was demonstrated
- "zram: split page type read/write handling" from Sergey Senozhatsky
has several fixes and cleaups for zram in the area of
zram_write_page(). A watchdog softlockup warning was eliminated
- "move pagetable_*_dtor() to __tlb_remove_table()" from Kevin
Brodsky cleans up the pagetable destructor implementations. A rare
use-after-free race is fixed
- "mm/debug: introduce and use VM_WARN_ON_VMG()" from Lorenzo Stoakes
simplifies and cleans up the debugging code in the VMA merging
logic
- "Account page tables at all levels" from Kevin Brodsky cleans up
and regularizes the pagetable ctor/dtor handling. This results in
improvements in accounting accuracy
- "mm/damon: replace most damon_callback usages in sysfs with new
core functions" from SeongJae Park cleans up and generalizes
DAMON's sysfs file interface logic
- "mm/damon: enable page level properties based monitoring" from
SeongJae Park increases the amount of information which is
presented in response to DAMOS actions
- "mm/damon: remove DAMON debugfs interface" from SeongJae Park
removes DAMON's long-deprecated debugfs interfaces. Thus the
migration to sysfs is completed
- "mm/hugetlb: Refactor hugetlb allocation resv accounting" from
Peter Xu cleans up and generalizes the hugetlb reservation
accounting
- "mm: alloc_pages_bulk: small API refactor" from Luiz Capitulino
removes a never-used feature of the alloc_pages_bulk() interface
- "mm/damon: extend DAMOS filters for inclusion" from SeongJae Park
extends DAMOS filters to support not only exclusion (rejecting),
but also inclusion (allowing) behavior
- "Add zpdesc memory descriptor for zswap.zpool" from Alex Shi
introduces a new memory descriptor for zswap.zpool that currently
overlaps with struct page for now. This is part of the effort to
reduce the size of struct page and to enable dynamic allocation of
memory descriptors
- "mm, swap: rework of swap allocator locks" from Kairui Song redoes
and simplifies the swap allocator locking. A speedup of 400% was
demonstrated for one workload. As was a 35% reduction for kernel
build time with swap-on-zram
- "mm: update mips to use do_mmap(), make mmap_region() internal"
from Lorenzo Stoakes reworks MIPS's use of mmap_region() so that
mmap_region() can be made MM-internal
- "mm/mglru: performance optimizations" from Yu Zhao fixes a few
MGLRU regressions and otherwise improves MGLRU performance
- "Docs/mm/damon: add tuning guide and misc updates" from SeongJae
Park updates DAMON documentation
- "Cleanup for memfd_create()" from Isaac Manjarres does that thing
- "mm: hugetlb+THP folio and migration cleanups" from David
Hildenbrand provides various cleanups in the areas of hugetlb
folios, THP folios and migration
- "Uncached buffered IO" from Jens Axboe implements the new
RWF_DONTCACHE flag which provides synchronous dropbehind for
pagecache reading and writing. To permite userspace to address
issues with massive buildup of useless pagecache when
reading/writing fast devices
- "selftests/mm: virtual_address_range: Reduce memory" from Thomas
Weißschuh fixes and optimizes some of the MM selftests"
* tag 'mm-stable-2025-01-26-14-59' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (321 commits)
mm/compaction: fix UBSAN shift-out-of-bounds warning
s390/mm: add missing ctor/dtor on page table upgrade
kasan: sw_tags: use str_on_off() helper in kasan_init_sw_tags()
tools: add VM_WARN_ON_VMG definition
mm/damon/core: use str_high_low() helper in damos_wmark_wait_us()
seqlock: add missing parameter documentation for raw_seqcount_try_begin()
mm/page-writeback: consolidate wb_thresh bumping logic into __wb_calc_thresh
mm/page_alloc: remove the incorrect and misleading comment
zram: remove zcomp_stream_put() from write_incompressible_page()
mm: separate move/undo parts from migrate_pages_batch()
mm/kfence: use str_write_read() helper in get_access_type()
selftests/mm/mkdirty: fix memory leak in test_uffdio_copy()
kasan: hw_tags: Use str_on_off() helper in kasan_init_hw_tags()
selftests/mm: virtual_address_range: avoid reading from VM_IO mappings
selftests/mm: vm_util: split up /proc/self/smaps parsing
selftests/mm: virtual_address_range: unmap chunks after validation
selftests/mm: virtual_address_range: mmap() without PROT_WRITE
selftests/memfd/memfd_test: fix possible NULL pointer dereference
mm: add FGP_DONTCACHE folio creation flag
mm: call filemap_fdatawrite_range_kick() after IOCB_DONTCACHE issue
...
2025-01-26 18:36:23 -08:00
|
|
|
nr_allocated = alloc_pages_bulk_node(gfp, NUMA_NO_NODE,
|
|
|
|
mr->nr_pages, pages);
|
2024-11-29 13:34:31 +00:00
|
|
|
if (nr_allocated != mr->nr_pages) {
|
|
|
|
if (nr_allocated)
|
|
|
|
release_pages(pages, nr_allocated);
|
|
|
|
kvfree(pages);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
2024-11-29 13:34:32 +00:00
|
|
|
done:
|
|
|
|
reg->mmap_offset = mmap_offset;
|
2024-11-29 13:34:31 +00:00
|
|
|
mr->pages = pages;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
|
2024-11-29 13:34:32 +00:00
|
|
|
struct io_uring_region_desc *reg,
|
|
|
|
unsigned long mmap_offset)
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
{
|
|
|
|
int nr_pages, ret;
|
|
|
|
u64 end;
|
|
|
|
|
2024-11-29 13:34:24 +00:00
|
|
|
if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages))
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
return -EFAULT;
|
|
|
|
if (memchr_inv(®->__resv, 0, sizeof(reg->__resv)))
|
|
|
|
return -EINVAL;
|
2024-11-29 13:34:31 +00:00
|
|
|
if (reg->flags & ~IORING_MEM_REGION_TYPE_USER)
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
return -EINVAL;
|
2024-11-29 13:34:31 +00:00
|
|
|
/* user_addr should be set IFF it's a user memory backed region */
|
|
|
|
if ((reg->flags & IORING_MEM_REGION_TYPE_USER) != !!reg->user_addr)
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
return -EFAULT;
|
|
|
|
if (!reg->size || reg->mmap_offset || reg->id)
|
|
|
|
return -EINVAL;
|
|
|
|
if ((reg->size >> PAGE_SHIFT) > INT_MAX)
|
2024-11-20 12:15:05 +03:00
|
|
|
return -E2BIG;
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
if ((reg->user_addr | reg->size) & ~PAGE_MASK)
|
|
|
|
return -EINVAL;
|
|
|
|
if (check_add_overflow(reg->user_addr, reg->size, &end))
|
|
|
|
return -EOVERFLOW;
|
|
|
|
|
2024-11-29 13:34:26 +00:00
|
|
|
nr_pages = reg->size >> PAGE_SHIFT;
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
if (ctx->user) {
|
|
|
|
ret = __io_account_mem(ctx->user, nr_pages);
|
|
|
|
if (ret)
|
2024-11-29 13:34:26 +00:00
|
|
|
return ret;
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
}
|
2024-11-29 13:34:27 +00:00
|
|
|
mr->nr_pages = nr_pages;
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
|
2024-11-29 13:34:31 +00:00
|
|
|
if (reg->flags & IORING_MEM_REGION_TYPE_USER)
|
|
|
|
ret = io_region_pin_pages(ctx, mr, reg);
|
|
|
|
else
|
2024-11-29 13:34:32 +00:00
|
|
|
ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset);
|
2024-11-29 13:34:29 +00:00
|
|
|
if (ret)
|
2024-11-29 13:34:26 +00:00
|
|
|
goto out_free;
|
|
|
|
|
2024-11-29 13:34:28 +00:00
|
|
|
ret = io_region_init_ptr(mr);
|
|
|
|
if (ret)
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
goto out_free;
|
|
|
|
return 0;
|
|
|
|
out_free:
|
2024-11-29 13:34:27 +00:00
|
|
|
io_free_region(ctx, mr);
|
io_uring: introduce concept of memory regions
We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)
Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.
Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-11-15 16:54:41 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2024-11-29 13:34:32 +00:00
|
|
|
int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
|
|
|
|
struct io_uring_region_desc *reg,
|
|
|
|
unsigned long mmap_offset)
|
|
|
|
{
|
|
|
|
struct io_mapped_region tmp_mr;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
memcpy(&tmp_mr, mr, sizeof(tmp_mr));
|
|
|
|
ret = io_create_region(ctx, &tmp_mr, reg, mmap_offset);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Once published mmap can find it without holding only the ->mmap_lock
|
|
|
|
* and not ->uring_lock.
|
|
|
|
*/
|
|
|
|
guard(mutex)(&ctx->mmap_lock);
|
|
|
|
memcpy(mr, &tmp_mr, sizeof(tmp_mr));
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2024-11-29 13:34:39 +00:00
|
|
|
static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
|
|
|
|
loff_t pgoff)
|
|
|
|
{
|
|
|
|
loff_t offset = pgoff << PAGE_SHIFT;
|
|
|
|
unsigned int bgid;
|
|
|
|
|
|
|
|
switch (offset & IORING_OFF_MMAP_MASK) {
|
|
|
|
case IORING_OFF_SQ_RING:
|
|
|
|
case IORING_OFF_CQ_RING:
|
|
|
|
return &ctx->ring_region;
|
|
|
|
case IORING_OFF_SQES:
|
|
|
|
return &ctx->sq_region;
|
|
|
|
case IORING_OFF_PBUF_RING:
|
|
|
|
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
|
|
|
|
return io_pbuf_get_region(ctx, bgid);
|
|
|
|
case IORING_MAP_OFF_PARAM_REGION:
|
|
|
|
return &ctx->param_region;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2024-11-29 13:34:32 +00:00
|
|
|
static void *io_region_validate_mmap(struct io_ring_ctx *ctx,
|
|
|
|
struct io_mapped_region *mr)
|
|
|
|
{
|
|
|
|
lockdep_assert_held(&ctx->mmap_lock);
|
|
|
|
|
|
|
|
if (!io_region_is_set(mr))
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
if (mr->flags & IO_REGION_F_USER_PROVIDED)
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
|
|
|
return io_region_get_ptr(mr);
|
|
|
|
}
|
|
|
|
|
2024-03-27 14:59:09 -06:00
|
|
|
static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
|
|
|
|
size_t sz)
|
|
|
|
{
|
|
|
|
struct io_ring_ctx *ctx = file->private_data;
|
2024-11-29 13:34:39 +00:00
|
|
|
struct io_mapped_region *region;
|
2024-03-27 14:59:09 -06:00
|
|
|
|
2024-11-29 13:34:39 +00:00
|
|
|
region = io_mmap_get_region(ctx, pgoff);
|
|
|
|
if (!region)
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
return io_region_validate_mmap(ctx, region);
|
2024-03-27 14:59:09 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_MMU
|
|
|
|
|
2024-11-29 13:34:32 +00:00
|
|
|
static int io_region_mmap(struct io_ring_ctx *ctx,
|
|
|
|
struct io_mapped_region *mr,
|
2024-11-29 13:34:35 +00:00
|
|
|
struct vm_area_struct *vma,
|
|
|
|
unsigned max_pages)
|
2024-11-29 13:34:32 +00:00
|
|
|
{
|
2024-11-29 13:34:35 +00:00
|
|
|
unsigned long nr_pages = min(mr->nr_pages, max_pages);
|
2024-11-29 13:34:32 +00:00
|
|
|
|
|
|
|
vm_flags_set(vma, VM_DONTEXPAND);
|
|
|
|
return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages);
|
|
|
|
}
|
|
|
|
|
2024-03-27 14:59:09 -06:00
|
|
|
__cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
struct io_ring_ctx *ctx = file->private_data;
|
|
|
|
size_t sz = vma->vm_end - vma->vm_start;
|
|
|
|
long offset = vma->vm_pgoff << PAGE_SHIFT;
|
2024-11-29 13:34:39 +00:00
|
|
|
unsigned int page_limit = UINT_MAX;
|
|
|
|
struct io_mapped_region *region;
|
2024-03-27 14:59:09 -06:00
|
|
|
void *ptr;
|
|
|
|
|
2024-11-29 13:34:22 +00:00
|
|
|
guard(mutex)(&ctx->mmap_lock);
|
io_uring/register: add IORING_REGISTER_RESIZE_RINGS
Once a ring has been created, the size of the CQ and SQ rings are fixed.
Usually this isn't a problem on the SQ ring side, as it merely controls
the available number of requests that can be submitted in a single
system call, and there's rarely a need to change that.
For the CQ ring, it's a different story. For most efficient use of
io_uring, it's important that the CQ ring never overflows. This means
that applications must size it for the worst case scenario, which can
be wasteful.
Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
the existing rings. It takes a struct io_uring_params argument, the same
one which is used to setup the ring initially, and resizes rings
according to the sizes given.
Certain properties are always inherited from the original ring setup,
like SQE128/CQE32 and other setup options. The implementation only
allows flag associated with how the CQ ring is sized and clamped.
Existing unconsumed SQE and CQE entries are copied as part of the
process. If either the SQ or CQ resized destination ring cannot hold the
entries already present in the source rings, then the operation is failed
with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new
submissions, and the internal mapping holds the completion lock as well
across moving CQ ring state.
To prevent races between mmap and ring resizing, add a mutex that's
solely used to serialize ring resize and mmap. mmap_sem can't be used
here, as as fork'ed process may be doing mmaps on the ring as well.
The ctx->resize_lock is held across mmap operations, and the resize
will grab it before swapping out the already mapped new data.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-21 13:34:10 -06:00
|
|
|
|
2024-03-27 14:59:09 -06:00
|
|
|
ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
|
|
|
|
if (IS_ERR(ptr))
|
|
|
|
return PTR_ERR(ptr);
|
|
|
|
|
|
|
|
switch (offset & IORING_OFF_MMAP_MASK) {
|
|
|
|
case IORING_OFF_SQ_RING:
|
|
|
|
case IORING_OFF_CQ_RING:
|
2024-11-29 13:34:35 +00:00
|
|
|
page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
2024-11-29 13:34:39 +00:00
|
|
|
break;
|
2024-03-27 14:59:09 -06:00
|
|
|
}
|
|
|
|
|
2024-11-29 13:34:39 +00:00
|
|
|
region = io_mmap_get_region(ctx, vma->vm_pgoff);
|
|
|
|
return io_region_mmap(ctx, region, vma, page_limit);
|
2024-03-27 14:59:09 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long pgoff,
|
|
|
|
unsigned long flags)
|
|
|
|
{
|
io_uring/register: add IORING_REGISTER_RESIZE_RINGS
Once a ring has been created, the size of the CQ and SQ rings are fixed.
Usually this isn't a problem on the SQ ring side, as it merely controls
the available number of requests that can be submitted in a single
system call, and there's rarely a need to change that.
For the CQ ring, it's a different story. For most efficient use of
io_uring, it's important that the CQ ring never overflows. This means
that applications must size it for the worst case scenario, which can
be wasteful.
Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
the existing rings. It takes a struct io_uring_params argument, the same
one which is used to setup the ring initially, and resizes rings
according to the sizes given.
Certain properties are always inherited from the original ring setup,
like SQE128/CQE32 and other setup options. The implementation only
allows flag associated with how the CQ ring is sized and clamped.
Existing unconsumed SQE and CQE entries are copied as part of the
process. If either the SQ or CQ resized destination ring cannot hold the
entries already present in the source rings, then the operation is failed
with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new
submissions, and the internal mapping holds the completion lock as well
across moving CQ ring state.
To prevent races between mmap and ring resizing, add a mutex that's
solely used to serialize ring resize and mmap. mmap_sem can't be used
here, as as fork'ed process may be doing mmaps on the ring as well.
The ctx->resize_lock is held across mmap operations, and the resize
will grab it before swapping out the already mapped new data.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-21 13:34:10 -06:00
|
|
|
struct io_ring_ctx *ctx = filp->private_data;
|
2024-03-27 14:59:09 -06:00
|
|
|
void *ptr;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do not allow to map to user-provided address to avoid breaking the
|
|
|
|
* aliasing rules. Userspace is not able to guess the offset address of
|
|
|
|
* kernel kmalloc()ed memory area.
|
|
|
|
*/
|
|
|
|
if (addr)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2024-11-29 13:34:22 +00:00
|
|
|
guard(mutex)(&ctx->mmap_lock);
|
io_uring/register: add IORING_REGISTER_RESIZE_RINGS
Once a ring has been created, the size of the CQ and SQ rings are fixed.
Usually this isn't a problem on the SQ ring side, as it merely controls
the available number of requests that can be submitted in a single
system call, and there's rarely a need to change that.
For the CQ ring, it's a different story. For most efficient use of
io_uring, it's important that the CQ ring never overflows. This means
that applications must size it for the worst case scenario, which can
be wasteful.
Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
the existing rings. It takes a struct io_uring_params argument, the same
one which is used to setup the ring initially, and resizes rings
according to the sizes given.
Certain properties are always inherited from the original ring setup,
like SQE128/CQE32 and other setup options. The implementation only
allows flag associated with how the CQ ring is sized and clamped.
Existing unconsumed SQE and CQE entries are copied as part of the
process. If either the SQ or CQ resized destination ring cannot hold the
entries already present in the source rings, then the operation is failed
with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new
submissions, and the internal mapping holds the completion lock as well
across moving CQ ring state.
To prevent races between mmap and ring resizing, add a mutex that's
solely used to serialize ring resize and mmap. mmap_sem can't be used
here, as as fork'ed process may be doing mmaps on the ring as well.
The ctx->resize_lock is held across mmap operations, and the resize
will grab it before swapping out the already mapped new data.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-21 13:34:10 -06:00
|
|
|
|
2024-03-27 14:59:09 -06:00
|
|
|
ptr = io_uring_validate_mmap_request(filp, pgoff, len);
|
|
|
|
if (IS_ERR(ptr))
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Some architectures have strong cache aliasing requirements.
|
|
|
|
* For such architectures we need a coherent mapping which aliases
|
|
|
|
* kernel memory *and* userspace memory. To achieve that:
|
|
|
|
* - use a NULL file pointer to reference physical memory, and
|
|
|
|
* - use the kernel virtual address of the shared io_uring context
|
|
|
|
* (instead of the userspace-provided address, which has to be 0UL
|
|
|
|
* anyway).
|
|
|
|
* - use the same pgoff which the get_unmapped_area() uses to
|
|
|
|
* calculate the page colouring.
|
|
|
|
* For architectures without such aliasing requirements, the
|
|
|
|
* architecture will return any suitable mapping because addr is 0.
|
|
|
|
*/
|
|
|
|
filp = NULL;
|
|
|
|
flags |= MAP_SHARED;
|
|
|
|
pgoff = 0; /* has been translated to ptr above */
|
|
|
|
#ifdef SHM_COLOUR
|
|
|
|
addr = (uintptr_t) ptr;
|
|
|
|
pgoff = addr >> PAGE_SHIFT;
|
|
|
|
#else
|
|
|
|
addr = 0UL;
|
|
|
|
#endif
|
The usual shower of singleton fixes and minor series all over MM,
documented (hopefully adequately) in the respective changelogs. Notable
series include:
- Lucas Stach has provided some page-mapping
cleanup/consolidation/maintainability work in the series "mm/treewide:
Remove pXd_huge() API".
- In the series "Allow migrate on protnone reference with
MPOL_PREFERRED_MANY policy", Donet Tom has optimized mempolicy's
MPOL_PREFERRED_MANY mode, yielding almost doubled performance in one
test.
- In their series "Memory allocation profiling" Kent Overstreet and
Suren Baghdasaryan have contributed a means of determining (via
/proc/allocinfo) whereabouts in the kernel memory is being allocated:
number of calls and amount of memory.
- Matthew Wilcox has provided the series "Various significant MM
patches" which does a number of rather unrelated things, but in largely
similar code sites.
- In his series "mm: page_alloc: freelist migratetype hygiene" Johannes
Weiner has fixed the page allocator's handling of migratetype requests,
with resulting improvements in compaction efficiency.
- In the series "make the hugetlb migration strategy consistent" Baolin
Wang has fixed a hugetlb migration issue, which should improve hugetlb
allocation reliability.
- Liu Shixin has hit an I/O meltdown caused by readahead in a
memory-tight memcg. Addressed in the series "Fix I/O high when memory
almost met memcg limit".
- In the series "mm/filemap: optimize folio adding and splitting" Kairui
Song has optimized pagecache insertion, yielding ~10% performance
improvement in one test.
- Baoquan He has cleaned up and consolidated the early zone
initialization code in the series "mm/mm_init.c: refactor
free_area_init_core()".
- Baoquan has also redone some MM initializatio code in the series
"mm/init: minor clean up and improvement".
- MM helper cleanups from Christoph Hellwig in his series "remove
follow_pfn".
- More cleanups from Matthew Wilcox in the series "Various page->flags
cleanups".
- Vlastimil Babka has contributed maintainability improvements in the
series "memcg_kmem hooks refactoring".
- More folio conversions and cleanups in Matthew Wilcox's series
"Convert huge_zero_page to huge_zero_folio"
"khugepaged folio conversions"
"Remove page_idle and page_young wrappers"
"Use folio APIs in procfs"
"Clean up __folio_put()"
"Some cleanups for memory-failure"
"Remove page_mapping()"
"More folio compat code removal"
- David Hildenbrand chipped in with "fs/proc/task_mmu: convert hugetlb
functions to work on folis".
- Code consolidation and cleanup work related to GUP's handling of
hugetlbs in Peter Xu's series "mm/gup: Unify hugetlb, part 2".
- Rick Edgecombe has developed some fixes to stack guard gaps in the
series "Cover a guard gap corner case".
- Jinjiang Tu has fixed KSM's behaviour after a fork+exec in the series
"mm/ksm: fix ksm exec support for prctl".
- Baolin Wang has implemented NUMA balancing for multi-size THPs. This
is a simple first-cut implementation for now. The series is "support
multi-size THP numa balancing".
- Cleanups to vma handling helper functions from Matthew Wilcox in the
series "Unify vma_address and vma_pgoff_address".
- Some selftests maintenance work from Dev Jain in the series
"selftests/mm: mremap_test: Optimizations and style fixes".
- Improvements to the swapping of multi-size THPs from Ryan Roberts in
the series "Swap-out mTHP without splitting".
- Kefeng Wang has significantly optimized the handling of arm64's
permission page faults in the series
"arch/mm/fault: accelerate pagefault when badaccess"
"mm: remove arch's private VM_FAULT_BADMAP/BADACCESS"
- GUP cleanups from David Hildenbrand in "mm/gup: consistently call it
GUP-fast".
- hugetlb fault code cleanups from Vishal Moola in "Hugetlb fault path to
use struct vm_fault".
- selftests build fixes from John Hubbard in the series "Fix
selftests/mm build without requiring "make headers"".
- Memory tiering fixes/improvements from Ho-Ren (Jack) Chuang in the
series "Improved Memory Tier Creation for CPUless NUMA Nodes". Fixes
the initialization code so that migration between different memory types
works as intended.
- David Hildenbrand has improved follow_pte() and fixed an errant driver
in the series "mm: follow_pte() improvements and acrn follow_pte()
fixes".
- David also did some cleanup work on large folio mapcounts in his
series "mm: mapcount for large folios + page_mapcount() cleanups".
- Folio conversions in KSM in Alex Shi's series "transfer page to folio
in KSM".
- Barry Song has added some sysfs stats for monitoring multi-size THP's
in the series "mm: add per-order mTHP alloc and swpout counters".
- Some zswap cleanups from Yosry Ahmed in the series "zswap same-filled
and limit checking cleanups".
- Matthew Wilcox has been looking at buffer_head code and found the
documentation to be lacking. The series is "Improve buffer head
documentation".
- Multi-size THPs get more work, this time from Lance Yang. His series
"mm/madvise: enhance lazyfreeing with mTHP in madvise_free" optimizes
the freeing of these things.
- Kemeng Shi has added more userspace-visible writeback instrumentation
in the series "Improve visibility of writeback".
- Kemeng Shi then sent some maintenance work on top in the series "Fix
and cleanups to page-writeback".
- Matthew Wilcox reduces mmap_lock traffic in the anon vma code in the
series "Improve anon_vma scalability for anon VMAs". Intel's test bot
reported an improbable 3x improvement in one test.
- SeongJae Park adds some DAMON feature work in the series
"mm/damon: add a DAMOS filter type for page granularity access recheck"
"selftests/damon: add DAMOS quota goal test"
- Also some maintenance work in the series
"mm/damon/paddr: simplify page level access re-check for pageout"
"mm/damon: misc fixes and improvements"
- David Hildenbrand has disabled some known-to-fail selftests ni the
series "selftests: mm: cow: flag vmsplice() hugetlb tests as XFAIL".
- memcg metadata storage optimizations from Shakeel Butt in "memcg:
reduce memory consumption by memcg stats".
- DAX fixes and maintenance work from Vishal Verma in the series
"dax/bus.c: Fixups for dax-bus locking".
-----BEGIN PGP SIGNATURE-----
iHUEABYIAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZkgQYwAKCRDdBJ7gKXxA
jrdKAP9WVJdpEcXxpoub/vVE0UWGtffr8foifi9bCwrQrGh5mgEAx7Yf0+d/oBZB
nvA4E0DcPrUAFy144FNM0NTCb7u9vAw=
=V3R/
-----END PGP SIGNATURE-----
Merge tag 'mm-stable-2024-05-17-19-19' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull mm updates from Andrew Morton:
"The usual shower of singleton fixes and minor series all over MM,
documented (hopefully adequately) in the respective changelogs.
Notable series include:
- Lucas Stach has provided some page-mapping cleanup/consolidation/
maintainability work in the series "mm/treewide: Remove pXd_huge()
API".
- In the series "Allow migrate on protnone reference with
MPOL_PREFERRED_MANY policy", Donet Tom has optimized mempolicy's
MPOL_PREFERRED_MANY mode, yielding almost doubled performance in
one test.
- In their series "Memory allocation profiling" Kent Overstreet and
Suren Baghdasaryan have contributed a means of determining (via
/proc/allocinfo) whereabouts in the kernel memory is being
allocated: number of calls and amount of memory.
- Matthew Wilcox has provided the series "Various significant MM
patches" which does a number of rather unrelated things, but in
largely similar code sites.
- In his series "mm: page_alloc: freelist migratetype hygiene"
Johannes Weiner has fixed the page allocator's handling of
migratetype requests, with resulting improvements in compaction
efficiency.
- In the series "make the hugetlb migration strategy consistent"
Baolin Wang has fixed a hugetlb migration issue, which should
improve hugetlb allocation reliability.
- Liu Shixin has hit an I/O meltdown caused by readahead in a
memory-tight memcg. Addressed in the series "Fix I/O high when
memory almost met memcg limit".
- In the series "mm/filemap: optimize folio adding and splitting"
Kairui Song has optimized pagecache insertion, yielding ~10%
performance improvement in one test.
- Baoquan He has cleaned up and consolidated the early zone
initialization code in the series "mm/mm_init.c: refactor
free_area_init_core()".
- Baoquan has also redone some MM initializatio code in the series
"mm/init: minor clean up and improvement".
- MM helper cleanups from Christoph Hellwig in his series "remove
follow_pfn".
- More cleanups from Matthew Wilcox in the series "Various
page->flags cleanups".
- Vlastimil Babka has contributed maintainability improvements in the
series "memcg_kmem hooks refactoring".
- More folio conversions and cleanups in Matthew Wilcox's series:
"Convert huge_zero_page to huge_zero_folio"
"khugepaged folio conversions"
"Remove page_idle and page_young wrappers"
"Use folio APIs in procfs"
"Clean up __folio_put()"
"Some cleanups for memory-failure"
"Remove page_mapping()"
"More folio compat code removal"
- David Hildenbrand chipped in with "fs/proc/task_mmu: convert
hugetlb functions to work on folis".
- Code consolidation and cleanup work related to GUP's handling of
hugetlbs in Peter Xu's series "mm/gup: Unify hugetlb, part 2".
- Rick Edgecombe has developed some fixes to stack guard gaps in the
series "Cover a guard gap corner case".
- Jinjiang Tu has fixed KSM's behaviour after a fork+exec in the
series "mm/ksm: fix ksm exec support for prctl".
- Baolin Wang has implemented NUMA balancing for multi-size THPs.
This is a simple first-cut implementation for now. The series is
"support multi-size THP numa balancing".
- Cleanups to vma handling helper functions from Matthew Wilcox in
the series "Unify vma_address and vma_pgoff_address".
- Some selftests maintenance work from Dev Jain in the series
"selftests/mm: mremap_test: Optimizations and style fixes".
- Improvements to the swapping of multi-size THPs from Ryan Roberts
in the series "Swap-out mTHP without splitting".
- Kefeng Wang has significantly optimized the handling of arm64's
permission page faults in the series
"arch/mm/fault: accelerate pagefault when badaccess"
"mm: remove arch's private VM_FAULT_BADMAP/BADACCESS"
- GUP cleanups from David Hildenbrand in "mm/gup: consistently call
it GUP-fast".
- hugetlb fault code cleanups from Vishal Moola in "Hugetlb fault
path to use struct vm_fault".
- selftests build fixes from John Hubbard in the series "Fix
selftests/mm build without requiring "make headers"".
- Memory tiering fixes/improvements from Ho-Ren (Jack) Chuang in the
series "Improved Memory Tier Creation for CPUless NUMA Nodes".
Fixes the initialization code so that migration between different
memory types works as intended.
- David Hildenbrand has improved follow_pte() and fixed an errant
driver in the series "mm: follow_pte() improvements and acrn
follow_pte() fixes".
- David also did some cleanup work on large folio mapcounts in his
series "mm: mapcount for large folios + page_mapcount() cleanups".
- Folio conversions in KSM in Alex Shi's series "transfer page to
folio in KSM".
- Barry Song has added some sysfs stats for monitoring multi-size
THP's in the series "mm: add per-order mTHP alloc and swpout
counters".
- Some zswap cleanups from Yosry Ahmed in the series "zswap
same-filled and limit checking cleanups".
- Matthew Wilcox has been looking at buffer_head code and found the
documentation to be lacking. The series is "Improve buffer head
documentation".
- Multi-size THPs get more work, this time from Lance Yang. His
series "mm/madvise: enhance lazyfreeing with mTHP in madvise_free"
optimizes the freeing of these things.
- Kemeng Shi has added more userspace-visible writeback
instrumentation in the series "Improve visibility of writeback".
- Kemeng Shi then sent some maintenance work on top in the series
"Fix and cleanups to page-writeback".
- Matthew Wilcox reduces mmap_lock traffic in the anon vma code in
the series "Improve anon_vma scalability for anon VMAs". Intel's
test bot reported an improbable 3x improvement in one test.
- SeongJae Park adds some DAMON feature work in the series
"mm/damon: add a DAMOS filter type for page granularity access recheck"
"selftests/damon: add DAMOS quota goal test"
- Also some maintenance work in the series
"mm/damon/paddr: simplify page level access re-check for pageout"
"mm/damon: misc fixes and improvements"
- David Hildenbrand has disabled some known-to-fail selftests ni the
series "selftests: mm: cow: flag vmsplice() hugetlb tests as
XFAIL".
- memcg metadata storage optimizations from Shakeel Butt in "memcg:
reduce memory consumption by memcg stats".
- DAX fixes and maintenance work from Vishal Verma in the series
"dax/bus.c: Fixups for dax-bus locking""
* tag 'mm-stable-2024-05-17-19-19' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (426 commits)
memcg, oom: cleanup unused memcg_oom_gfp_mask and memcg_oom_order
selftests/mm: hugetlb_madv_vs_map: avoid test skipping by querying hugepage size at runtime
mm/hugetlb: add missing VM_FAULT_SET_HINDEX in hugetlb_wp
mm/hugetlb: add missing VM_FAULT_SET_HINDEX in hugetlb_fault
selftests: cgroup: add tests to verify the zswap writeback path
mm: memcg: make alloc_mem_cgroup_per_node_info() return bool
mm/damon/core: fix return value from damos_wmark_metric_value
mm: do not update memcg stats for NR_{FILE/SHMEM}_PMDMAPPED
selftests: cgroup: remove redundant enabling of memory controller
Docs/mm/damon/maintainer-profile: allow posting patches based on damon/next tree
Docs/mm/damon/maintainer-profile: change the maintainer's timezone from PST to PT
Docs/mm/damon/design: use a list for supported filters
Docs/admin-guide/mm/damon/usage: fix wrong schemes effective quota update command
Docs/admin-guide/mm/damon/usage: fix wrong example of DAMOS filter matching sysfs file
selftests/damon: classify tests for functionalities and regressions
selftests/damon/_damon_sysfs: use 'is' instead of '==' for 'None'
selftests/damon/_damon_sysfs: find sysfs mount point from /proc/mounts
selftests/damon/_damon_sysfs: check errors from nr_schemes file reads
mm/damon/core: initialize ->esz_bp from damos_quota_init_priv()
selftests/damon: add a test for DAMOS quota goal
...
2024-05-19 09:21:03 -07:00
|
|
|
return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
|
2024-03-27 14:59:09 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
#else /* !CONFIG_MMU */
|
|
|
|
|
|
|
|
int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
|
|
|
|
{
|
|
|
|
return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long pgoff,
|
|
|
|
unsigned long flags)
|
|
|
|
{
|
io_uring/register: add IORING_REGISTER_RESIZE_RINGS
Once a ring has been created, the size of the CQ and SQ rings are fixed.
Usually this isn't a problem on the SQ ring side, as it merely controls
the available number of requests that can be submitted in a single
system call, and there's rarely a need to change that.
For the CQ ring, it's a different story. For most efficient use of
io_uring, it's important that the CQ ring never overflows. This means
that applications must size it for the worst case scenario, which can
be wasteful.
Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
the existing rings. It takes a struct io_uring_params argument, the same
one which is used to setup the ring initially, and resizes rings
according to the sizes given.
Certain properties are always inherited from the original ring setup,
like SQE128/CQE32 and other setup options. The implementation only
allows flag associated with how the CQ ring is sized and clamped.
Existing unconsumed SQE and CQE entries are copied as part of the
process. If either the SQ or CQ resized destination ring cannot hold the
entries already present in the source rings, then the operation is failed
with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new
submissions, and the internal mapping holds the completion lock as well
across moving CQ ring state.
To prevent races between mmap and ring resizing, add a mutex that's
solely used to serialize ring resize and mmap. mmap_sem can't be used
here, as as fork'ed process may be doing mmaps on the ring as well.
The ctx->resize_lock is held across mmap operations, and the resize
will grab it before swapping out the already mapped new data.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-21 13:34:10 -06:00
|
|
|
struct io_ring_ctx *ctx = file->private_data;
|
2024-03-27 14:59:09 -06:00
|
|
|
void *ptr;
|
|
|
|
|
2024-11-29 13:34:22 +00:00
|
|
|
guard(mutex)(&ctx->mmap_lock);
|
io_uring/register: add IORING_REGISTER_RESIZE_RINGS
Once a ring has been created, the size of the CQ and SQ rings are fixed.
Usually this isn't a problem on the SQ ring side, as it merely controls
the available number of requests that can be submitted in a single
system call, and there's rarely a need to change that.
For the CQ ring, it's a different story. For most efficient use of
io_uring, it's important that the CQ ring never overflows. This means
that applications must size it for the worst case scenario, which can
be wasteful.
Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
the existing rings. It takes a struct io_uring_params argument, the same
one which is used to setup the ring initially, and resizes rings
according to the sizes given.
Certain properties are always inherited from the original ring setup,
like SQE128/CQE32 and other setup options. The implementation only
allows flag associated with how the CQ ring is sized and clamped.
Existing unconsumed SQE and CQE entries are copied as part of the
process. If either the SQ or CQ resized destination ring cannot hold the
entries already present in the source rings, then the operation is failed
with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new
submissions, and the internal mapping holds the completion lock as well
across moving CQ ring state.
To prevent races between mmap and ring resizing, add a mutex that's
solely used to serialize ring resize and mmap. mmap_sem can't be used
here, as as fork'ed process may be doing mmaps on the ring as well.
The ctx->resize_lock is held across mmap operations, and the resize
will grab it before swapping out the already mapped new data.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-21 13:34:10 -06:00
|
|
|
|
2024-03-27 14:59:09 -06:00
|
|
|
ptr = io_uring_validate_mmap_request(file, pgoff, len);
|
|
|
|
if (IS_ERR(ptr))
|
|
|
|
return PTR_ERR(ptr);
|
|
|
|
|
|
|
|
return (unsigned long) ptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* !CONFIG_MMU */
|