From 943d0609d0571af092dc13456cbca70351e4d20e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:22 +0000 Subject: [PATCH 01/56] io_uring: rename ->resize_lock ->resize_lock is used for resizing rings, but it's a good idea to reuse it in other cases as well. Rename it into mmap_lock as it's protects from races with mmap. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/68f705306f3ac4d2fb999eb80ea1615015ce9f7f.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 2 +- io_uring/memmap.c | 6 +++--- io_uring/register.c | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index fd4cdb0860a2..fafc1d779eb1 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -424,7 +424,7 @@ struct io_ring_ctx { * side will need to grab this lock, to prevent either side from * being run concurrently with the other. */ - struct mutex resize_lock; + struct mutex mmap_lock; /* * If IORING_SETUP_NO_MMAP is used, then the below holds diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d3403c8216db..539fecde0244 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -350,7 +350,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->submit_state.compl_reqs); INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); io_napi_init(ctx); - mutex_init(&ctx->resize_lock); + mutex_init(&ctx->mmap_lock); return ctx; diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 57de9bccbf50..a0d4151d11af 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -329,7 +329,7 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) unsigned int npages; void *ptr; - guard(mutex)(&ctx->resize_lock); + guard(mutex)(&ctx->mmap_lock); ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); if (IS_ERR(ptr)) @@ -365,7 +365,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) return -EINVAL; - guard(mutex)(&ctx->resize_lock); + guard(mutex)(&ctx->mmap_lock); ptr = io_uring_validate_mmap_request(filp, pgoff, len); if (IS_ERR(ptr)) @@ -415,7 +415,7 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, struct io_ring_ctx *ctx = file->private_data; void *ptr; - guard(mutex)(&ctx->resize_lock); + guard(mutex)(&ctx->mmap_lock); ptr = io_uring_validate_mmap_request(file, pgoff, len); if (IS_ERR(ptr)) diff --git a/io_uring/register.c b/io_uring/register.c index fdd44914c39c..77743e3f6751 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -489,15 +489,15 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) } /* - * We'll do the swap. Grab the ctx->resize_lock, which will exclude + * We'll do the swap. Grab the ctx->mmap_lock, which will exclude * any new mmap's on the ring fd. Clear out existing mappings to prevent * mmap from seeing them, as we'll unmap them. Any attempt to mmap * existing rings beyond this point will fail. Not that it could proceed * at this point anyway, as the io_uring mmap side needs go grab the - * ctx->resize_lock as well. Likewise, hold the completion lock over the + * ctx->mmap_lock as well. Likewise, hold the completion lock over the * duration of the actual swap. 
*/ - mutex_lock(&ctx->resize_lock); + mutex_lock(&ctx->mmap_lock); spin_lock(&ctx->completion_lock); o.rings = ctx->rings; ctx->rings = NULL; @@ -564,7 +564,7 @@ overflow: ret = 0; out: spin_unlock(&ctx->completion_lock); - mutex_unlock(&ctx->resize_lock); + mutex_unlock(&ctx->mmap_lock); io_register_free_rings(&p, to_free); if (ctx->sq_data) From 7427b0b49ad51304c041ffb8c413f342e148cdea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:23 +0000 Subject: [PATCH 02/56] io_uring/rsrc: export io_check_coalesce_buffer io_try_coalesce_buffer() is a useful helper collecting useful info about a set of pages, I want to reuse it for analysing ring/etc. mappings. I don't need the entire thing and only interested if it can be coalesced into a single page, but that's better than duplicating the parsing. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/353b447953cd5d34c454a7d909bb6024c391d6e2.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 22 ++++++++++++---------- io_uring/rsrc.h | 4 ++++ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 077f84684c18..2d2333970eb5 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -626,11 +626,12 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages, - struct io_imu_folio_data *data, int nr_folios) +static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, + struct io_imu_folio_data *data) { struct page **page_array = *pages, **new_array = NULL; int nr_pages_left = *nr_pages, i, j; + int nr_folios = data->nr_folios; /* Store head pages only*/ new_array = kvmalloc_array(nr_folios, sizeof(struct page *), @@ -667,15 +668,14 @@ static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages, return true; } -static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, - struct io_imu_folio_data *data) +bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, + struct io_imu_folio_data *data) { - struct page **page_array = *pages; struct folio *folio = page_folio(page_array[0]); unsigned int count = 1, nr_folios = 1; int i; - if (*nr_pages <= 1) + if (nr_pages <= 1) return false; data->nr_pages_mid = folio_nr_pages(folio); @@ -687,7 +687,7 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, * Check if pages are contiguous inside a folio, and all folios have * the same page count except for the head and tail. 
*/ - for (i = 1; i < *nr_pages; i++) { + for (i = 1; i < nr_pages; i++) { if (page_folio(page_array[i]) == folio && page_array[i] == page_array[i-1] + 1) { count++; @@ -715,7 +715,8 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, if (nr_folios == 1) data->nr_pages_head = count; - return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios); + data->nr_folios = nr_folios; + return true; } static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, @@ -729,7 +730,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, size_t size; int ret, nr_pages, i; struct io_imu_folio_data data; - bool coalesced; + bool coalesced = false; if (!iov->iov_base) return NULL; @@ -749,7 +750,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, } /* If it's huge page(s), try to coalesce them into fewer bvec entries */ - coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data); + if (io_check_coalesce_buffer(pages, nr_pages, &data)) + coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); if (!imu) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 7a4668deaa1a..c8b093584461 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -40,6 +40,7 @@ struct io_imu_folio_data { /* For non-head/tail folios, has to be fully included */ unsigned int nr_pages_mid; unsigned int folio_shift; + unsigned int nr_folios; }; struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type); @@ -66,6 +67,9 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); +bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, + struct io_imu_folio_data *data); + static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data, int index) { From a730d2047d4ef822262c37f51ff7267f2f0e7167 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:24 +0000 Subject: [PATCH 03/56] io_uring/memmap: flag vmap'ed regions Add internal flags for struct io_mapped_region. The first flag we need is IO_REGION_F_VMAPPED, that indicates that the pointer has to be unmapped on region destruction. For now all regions are vmap'ed, so it's set unconditionally. 
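
For illustration, the split this introduces looks roughly like the following (sketch only; helper names as in this patch, callers are unchanged):

	void *p = io_region_get_ptr(mr);	/* still just returns the cached pointer */
	...
	io_free_region(ctx, mr);		/* vunmap()s the pointer only if the vmap flag is set */
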
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5a3d8046a038da97c0f8a8c8f1733fa3fc689d31.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++-- io_uring/memmap.c | 14 ++++++++++---- io_uring/memmap.h | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index fafc1d779eb1..0793a91b66a5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -78,8 +78,9 @@ struct io_hash_table { struct io_mapped_region { struct page **pages; - void *vmap_ptr; - size_t nr_pages; + void *ptr; + unsigned nr_pages; + unsigned flags; }; /* diff --git a/io_uring/memmap.c b/io_uring/memmap.c index a0d4151d11af..31fb8c8ffe4e 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -202,14 +202,19 @@ void *__io_uaddr_map(struct page ***pages, unsigned short *npages, return ERR_PTR(-ENOMEM); } +enum { + /* memory was vmap'ed for the kernel, freeing the region vunmap's it */ + IO_REGION_F_VMAP = 1, +}; + void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) { if (mr->pages) { unpin_user_pages(mr->pages, mr->nr_pages); kvfree(mr->pages); } - if (mr->vmap_ptr) - vunmap(mr->vmap_ptr); + if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr) + vunmap(mr->ptr); if (mr->nr_pages && ctx->user) __io_unaccount_mem(ctx->user, mr->nr_pages); @@ -225,7 +230,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, void *vptr; u64 end; - if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages)) + if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages)) return -EFAULT; if (memchr_inv(®->__resv, 0, sizeof(reg->__resv))) return -EINVAL; @@ -260,8 +265,9 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, } mr->pages = pages; - mr->vmap_ptr = vptr; + mr->ptr = vptr; mr->nr_pages = nr_pages; + mr->flags |= IO_REGION_F_VMAP; return 0; out_free: if (pages_accounted) diff --git a/io_uring/memmap.h b/io_uring/memmap.h index f361a635b6c7..2096a8427277 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -28,7 +28,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, static inline void *io_region_get_ptr(struct io_mapped_region *mr) { - return mr->vmap_ptr; + return mr->ptr; } static inline bool io_region_is_set(struct io_mapped_region *mr) From 16375af32d0fd5a6398c48d2a684b3f4fbb17a8e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:25 +0000 Subject: [PATCH 04/56] io_uring/memmap: flag regions with user pages In preparation to kernel allocated regions add a flag telling if the region contains user pinned pages or not. 
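
The difference only matters at destruction time; roughly (sketch of the freeing rule, mirroring the hunk below):

	if (mr->flags & IO_REGION_F_USER_PROVIDED)
		unpin_user_pages(mr->pages, mr->nr_pages);	/* user memory pinned via GUP */
	else
		release_pages(mr->pages, mr->nr_pages);		/* kernel-allocated pages, added later in the series */
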
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0dc91564642654405bab080b7ec911cb4a43ec6e.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 31fb8c8ffe4e..a0416733e921 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -205,12 +205,17 @@ void *__io_uaddr_map(struct page ***pages, unsigned short *npages, enum { /* memory was vmap'ed for the kernel, freeing the region vunmap's it */ IO_REGION_F_VMAP = 1, + /* memory is provided by user and pinned by the kernel */ + IO_REGION_F_USER_PROVIDED = 2, }; void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) { if (mr->pages) { - unpin_user_pages(mr->pages, mr->nr_pages); + if (mr->flags & IO_REGION_F_USER_PROVIDED) + unpin_user_pages(mr->pages, mr->nr_pages); + else + release_pages(mr->pages, mr->nr_pages); kvfree(mr->pages); } if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr) @@ -267,7 +272,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, mr->pages = pages; mr->ptr = vptr; mr->nr_pages = nr_pages; - mr->flags |= IO_REGION_F_VMAP; + mr->flags |= IO_REGION_F_VMAP | IO_REGION_F_USER_PROVIDED; return 0; out_free: if (pages_accounted) From fc5f22a64649db7582f988d01651fa7d50054f90 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:26 +0000 Subject: [PATCH 05/56] io_uring/memmap: account memory before pinning Move memory accounting before page pinning. It shouldn't even try to pin pages if it's not allowed, and accounting is also relatively inexpensive. It also give a better code structure as we do generic accounting and then can branch for different mapping types. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1e242b8038411a222e8b269d35e021fa5015289f.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index a0416733e921..fca93bc4c6f1 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -252,17 +252,21 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, if (check_add_overflow(reg->user_addr, reg->size, &end)) return -EOVERFLOW; - pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); - + nr_pages = reg->size >> PAGE_SHIFT; if (ctx->user) { ret = __io_account_mem(ctx->user, nr_pages); if (ret) - goto out_free; + return ret; pages_accounted = nr_pages; } + pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + pages = NULL; + goto out_free; + } + vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); if (!vptr) { ret = -ENOMEM; @@ -277,7 +281,8 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, out_free: if (pages_accounted) __io_unaccount_mem(ctx->user, pages_accounted); - io_pages_free(&pages, nr_pages); + if (pages) + io_pages_free(&pages, nr_pages); return ret; } From 226ae1b4d1111b0b0041677b58371af9b8cd31a9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:27 +0000 Subject: [PATCH 06/56] io_uring/memmap: reuse io_free_region for failure path Regions are going to become more complex with allocation options and optimisations, I want to split initialisation into steps and for that it needs a sane fail path. 
Reuse io_free_region(), it's smart enough to undo only what's needed and leaves the structure in a consistent state. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b853b4ec407cc80d033d021bdd2c14e22378fc78.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index fca93bc4c6f1..96c4f6b61171 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -229,7 +229,6 @@ void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg) { - int pages_accounted = 0; struct page **pages; int nr_pages, ret; void *vptr; @@ -257,32 +256,27 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, ret = __io_account_mem(ctx->user, nr_pages); if (ret) return ret; - pages_accounted = nr_pages; } + mr->nr_pages = nr_pages; pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages); if (IS_ERR(pages)) { ret = PTR_ERR(pages); - pages = NULL; goto out_free; } + mr->pages = pages; + mr->flags |= IO_REGION_F_USER_PROVIDED; vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); if (!vptr) { ret = -ENOMEM; goto out_free; } - - mr->pages = pages; mr->ptr = vptr; - mr->nr_pages = nr_pages; - mr->flags |= IO_REGION_F_VMAP | IO_REGION_F_USER_PROVIDED; + mr->flags |= IO_REGION_F_VMAP; return 0; out_free: - if (pages_accounted) - __io_unaccount_mem(ctx->user, pages_accounted); - if (pages) - io_pages_free(&pages, nr_pages); + io_free_region(ctx, mr); return ret; } From c4d0ac1c1567ee822529124a3dc10b384838c3bc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:28 +0000 Subject: [PATCH 07/56] io_uring/memmap: optimise single folio regions We don't need to vmap if memory is already physically contiguous. There are two important cases it covers: PAGE_SIZE regions and huge pages. Use io_check_coalesce_buffer() to get the number of contiguous folios. 
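
That is, when all pages land in a single folio the kernel address can be taken directly and both vmap() and the later vunmap() are skipped; roughly (sketch of the logic added below):

	struct io_imu_folio_data ifd;

	if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd) && ifd.nr_folios == 1)
		mr->ptr = page_address(mr->pages[0]);	/* physically contiguous, no vmap */
	else
		mr->ptr = vmap(mr->pages, mr->nr_pages, VM_MAP, PAGE_KERNEL);
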
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d5240af23064a824c29d14d2406f1ae764bf4505.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 96c4f6b61171..fd348c98f64f 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -226,12 +226,31 @@ void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) memset(mr, 0, sizeof(*mr)); } +static int io_region_init_ptr(struct io_mapped_region *mr) +{ + struct io_imu_folio_data ifd; + void *ptr; + + if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) { + if (ifd.nr_folios == 1) { + mr->ptr = page_address(mr->pages[0]); + return 0; + } + } + ptr = vmap(mr->pages, mr->nr_pages, VM_MAP, PAGE_KERNEL); + if (!ptr) + return -ENOMEM; + + mr->ptr = ptr; + mr->flags |= IO_REGION_F_VMAP; + return 0; +} + int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg) { struct page **pages; int nr_pages, ret; - void *vptr; u64 end; if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages)) @@ -267,13 +286,9 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, mr->pages = pages; mr->flags |= IO_REGION_F_USER_PROVIDED; - vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - if (!vptr) { - ret = -ENOMEM; + ret = io_region_init_ptr(mr); + if (ret) goto out_free; - } - mr->ptr = vptr; - mr->flags |= IO_REGION_F_VMAP; return 0; out_free: io_free_region(ctx, mr); From a90558b36cceeb546c19b3cd17ed94f9810f8eec Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:29 +0000 Subject: [PATCH 08/56] io_uring/memmap: helper for pinning region pages In preparation to adding kernel allocated regions extract a new helper that pins user pages. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a17d7c39c3de4266b66b75b2dcf768150e1fc618.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index fd348c98f64f..5d261e07c2e3 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -246,10 +246,28 @@ static int io_region_init_ptr(struct io_mapped_region *mr) return 0; } +static int io_region_pin_pages(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + struct io_uring_region_desc *reg) +{ + unsigned long size = mr->nr_pages << PAGE_SHIFT; + struct page **pages; + int nr_pages; + + pages = io_pin_pages(reg->user_addr, size, &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + if (WARN_ON_ONCE(nr_pages != mr->nr_pages)) + return -EFAULT; + + mr->pages = pages; + mr->flags |= IO_REGION_F_USER_PROVIDED; + return 0; +} + int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg) { - struct page **pages; int nr_pages, ret; u64 end; @@ -278,13 +296,9 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, } mr->nr_pages = nr_pages; - pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages); - if (IS_ERR(pages)) { - ret = PTR_ERR(pages); + ret = io_region_pin_pages(ctx, mr, reg); + if (ret) goto out_free; - } - mr->pages = pages; - mr->flags |= IO_REGION_F_USER_PROVIDED; ret = io_region_init_ptr(mr); if (ret) From 4b851d20d325dc59f9abdce55d42dc4b68179db0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:30 +0000 Subject: [PATCH 09/56] io_uring/memmap: add IO_REGION_F_SINGLE_REF Kernel allocated compound pages will have just one reference for the entire page array, add a flag telling io_free_region about that. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a7abfa7535e9728d5fcade29a1ea1605ec2c04ce.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 5d261e07c2e3..a37ccb167258 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -207,15 +207,23 @@ enum { IO_REGION_F_VMAP = 1, /* memory is provided by user and pinned by the kernel */ IO_REGION_F_USER_PROVIDED = 2, + /* only the first page in the array is ref'ed */ + IO_REGION_F_SINGLE_REF = 4, }; void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) { if (mr->pages) { + long nr_refs = mr->nr_pages; + + if (mr->flags & IO_REGION_F_SINGLE_REF) + nr_refs = 1; + if (mr->flags & IO_REGION_F_USER_PROVIDED) - unpin_user_pages(mr->pages, mr->nr_pages); + unpin_user_pages(mr->pages, nr_refs); else - release_pages(mr->pages, mr->nr_pages); + release_pages(mr->pages, nr_refs); + kvfree(mr->pages); } if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr) From 1e21df691ffa2c277e0b1a4928c9da0e86e9a2be Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:31 +0000 Subject: [PATCH 10/56] io_uring/memmap: implement kernel allocated regions Allow the kernel to allocate memory for a region. That's the classical way SQ/CQ are allocated. It's not yet useful to user space as there is no way to mmap it, which is why it's explicitly disabled in io_register_mem_region(). 
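
From the region descriptor's point of view a kernel-allocated region is simply one without a backing user address, e.g. (hypothetical userspace sketch, not part of this patch):

	struct io_uring_region_desc rd = {
		.size = 4096,	/* kernel allocates and accounts the pages */
		/* IORING_MEM_REGION_TYPE_USER not set, .user_addr left at 0 */
	};

io_create_region() accepts such a descriptor from now on, while io_register_mem_region() keeps rejecting it until the mmap support is added.
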
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7b8c40e6542546bbf93f4842a9a42a7373b81e0d.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 43 ++++++++++++++++++++++++++++++++++++++++--- io_uring/register.c | 2 ++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index a37ccb167258..0908a71bf57e 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -273,6 +273,39 @@ static int io_region_pin_pages(struct io_ring_ctx *ctx, return 0; } +static int io_region_allocate_pages(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + struct io_uring_region_desc *reg) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; + unsigned long size = mr->nr_pages << PAGE_SHIFT; + unsigned long nr_allocated; + struct page **pages; + void *p; + + pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp); + if (!pages) + return -ENOMEM; + + p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp); + if (!IS_ERR(p)) { + mr->flags |= IO_REGION_F_SINGLE_REF; + mr->pages = pages; + return 0; + } + + nr_allocated = alloc_pages_bulk_array_node(gfp, NUMA_NO_NODE, + mr->nr_pages, pages); + if (nr_allocated != mr->nr_pages) { + if (nr_allocated) + release_pages(pages, nr_allocated); + kvfree(pages); + return -ENOMEM; + } + mr->pages = pages; + return 0; +} + int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg) { @@ -283,9 +316,10 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, return -EFAULT; if (memchr_inv(®->__resv, 0, sizeof(reg->__resv))) return -EINVAL; - if (reg->flags != IORING_MEM_REGION_TYPE_USER) + if (reg->flags & ~IORING_MEM_REGION_TYPE_USER) return -EINVAL; - if (!reg->user_addr) + /* user_addr should be set IFF it's a user memory backed region */ + if ((reg->flags & IORING_MEM_REGION_TYPE_USER) != !!reg->user_addr) return -EFAULT; if (!reg->size || reg->mmap_offset || reg->id) return -EINVAL; @@ -304,7 +338,10 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, } mr->nr_pages = nr_pages; - ret = io_region_pin_pages(ctx, mr, reg); + if (reg->flags & IORING_MEM_REGION_TYPE_USER) + ret = io_region_pin_pages(ctx, mr, reg); + else + ret = io_region_allocate_pages(ctx, mr, reg); if (ret) goto out_free; diff --git a/io_uring/register.c b/io_uring/register.c index 77743e3f6751..47992bc1ffcb 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -589,6 +589,8 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) if (copy_from_user(&rd, rd_uptr, sizeof(rd))) return -EFAULT; + if (!(rd.flags & IORING_MEM_REGION_TYPE_USER)) + return -EINVAL; if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) return -EINVAL; if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG) From 087f997870a948820ec366701d178f402c6a23a3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:32 +0000 Subject: [PATCH 11/56] io_uring/memmap: implement mmap for regions The patch implements mmap for the param region and enables the kernel allocation mode. Internally it uses a fixed mmap offset, however the user has to use the offset returned in struct io_uring_region_desc::mmap_offset. Note, mmap doesn't and can't take ->uring_lock and the region / ring lookup is protected by ->mmap_lock, and it's directly peeking at ctx->param_region. 
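
For example, a userspace consumer is expected to do roughly the following (illustrative sketch, error handling omitted; ring_fd and region_size are placeholders, and the ring is created with IORING_SETUP_R_DISABLED as a wait-argument region requires):

	struct io_uring_region_desc rd = { .size = region_size };	/* kernel-allocated */
	struct io_uring_mem_region_reg mr = {
		.region_uptr = (__u64)(uintptr_t)&rd,
		.flags = IORING_MEM_REGION_REG_WAIT_ARG,
	};

	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_MEM_REGION, &mr, 1);
	/* the kernel filled rd.mmap_offset, which has to be passed back verbatim */
	void *p = mmap(NULL, rd.size, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_POPULATE, ring_fd, (off_t)rd.mmap_offset);
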
We can't protect io_create_region() with the mmap_lock as it'd deadlock, which is why io_create_region_mmap_safe() initialises it for us in a temporary variable and then publishes it with the lock taken. It's intentionally decoupled from main region helpers, and in the future we might want to have a list of active regions, which then could be protected by the ->mmap_lock. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0f1212bd6af7fb39b63514b34fae8948014221d1.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 61 +++++++++++++++++++++++++++++++++++++++++---- io_uring/memmap.h | 10 +++++++- io_uring/register.c | 6 ++--- 3 files changed, 67 insertions(+), 10 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 0908a71bf57e..9a182c8a4be1 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -275,7 +275,8 @@ static int io_region_pin_pages(struct io_ring_ctx *ctx, static int io_region_allocate_pages(struct io_ring_ctx *ctx, struct io_mapped_region *mr, - struct io_uring_region_desc *reg) + struct io_uring_region_desc *reg, + unsigned long mmap_offset) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; unsigned long size = mr->nr_pages << PAGE_SHIFT; @@ -290,8 +291,7 @@ static int io_region_allocate_pages(struct io_ring_ctx *ctx, p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp); if (!IS_ERR(p)) { mr->flags |= IO_REGION_F_SINGLE_REF; - mr->pages = pages; - return 0; + goto done; } nr_allocated = alloc_pages_bulk_array_node(gfp, NUMA_NO_NODE, @@ -302,12 +302,15 @@ static int io_region_allocate_pages(struct io_ring_ctx *ctx, kvfree(pages); return -ENOMEM; } +done: + reg->mmap_offset = mmap_offset; mr->pages = pages; return 0; } int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, - struct io_uring_region_desc *reg) + struct io_uring_region_desc *reg, + unsigned long mmap_offset) { int nr_pages, ret; u64 end; @@ -341,7 +344,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, if (reg->flags & IORING_MEM_REGION_TYPE_USER) ret = io_region_pin_pages(ctx, mr, reg); else - ret = io_region_allocate_pages(ctx, mr, reg); + ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset); if (ret) goto out_free; @@ -354,6 +357,40 @@ out_free: return ret; } +int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region *mr, + struct io_uring_region_desc *reg, + unsigned long mmap_offset) +{ + struct io_mapped_region tmp_mr; + int ret; + + memcpy(&tmp_mr, mr, sizeof(tmp_mr)); + ret = io_create_region(ctx, &tmp_mr, reg, mmap_offset); + if (ret) + return ret; + + /* + * Once published mmap can find it without holding only the ->mmap_lock + * and not ->uring_lock. 
+ */ + guard(mutex)(&ctx->mmap_lock); + memcpy(mr, &tmp_mr, sizeof(tmp_mr)); + return 0; +} + +static void *io_region_validate_mmap(struct io_ring_ctx *ctx, + struct io_mapped_region *mr) +{ + lockdep_assert_held(&ctx->mmap_lock); + + if (!io_region_is_set(mr)) + return ERR_PTR(-EINVAL); + if (mr->flags & IO_REGION_F_USER_PROVIDED) + return ERR_PTR(-EINVAL); + + return io_region_get_ptr(mr); +} + static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { @@ -389,6 +426,8 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, io_put_bl(ctx, bl); return ptr; } + case IORING_MAP_OFF_PARAM_REGION: + return io_region_validate_mmap(ctx, &ctx->param_region); } return ERR_PTR(-EINVAL); @@ -405,6 +444,16 @@ int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, #ifdef CONFIG_MMU +static int io_region_mmap(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + struct vm_area_struct *vma) +{ + unsigned long nr_pages = mr->nr_pages; + + vm_flags_set(vma, VM_DONTEXPAND); + return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages); +} + __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { struct io_ring_ctx *ctx = file->private_data; @@ -429,6 +478,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) ctx->n_sqe_pages); case IORING_OFF_PBUF_RING: return io_pbuf_mmap(file, vma); + case IORING_MAP_OFF_PARAM_REGION: + return io_region_mmap(ctx, &ctx->param_region, vma); } return -EINVAL; diff --git a/io_uring/memmap.h b/io_uring/memmap.h index 2096a8427277..2402bca3d700 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -1,6 +1,8 @@ #ifndef IO_URING_MEMMAP_H #define IO_URING_MEMMAP_H +#define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL + struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); void io_pages_free(struct page ***pages, int npages); int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, @@ -24,7 +26,13 @@ int io_uring_mmap(struct file *file, struct vm_area_struct *vma); void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr); int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, - struct io_uring_region_desc *reg); + struct io_uring_region_desc *reg, + unsigned long mmap_offset); + +int io_create_region_mmap_safe(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + struct io_uring_region_desc *reg, + unsigned long mmap_offset); static inline void *io_region_get_ptr(struct io_mapped_region *mr) { diff --git a/io_uring/register.c b/io_uring/register.c index 47992bc1ffcb..8c19fbe84808 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -588,9 +588,6 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) rd_uptr = u64_to_user_ptr(reg.region_uptr); if (copy_from_user(&rd, rd_uptr, sizeof(rd))) return -EFAULT; - - if (!(rd.flags & IORING_MEM_REGION_TYPE_USER)) - return -EINVAL; if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) return -EINVAL; if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG) @@ -605,7 +602,8 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) !(ctx->flags & IORING_SETUP_R_DISABLED)) return -EINVAL; - ret = io_create_region(ctx, &ctx->param_region, &rd); + ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd, + IORING_MAP_OFF_PARAM_REGION); if (ret) return ret; if (copy_to_user(rd_uptr, &rd, sizeof(rd))) { From 02255d55260a6836423c8401628b58264e198973 Mon Sep 17 00:00:00 2001 From: Pavel 
Begunkov Date: Fri, 29 Nov 2024 13:34:33 +0000 Subject: [PATCH 12/56] io_uring: pass ctx to io_register_free_rings A preparation patch, pass the context to io_register_free_rings. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c1865fd2b3d4db22d1a1aac7dd06ea22cb990834.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/register.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/io_uring/register.c b/io_uring/register.c index 8c19fbe84808..43dd6688a464 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -375,7 +375,8 @@ struct io_ring_ctx_rings { struct io_rings *rings; }; -static void io_register_free_rings(struct io_uring_params *p, +static void io_register_free_rings(struct io_ring_ctx *ctx, + struct io_uring_params *p, struct io_ring_ctx_rings *r) { if (!(p->flags & IORING_SETUP_NO_MMAP)) { @@ -455,7 +456,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) n.rings->cq_ring_entries = p.cq_entries; if (copy_to_user(arg, &p, sizeof(p))) { - io_register_free_rings(&p, &n); + io_register_free_rings(ctx, &p, &n); return -EFAULT; } @@ -464,7 +465,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) else size = array_size(sizeof(struct io_uring_sqe), p.sq_entries); if (size == SIZE_MAX) { - io_register_free_rings(&p, &n); + io_register_free_rings(ctx, &p, &n); return -EOVERFLOW; } @@ -475,7 +476,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) p.sq_off.user_addr, size); if (IS_ERR(ptr)) { - io_register_free_rings(&p, &n); + io_register_free_rings(ctx, &p, &n); return PTR_ERR(ptr); } @@ -565,7 +566,7 @@ overflow: out: spin_unlock(&ctx->completion_lock); mutex_unlock(&ctx->mmap_lock); - io_register_free_rings(&p, to_free); + io_register_free_rings(ctx, &p, to_free); if (ctx->sq_data) io_sq_thread_unpark(ctx->sq_data); From 8078486e1d53591ed946c943177339e59e3089e0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:34 +0000 Subject: [PATCH 13/56] io_uring: use region api for SQ Convert internal parts of the SQ managment to the region API. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1fb73ced6b835cb319ab0fe1dc0b2e982a9a5650.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +-- io_uring/io_uring.c | 38 ++++++++++++++-------------------- io_uring/memmap.c | 3 +-- io_uring/register.c | 37 +++++++++++++++------------------ 4 files changed, 34 insertions(+), 47 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 0793a91b66a5..808accf1776b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -432,10 +432,9 @@ struct io_ring_ctx { * the gup'ed pages for the two rings, and the sqes. 
*/ unsigned short n_ring_pages; - unsigned short n_sqe_pages; struct page **ring_pages; - struct page **sqe_pages; + struct io_mapped_region sq_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; }; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 539fecde0244..bdd4bc984bc8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2641,29 +2641,19 @@ static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, size); } -static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, - size_t size) -{ - return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr, - size); -} - static void io_rings_free(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, true); - io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages, - true); } else { io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); ctx->n_ring_pages = 0; - io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); - ctx->n_sqe_pages = 0; vunmap(ctx->rings); - vunmap(ctx->sq_sqes); } + io_free_region(ctx, &ctx->sq_region); + ctx->rings = NULL; ctx->sq_sqes = NULL; } @@ -3481,9 +3471,10 @@ bool io_is_uring_fops(struct file *file) static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { + struct io_uring_region_desc rd; struct io_rings *rings; size_t size, sq_array_offset; - void *ptr; + int ret; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; @@ -3519,17 +3510,18 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; } - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size); - else - ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); - - if (IS_ERR(ptr)) { - io_rings_free(ctx); - return PTR_ERR(ptr); + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (ctx->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->sq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; } - - ctx->sq_sqes = ptr; + ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES); + if (ret) { + io_rings_free(ctx); + return ret; + } + ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region); return 0; } diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 9a182c8a4be1..b9aaa25182a5 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -474,8 +474,7 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT); return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages); case IORING_OFF_SQES: - return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, - ctx->n_sqe_pages); + return io_region_mmap(ctx, &ctx->sq_region, vma); case IORING_OFF_PBUF_RING: return io_pbuf_mmap(file, vma); case IORING_MAP_OFF_PARAM_REGION: diff --git a/io_uring/register.c b/io_uring/register.c index 43dd6688a464..1cf47ad40dc1 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -368,11 +368,11 @@ static int io_register_clock(struct io_ring_ctx *ctx, */ struct io_ring_ctx_rings { unsigned short n_ring_pages; - unsigned short n_sqe_pages; struct page **ring_pages; - struct page **sqe_pages; - struct io_uring_sqe *sq_sqes; struct io_rings *rings; + + struct io_uring_sqe *sq_sqes; + struct io_mapped_region sq_region; }; static void io_register_free_rings(struct io_ring_ctx *ctx, @@ -382,14 +382,11 @@ static void io_register_free_rings(struct 
io_ring_ctx *ctx, if (!(p->flags & IORING_SETUP_NO_MMAP)) { io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages, true); - io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages, - true); } else { io_pages_free(&r->ring_pages, r->n_ring_pages); - io_pages_free(&r->sqe_pages, r->n_sqe_pages); vunmap(r->rings); - vunmap(r->sq_sqes); } + io_free_region(ctx, &r->sq_region); } #define swap_old(ctx, o, n, field) \ @@ -404,11 +401,11 @@ static void io_register_free_rings(struct io_ring_ctx *ctx, static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) { + struct io_uring_region_desc rd; struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; size_t size, sq_array_offset; struct io_uring_params p; unsigned i, tail; - void *ptr; int ret; /* for single issuer, must be owner resizing */ @@ -469,16 +466,18 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) return -EOVERFLOW; } - if (!(p.flags & IORING_SETUP_NO_MMAP)) - ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size); - else - ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages, - p.sq_off.user_addr, - size); - if (IS_ERR(ptr)) { - io_register_free_rings(ctx, &p, &n); - return PTR_ERR(ptr); + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (p.flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p.sq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; } + ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES); + if (ret) { + io_register_free_rings(ctx, &p, &n); + return ret; + } + n.sq_sqes = io_region_get_ptr(&n.sq_region); /* * If using SQPOLL, park the thread @@ -509,7 +508,6 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) * Now copy SQ and CQ entries, if any. If either of the destination * rings can't hold what is already there, then fail the operation. */ - n.sq_sqes = ptr; tail = o.rings->sq.tail; if (tail - o.rings->sq.head > p.sq_entries) goto overflow; @@ -558,9 +556,8 @@ overflow: ctx->rings = n.rings; ctx->sq_sqes = n.sq_sqes; swap_old(ctx, o, n, n_ring_pages); - swap_old(ctx, o, n, n_sqe_pages); swap_old(ctx, o, n, ring_pages); - swap_old(ctx, o, n, sqe_pages); + swap_old(ctx, o, n, sq_region); to_free = &o; ret = 0; out: From 81a4058e0cd0f07139f088fbeb65bc488f687829 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:35 +0000 Subject: [PATCH 14/56] io_uring: use region api for CQ Convert internal parts of the CQ/SQ array managment to the region API. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/46fc3c801290d6b1ac16023d78f6b8e685c87fd6.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 8 +---- io_uring/io_uring.c | 36 +++++++--------------- io_uring/memmap.c | 55 +++++----------------------------- io_uring/memmap.h | 4 --- io_uring/register.c | 35 ++++++++++------------ 5 files changed, 36 insertions(+), 102 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 808accf1776b..b63f44220e8b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -427,14 +427,8 @@ struct io_ring_ctx { */ struct mutex mmap_lock; - /* - * If IORING_SETUP_NO_MMAP is used, then the below holds - * the gup'ed pages for the two rings, and the sqes. 
- */ - unsigned short n_ring_pages; - struct page **ring_pages; - struct io_mapped_region sq_region; + struct io_mapped_region ring_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; }; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bdd4bc984bc8..8dd3fa9be993 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2634,26 +2634,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } -static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, - size_t size) -{ - return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr, - size); -} - static void io_rings_free(struct io_ring_ctx *ctx) { - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { - io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, - true); - } else { - io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); - ctx->n_ring_pages = 0; - vunmap(ctx->rings); - } - io_free_region(ctx, &ctx->sq_region); - + io_free_region(ctx, &ctx->ring_region); ctx->rings = NULL; ctx->sq_sqes = NULL; } @@ -3485,15 +3469,17 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (size == SIZE_MAX) return -EOVERFLOW; - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size); - else - rings = io_rings_map(ctx, p->cq_off.user_addr, size); + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (ctx->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->cq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING); + if (ret) + return ret; + ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); - if (IS_ERR(rings)) - return PTR_ERR(rings); - - ctx->rings = rings; if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); rings->sq_ring_mask = p->sq_entries - 1; diff --git a/io_uring/memmap.c b/io_uring/memmap.c index b9aaa25182a5..668b1c3579a2 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -120,18 +120,6 @@ void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, *npages = 0; } -void io_pages_free(struct page ***pages, int npages) -{ - struct page **page_array = *pages; - - if (!page_array) - return; - - unpin_user_pages(page_array, npages); - kvfree(page_array); - *pages = NULL; -} - struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) { unsigned long start, end, nr_pages; @@ -174,34 +162,6 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) return ERR_PTR(ret); } -void *__io_uaddr_map(struct page ***pages, unsigned short *npages, - unsigned long uaddr, size_t size) -{ - struct page **page_array; - unsigned int nr_pages; - void *page_addr; - - *npages = 0; - - if (uaddr & (PAGE_SIZE - 1) || !size) - return ERR_PTR(-EINVAL); - - nr_pages = 0; - page_array = io_pin_pages(uaddr, size, &nr_pages); - if (IS_ERR(page_array)) - return page_array; - - page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL); - if (page_addr) { - *pages = page_array; - *npages = nr_pages; - return page_addr; - } - - io_pages_free(&page_array, nr_pages); - return ERR_PTR(-ENOMEM); -} - enum { /* memory was vmap'ed for the kernel, freeing the region vunmap's it */ IO_REGION_F_VMAP = 1, @@ -446,9 +406,10 @@ int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, static 
int io_region_mmap(struct io_ring_ctx *ctx, struct io_mapped_region *mr, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + unsigned max_pages) { - unsigned long nr_pages = mr->nr_pages; + unsigned long nr_pages = min(mr->nr_pages, max_pages); vm_flags_set(vma, VM_DONTEXPAND); return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages); @@ -459,7 +420,7 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - vma->vm_start; long offset = vma->vm_pgoff << PAGE_SHIFT; - unsigned int npages; + unsigned int page_limit; void *ptr; guard(mutex)(&ctx->mmap_lock); @@ -471,14 +432,14 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: - npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT); - return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages); + page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT; + return io_region_mmap(ctx, &ctx->ring_region, vma, page_limit); case IORING_OFF_SQES: - return io_region_mmap(ctx, &ctx->sq_region, vma); + return io_region_mmap(ctx, &ctx->sq_region, vma, UINT_MAX); case IORING_OFF_PBUF_RING: return io_pbuf_mmap(file, vma); case IORING_MAP_OFF_PARAM_REGION: - return io_region_mmap(ctx, &ctx->param_region, vma); + return io_region_mmap(ctx, &ctx->param_region, vma, UINT_MAX); } return -EINVAL; diff --git a/io_uring/memmap.h b/io_uring/memmap.h index 2402bca3d700..7395996eb353 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -4,7 +4,6 @@ #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); -void io_pages_free(struct page ***pages, int npages); int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, struct page **pages, int npages); @@ -13,9 +12,6 @@ void *io_pages_map(struct page ***out_pages, unsigned short *npages, void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, bool put_pages); -void *__io_uaddr_map(struct page ***pages, unsigned short *npages, - unsigned long uaddr, size_t size); - #ifndef CONFIG_MMU unsigned int io_uring_nommu_mmap_capabilities(struct file *file); #endif diff --git a/io_uring/register.c b/io_uring/register.c index 1cf47ad40dc1..5e48413706ac 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -367,26 +367,19 @@ static int io_register_clock(struct io_ring_ctx *ctx, * either mapping or freeing. 
*/ struct io_ring_ctx_rings { - unsigned short n_ring_pages; - struct page **ring_pages; struct io_rings *rings; - struct io_uring_sqe *sq_sqes; + struct io_mapped_region sq_region; + struct io_mapped_region ring_region; }; static void io_register_free_rings(struct io_ring_ctx *ctx, struct io_uring_params *p, struct io_ring_ctx_rings *r) { - if (!(p->flags & IORING_SETUP_NO_MMAP)) { - io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages, - true); - } else { - io_pages_free(&r->ring_pages, r->n_ring_pages); - vunmap(r->rings); - } io_free_region(ctx, &r->sq_region); + io_free_region(ctx, &r->ring_region); } #define swap_old(ctx, o, n, field) \ @@ -439,13 +432,18 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) if (size == SIZE_MAX) return -EOVERFLOW; - if (!(p.flags & IORING_SETUP_NO_MMAP)) - n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size); - else - n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages, - p.cq_off.user_addr, size); - if (IS_ERR(n.rings)) - return PTR_ERR(n.rings); + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (p.flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p.cq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); + if (ret) { + io_register_free_rings(ctx, &p, &n); + return ret; + } + n.rings = io_region_get_ptr(&n.ring_region); n.rings->sq_ring_mask = p.sq_entries - 1; n.rings->cq_ring_mask = p.cq_entries - 1; @@ -555,8 +553,7 @@ overflow: ctx->rings = n.rings; ctx->sq_sqes = n.sq_sqes; - swap_old(ctx, o, n, n_ring_pages); - swap_old(ctx, o, n, ring_pages); + swap_old(ctx, o, n, ring_region); swap_old(ctx, o, n, sq_region); to_free = &o; ret = 0; From 78fda3d056417ccb9921663383b12f771aa0dd43 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:36 +0000 Subject: [PATCH 15/56] io_uring/kbuf: use mmap_lock to sync with mmap A preparation / cleanup patch simplifying the buf ring - mmap synchronisation. Instead of relying on RCU, which is trickier, do it by grabbing the mmap_lock when when anyone tries to publish or remove a registered buffer to / from ->io_bl_xa. Modifications of the xarray should always be protected by both ->uring_lock and ->mmap_lock, while lookups should hold either of them. While a struct io_buffer_list is in the xarray, the mmap related fields like ->flags and ->buf_pages should stay stable. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/af13bde56ee1a26bcaefaa9aad37a9ea318a590e.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++ io_uring/kbuf.c | 56 +++++++++++++++------------------- io_uring/kbuf.h | 1 - 3 files changed, 29 insertions(+), 33 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b63f44220e8b..73575d545d3c 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -294,6 +294,11 @@ struct io_ring_ctx { struct io_submit_state submit_state; + /* + * Modifications are protected by ->uring_lock and ->mmap_lock. + * The flags, buf_pages and buf_nr_pages fields should be stable + * once published. + */ struct xarray io_bl_xa; struct io_hash_table cancel_table; diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index d407576ddfb7..662e928cc3b0 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -45,10 +45,11 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, /* * Store buffer group ID and finally mark the list as visible. 
* The normal lookup doesn't care about the visibility as we're - * always under the ->uring_lock, but the RCU lookup from mmap does. + * always under the ->uring_lock, but lookups from mmap do. */ bl->bgid = bgid; atomic_set(&bl->refs, 1); + guard(mutex)(&ctx->mmap_lock); return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } @@ -388,7 +389,7 @@ void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { if (atomic_dec_and_test(&bl->refs)) { __io_remove_buffers(ctx, bl, -1U); - kfree_rcu(bl, rcu); + kfree(bl); } } @@ -397,10 +398,17 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) struct io_buffer_list *bl; struct list_head *item, *tmp; struct io_buffer *buf; - unsigned long index; - xa_for_each(&ctx->io_bl_xa, index, bl) { - xa_erase(&ctx->io_bl_xa, bl->bgid); + while (1) { + unsigned long index = 0; + + scoped_guard(mutex, &ctx->mmap_lock) { + bl = xa_find(&ctx->io_bl_xa, &index, ULONG_MAX, XA_PRESENT); + if (bl) + xa_erase(&ctx->io_bl_xa, bl->bgid); + } + if (!bl) + break; io_put_bl(ctx, bl); } @@ -589,11 +597,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) INIT_LIST_HEAD(&bl->buf_list); ret = io_buffer_add_list(ctx, bl, p->bgid); if (ret) { - /* - * Doesn't need rcu free as it was never visible, but - * let's keep it consistent throughout. - */ - kfree_rcu(bl, rcu); + kfree(bl); goto err; } } @@ -736,7 +740,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return 0; } - kfree_rcu(free_bl, rcu); + kfree(free_bl); return ret; } @@ -760,7 +764,9 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (!(bl->flags & IOBL_BUF_RING)) return -EINVAL; - xa_erase(&ctx->io_bl_xa, bl->bgid); + scoped_guard(mutex, &ctx->mmap_lock) + xa_erase(&ctx->io_bl_xa, bl->bgid); + io_put_bl(ctx, bl); return 0; } @@ -795,29 +801,13 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, unsigned long bgid) { struct io_buffer_list *bl; - bool ret; - /* - * We have to be a bit careful here - we're inside mmap and cannot grab - * the uring_lock. This means the buffer_list could be simultaneously - * going away, if someone is trying to be sneaky. Look it up under rcu - * so we know it's not going away, and attempt to grab a reference to - * it. If the ref is already zero, then fail the mapping. If successful, - * the caller will call io_put_bl() to drop the the reference at at the - * end. This may then safely free the buffer_list (and drop the pages) - * at that point, vm_insert_pages() would've already grabbed the - * necessary vma references. 
- */ - rcu_read_lock(); bl = xa_load(&ctx->io_bl_xa, bgid); /* must be a mmap'able buffer ring and have pages */ - ret = false; - if (bl && bl->flags & IOBL_MMAP) - ret = atomic_inc_not_zero(&bl->refs); - rcu_read_unlock(); - - if (ret) - return bl; + if (bl && bl->flags & IOBL_MMAP) { + if (atomic_inc_not_zero(&bl->refs)) + return bl; + } return ERR_PTR(-EINVAL); } @@ -829,6 +819,8 @@ int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) struct io_buffer_list *bl; int bgid, ret; + lockdep_assert_held(&ctx->mmap_lock); + bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; bl = io_pbuf_get_bl(ctx, bgid); if (IS_ERR(bl)) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 36aadfe5ac00..d5e4afcbfbb3 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -25,7 +25,6 @@ struct io_buffer_list { struct page **buf_pages; struct io_uring_buf_ring *buf_ring; }; - struct rcu_head rcu; }; __u16 bgid; From 90175f3f503213903b00bc7ba9f8ae436fc5c00e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:37 +0000 Subject: [PATCH 16/56] io_uring/kbuf: remove pbuf ring refcounting struct io_buffer_list refcounting was needed for RCU based sync with mmap, now we can kill it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4a9cc54bf0077bb2bf2f3daf917549ddd41080da.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 21 +++++++-------------- io_uring/kbuf.h | 3 --- io_uring/memmap.c | 1 - 3 files changed, 7 insertions(+), 18 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 662e928cc3b0..644f61445ec9 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -48,7 +48,6 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, * always under the ->uring_lock, but lookups from mmap do. 
*/ bl->bgid = bgid; - atomic_set(&bl->refs, 1); guard(mutex)(&ctx->mmap_lock); return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } @@ -385,12 +384,10 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, return i; } -void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { - if (atomic_dec_and_test(&bl->refs)) { - __io_remove_buffers(ctx, bl, -1U); - kfree(bl); - } + __io_remove_buffers(ctx, bl, -1U); + kfree(bl); } void io_destroy_buffers(struct io_ring_ctx *ctx) @@ -804,10 +801,8 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, bl = xa_load(&ctx->io_bl_xa, bgid); /* must be a mmap'able buffer ring and have pages */ - if (bl && bl->flags & IOBL_MMAP) { - if (atomic_inc_not_zero(&bl->refs)) - return bl; - } + if (bl && bl->flags & IOBL_MMAP) + return bl; return ERR_PTR(-EINVAL); } @@ -817,7 +812,7 @@ int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) struct io_ring_ctx *ctx = file->private_data; loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT; struct io_buffer_list *bl; - int bgid, ret; + int bgid; lockdep_assert_held(&ctx->mmap_lock); @@ -826,7 +821,5 @@ int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) if (IS_ERR(bl)) return PTR_ERR(bl); - ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); - io_put_bl(ctx, bl); - return ret; + return io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index d5e4afcbfbb3..dff7444026a6 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -35,8 +35,6 @@ struct io_buffer_list { __u16 mask; __u16 flags; - - atomic_t refs; }; struct io_buffer { @@ -83,7 +81,6 @@ void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); -void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl); struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, unsigned long bgid); int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma); diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 668b1c3579a2..73b73f4ea1bd 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -383,7 +383,6 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, if (IS_ERR(bl)) return bl; ptr = bl->buf_ring; - io_put_bl(ctx, bl); return ptr; } case IORING_MAP_OFF_PARAM_REGION: From ef62de3c4ad58fab4e37bc267177bc723e48e5e2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:38 +0000 Subject: [PATCH 17/56] io_uring/kbuf: use region api for pbuf rings Convert internal parts of the provided buffer ring managment to the region API. It's the last non-region mapped ring we have, so it also kills a bunch of now unused memmap.c helpers. 
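
The registration and mmap contract seen by userspace does not change; a kernel-mapped ring is still set up roughly like this (illustrative sketch, not part of the patch; bgid and ring_fd are placeholders):

	struct io_uring_buf_reg reg = {
		.ring_entries = 8,
		.bgid = bgid,
		.flags = IOU_PBUF_RING_MMAP,	/* kernel allocates the ring */
	};
	size_t ring_size = 8 * sizeof(struct io_uring_buf);

	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PBUF_RING, &reg, 1);
	off_t off = IORING_OFF_PBUF_RING | ((unsigned long long)bgid << IORING_OFF_PBUF_SHIFT);
	struct io_uring_buf_ring *br = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
					    MAP_SHARED | MAP_POPULATE, ring_fd, off);

Only the backing of that mapping moves over to struct io_mapped_region.
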
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6c40cf7beaa648558acd4d84bc0fb3279a35d74b.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 172 ++++++++++++++-------------------------------- io_uring/kbuf.h | 18 ++--- io_uring/memmap.c | 118 +++++-------------------------- io_uring/memmap.h | 7 -- 4 files changed, 74 insertions(+), 241 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 644f61445ec9..2dfb9f9419a0 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -351,17 +351,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (bl->flags & IOBL_BUF_RING) { i = bl->buf_ring->tail - bl->head; - if (bl->buf_nr_pages) { - int j; - - if (!(bl->flags & IOBL_MMAP)) { - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - } - io_pages_unmap(bl->buf_ring, &bl->buf_pages, - &bl->buf_nr_pages, bl->flags & IOBL_MMAP); - bl->flags &= ~IOBL_MMAP; - } + io_free_region(ctx, &bl->region); /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); bl->flags &= ~IOBL_BUF_RING; @@ -614,75 +604,14 @@ err: return IOU_OK; } -static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, - struct io_buffer_list *bl) -{ - struct io_uring_buf_ring *br = NULL; - struct page **pages; - int nr_pages, ret; - - pages = io_pin_pages(reg->ring_addr, - flex_array_size(br, bufs, reg->ring_entries), - &nr_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - if (!br) { - ret = -ENOMEM; - goto error_unpin; - } - -#ifdef SHM_COLOUR - /* - * On platforms that have specific aliasing requirements, SHM_COLOUR - * is set and we must guarantee that the kernel and user side align - * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and - * the application mmap's the provided ring buffer. Fail the request - * if we, by chance, don't end up with aligned addresses. The app - * should use IOU_PBUF_RING_MMAP instead, and liburing will handle - * this transparently. 
- */ - if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) { - ret = -EINVAL; - goto error_unpin; - } -#endif - bl->buf_pages = pages; - bl->buf_nr_pages = nr_pages; - bl->buf_ring = br; - bl->flags |= IOBL_BUF_RING; - bl->flags &= ~IOBL_MMAP; - return 0; -error_unpin: - unpin_user_pages(pages, nr_pages); - kvfree(pages); - vunmap(br); - return ret; -} - -static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, - struct io_uring_buf_reg *reg, - struct io_buffer_list *bl) -{ - size_t ring_size; - - ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); - - bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size); - if (IS_ERR(bl->buf_ring)) { - bl->buf_ring = NULL; - return -ENOMEM; - } - - bl->flags |= (IOBL_BUF_RING | IOBL_MMAP); - return 0; -} - int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_reg reg; struct io_buffer_list *bl, *free_bl = NULL; + struct io_uring_region_desc rd; + struct io_uring_buf_ring *br; + unsigned long mmap_offset; + unsigned long ring_size; int ret; lockdep_assert_held(&ctx->uring_lock); @@ -694,19 +623,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -EINVAL; if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; - if (!(reg.flags & IOU_PBUF_RING_MMAP)) { - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) - return -EINVAL; - } else { - if (reg.ring_addr) - return -EINVAL; - } - if (!is_power_of_2(reg.ring_entries)) return -EINVAL; - /* cannot disambiguate full vs empty due to head/tail size */ if (reg.ring_entries >= 65536) return -EINVAL; @@ -722,21 +640,47 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -ENOMEM; } - if (!(reg.flags & IOU_PBUF_RING_MMAP)) - ret = io_pin_pbuf_ring(®, bl); - else - ret = io_alloc_pbuf_ring(ctx, ®, bl); + mmap_offset = reg.bgid << IORING_OFF_PBUF_SHIFT; + ring_size = flex_array_size(br, bufs, reg.ring_entries); - if (!ret) { - bl->nr_entries = reg.ring_entries; - bl->mask = reg.ring_entries - 1; - if (reg.flags & IOU_PBUF_RING_INC) - bl->flags |= IOBL_INC; - - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(ring_size); + if (!(reg.flags & IOU_PBUF_RING_MMAP)) { + rd.user_addr = reg.ring_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; } + ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset); + if (ret) + goto fail; + br = io_region_get_ptr(&bl->region); +#ifdef SHM_COLOUR + /* + * On platforms that have specific aliasing requirements, SHM_COLOUR + * is set and we must guarantee that the kernel and user side align + * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and + * the application mmap's the provided ring buffer. Fail the request + * if we, by chance, don't end up with aligned addresses. The app + * should use IOU_PBUF_RING_MMAP instead, and liburing will handle + * this transparently. 
+ */ + if (!(reg.flags & IOU_PBUF_RING_MMAP) && + ((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) { + ret = -EINVAL; + goto fail; + } +#endif + + bl->nr_entries = reg.ring_entries; + bl->mask = reg.ring_entries - 1; + bl->flags |= IOBL_BUF_RING; + bl->buf_ring = br; + if (reg.flags & IOU_PBUF_RING_INC) + bl->flags |= IOBL_INC; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; +fail: + io_free_region(ctx, &bl->region); kfree(free_bl); return ret; } @@ -794,32 +738,18 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) return 0; } -struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, - unsigned long bgid) +struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, + unsigned int bgid) { struct io_buffer_list *bl; - bl = xa_load(&ctx->io_bl_xa, bgid); - /* must be a mmap'able buffer ring and have pages */ - if (bl && bl->flags & IOBL_MMAP) - return bl; - - return ERR_PTR(-EINVAL); -} - -int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct io_ring_ctx *ctx = file->private_data; - loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT; - struct io_buffer_list *bl; - int bgid; - lockdep_assert_held(&ctx->mmap_lock); - bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - bl = io_pbuf_get_bl(ctx, bgid); - if (IS_ERR(bl)) - return PTR_ERR(bl); + bl = xa_load(&ctx->io_bl_xa, bgid); + if (!bl || !(bl->flags & IOBL_BUF_RING)) + return NULL; + if (WARN_ON_ONCE(!io_region_is_set(&bl->region))) + return NULL; - return io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); + return &bl->region; } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index dff7444026a6..bd80c44c5af1 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -3,15 +3,13 @@ #define IOU_KBUF_H #include +#include enum { /* ring mapped provided buffers */ IOBL_BUF_RING = 1, - /* ring mapped provided buffers, but mmap'ed by application */ - IOBL_MMAP = 2, /* buffers are consumed incrementally rather than always fully */ - IOBL_INC = 4, - + IOBL_INC = 2, }; struct io_buffer_list { @@ -21,10 +19,7 @@ struct io_buffer_list { */ union { struct list_head buf_list; - struct { - struct page **buf_pages; - struct io_uring_buf_ring *buf_ring; - }; + struct io_uring_buf_ring *buf_ring; }; __u16 bgid; @@ -35,6 +30,8 @@ struct io_buffer_list { __u16 mask; __u16 flags; + + struct io_mapped_region region; }; struct io_buffer { @@ -81,9 +78,8 @@ void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); -struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, - unsigned long bgid); -int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma); +struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, + unsigned int bgid); static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) { diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 73b73f4ea1bd..6d8a98bd9cac 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -36,90 +36,6 @@ static void *io_mem_alloc_compound(struct page **pages, int nr_pages, return page_address(page); } -static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, - gfp_t gfp) -{ - void *ret; - int i; - - for (i = 0; i < nr_pages; i++) { - pages[i] = alloc_page(gfp); - if (!pages[i]) - goto err; - } - - ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - if (ret) - return ret; -err: - while (i--) - put_page(pages[i]); - return ERR_PTR(-ENOMEM); -} - -void *io_pages_map(struct page ***out_pages, unsigned 
short *npages, - size_t size) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; - struct page **pages; - int nr_pages; - void *ret; - - nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp); - if (!pages) - return ERR_PTR(-ENOMEM); - - ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); - if (!IS_ERR(ret)) - goto done; - if (nr_pages == 1) - goto fail; - - ret = io_mem_alloc_single(pages, nr_pages, size, gfp); - if (!IS_ERR(ret)) { -done: - *out_pages = pages; - *npages = nr_pages; - return ret; - } -fail: - kvfree(pages); - *out_pages = NULL; - *npages = 0; - return ret; -} - -void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, - bool put_pages) -{ - bool do_vunmap = false; - - if (!ptr) - return; - - if (put_pages && *npages) { - struct page **to_free = *pages; - int i; - - /* - * Only did vmap for the non-compound multiple page case. - * For the compound page, we just need to put the head. - */ - if (PageCompound(to_free[0])) - *npages = 1; - else if (*npages > 1) - do_vunmap = true; - for (i = 0; i < *npages; i++) - put_page(to_free[i]); - } - if (do_vunmap) - vunmap(ptr); - kvfree(*pages); - *pages = NULL; - *npages = 0; -} - struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) { unsigned long start, end, nr_pages; @@ -374,16 +290,14 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, return ERR_PTR(-EFAULT); return ctx->sq_sqes; case IORING_OFF_PBUF_RING: { - struct io_buffer_list *bl; + struct io_mapped_region *region; unsigned int bgid; - void *ptr; bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - bl = io_pbuf_get_bl(ctx, bgid); - if (IS_ERR(bl)) - return bl; - ptr = bl->buf_ring; - return ptr; + region = io_pbuf_get_region(ctx, bgid); + if (!region) + return ERR_PTR(-EINVAL); + return io_region_validate_mmap(ctx, region); } case IORING_MAP_OFF_PARAM_REGION: return io_region_validate_mmap(ctx, &ctx->param_region); @@ -392,15 +306,6 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, return ERR_PTR(-EINVAL); } -int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, - struct page **pages, int npages) -{ - unsigned long nr_pages = npages; - - vm_flags_set(vma, VM_DONTEXPAND); - return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); -} - #ifdef CONFIG_MMU static int io_region_mmap(struct io_ring_ctx *ctx, @@ -435,8 +340,17 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) return io_region_mmap(ctx, &ctx->ring_region, vma, page_limit); case IORING_OFF_SQES: return io_region_mmap(ctx, &ctx->sq_region, vma, UINT_MAX); - case IORING_OFF_PBUF_RING: - return io_pbuf_mmap(file, vma); + case IORING_OFF_PBUF_RING: { + struct io_mapped_region *region; + unsigned int bgid; + + bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + region = io_pbuf_get_region(ctx, bgid); + if (!region) + return -EINVAL; + + return io_region_mmap(ctx, region, vma, UINT_MAX); + } case IORING_MAP_OFF_PARAM_REGION: return io_region_mmap(ctx, &ctx->param_region, vma, UINT_MAX); } diff --git a/io_uring/memmap.h b/io_uring/memmap.h index 7395996eb353..c898dcba2b4e 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -4,13 +4,6 @@ #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); -int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, - struct page 
**pages, int npages); - -void *io_pages_map(struct page ***out_pages, unsigned short *npages, - size_t size); -void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, - bool put_pages); #ifndef CONFIG_MMU unsigned int io_uring_nommu_mmap_capabilities(struct file *file); From 7cd7b9575270e4c4f80cc5ff71b13f4102b59b9e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:39 +0000 Subject: [PATCH 18/56] io_uring/memmap: unify io_uring mmap'ing code All mapped memory is now backed by regions and we can unify and clean up io_region_validate_mmap() and io_uring_mmap(). Extract a function looking up a region, the rest of the handling should be generic and just needs the region. There is one more ring type specific code, i.e. the mmaping size truncation quirk for IORING_OFF_[S,C]Q_RING, which is left as is. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f5e1eda1562bfd34276de07465525ae5f10e1e84.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 3 -- io_uring/memmap.c | 81 ++++++++++++++++++----------------------------- 2 files changed, 31 insertions(+), 53 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 2dfb9f9419a0..e91260a6156b 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -748,8 +748,5 @@ struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, bl = xa_load(&ctx->io_bl_xa, bgid); if (!bl || !(bl->flags & IOBL_BUF_RING)) return NULL; - if (WARN_ON_ONCE(!io_region_is_set(&bl->region))) - return NULL; - return &bl->region; } diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 6d8a98bd9cac..dda846190fbd 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -254,6 +254,27 @@ int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region return 0; } +static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, + loff_t pgoff) +{ + loff_t offset = pgoff << PAGE_SHIFT; + unsigned int bgid; + + switch (offset & IORING_OFF_MMAP_MASK) { + case IORING_OFF_SQ_RING: + case IORING_OFF_CQ_RING: + return &ctx->ring_region; + case IORING_OFF_SQES: + return &ctx->sq_region; + case IORING_OFF_PBUF_RING: + bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + return io_pbuf_get_region(ctx, bgid); + case IORING_MAP_OFF_PARAM_REGION: + return &ctx->param_region; + } + return NULL; +} + static void *io_region_validate_mmap(struct io_ring_ctx *ctx, struct io_mapped_region *mr) { @@ -271,39 +292,12 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { struct io_ring_ctx *ctx = file->private_data; - loff_t offset = pgoff << PAGE_SHIFT; + struct io_mapped_region *region; - switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) { - case IORING_OFF_SQ_RING: - case IORING_OFF_CQ_RING: - /* Don't allow mmap if the ring was setup without it */ - if (ctx->flags & IORING_SETUP_NO_MMAP) - return ERR_PTR(-EINVAL); - if (!ctx->rings) - return ERR_PTR(-EFAULT); - return ctx->rings; - case IORING_OFF_SQES: - /* Don't allow mmap if the ring was setup without it */ - if (ctx->flags & IORING_SETUP_NO_MMAP) - return ERR_PTR(-EINVAL); - if (!ctx->sq_sqes) - return ERR_PTR(-EFAULT); - return ctx->sq_sqes; - case IORING_OFF_PBUF_RING: { - struct io_mapped_region *region; - unsigned int bgid; - - bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - region = io_pbuf_get_region(ctx, bgid); - if (!region) - return ERR_PTR(-EINVAL); - return io_region_validate_mmap(ctx, region); - } - case 
IORING_MAP_OFF_PARAM_REGION: - return io_region_validate_mmap(ctx, &ctx->param_region); - } - - return ERR_PTR(-EINVAL); + region = io_mmap_get_region(ctx, pgoff); + if (!region) + return ERR_PTR(-EINVAL); + return io_region_validate_mmap(ctx, region); } #ifdef CONFIG_MMU @@ -324,7 +318,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - vma->vm_start; long offset = vma->vm_pgoff << PAGE_SHIFT; - unsigned int page_limit; + unsigned int page_limit = UINT_MAX; + struct io_mapped_region *region; void *ptr; guard(mutex)(&ctx->mmap_lock); @@ -337,25 +332,11 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT; - return io_region_mmap(ctx, &ctx->ring_region, vma, page_limit); - case IORING_OFF_SQES: - return io_region_mmap(ctx, &ctx->sq_region, vma, UINT_MAX); - case IORING_OFF_PBUF_RING: { - struct io_mapped_region *region; - unsigned int bgid; - - bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - region = io_pbuf_get_region(ctx, bgid); - if (!region) - return -EINVAL; - - return io_region_mmap(ctx, region, vma, UINT_MAX); - } - case IORING_MAP_OFF_PARAM_REGION: - return io_region_mmap(ctx, &ctx->param_region, vma, UINT_MAX); + break; } - return -EINVAL; + region = io_mmap_get_region(ctx, vma->vm_pgoff); + return io_region_mmap(ctx, region, vma, page_limit); } unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, From 5dbb3cbd060aa86a722d7d44278e537ae3f63081 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:31 +0530 Subject: [PATCH 19/56] block: define set of integrity flags to be inherited by cloned bip Introduce BIP_CLONE_FLAGS describing integrity flags that should be inherited in the cloned bip from the parent. Suggested-by: Christoph Hellwig Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-2-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- include/linux/bio-integrity.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 2a4bd6611692..a448a25d13de 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -559,7 +559,7 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, bip->bip_vec = bip_src->bip_vec; bip->bip_iter = bip_src->bip_iter; - bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY; + bip->bip_flags = bip_src->bip_flags & BIP_CLONE_FLAGS; return 0; } diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index dbf0f74c1529..0f0cf10222e8 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -30,6 +30,9 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; +#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_CTRL_NOCHECK | \ + BIP_DISK_NOCHECK | BIP_IP_CHECKSUM) + #ifdef CONFIG_BLK_DEV_INTEGRITY #define bip_for_each_vec(bvl, bip, iter) \ From 031141976be0bd5f385775727a4ed3cc845eb7ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Nov 2024 16:52:32 +0530 Subject: [PATCH 20/56] block: copy back bounce buffer to user-space correctly in case of split Copy back the bounce buffer to user-space in entirety when the parent bio completes. 
The existing code uses bip_iter.bi_size for sizing the copy, which can be modified. So move away from that and fetch it from the vector passed to the block layer. While at it, switch to using better variable names. Fixes: 492c5d455969f ("block: bio-integrity: directly map user buffers") Signed-off-by: Christoph Hellwig Signed-off-by: Anuj Gupta Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-3-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index a448a25d13de..4341b0d4efa1 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -118,17 +118,18 @@ static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs, static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip) { - unsigned short nr_vecs = bip->bip_max_vcnt - 1; - struct bio_vec *copy = &bip->bip_vec[1]; - size_t bytes = bip->bip_iter.bi_size; - struct iov_iter iter; + unsigned short orig_nr_vecs = bip->bip_max_vcnt - 1; + struct bio_vec *orig_bvecs = &bip->bip_vec[1]; + struct bio_vec *bounce_bvec = &bip->bip_vec[0]; + size_t bytes = bounce_bvec->bv_len; + struct iov_iter orig_iter; int ret; - iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes); - ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter); + iov_iter_bvec(&orig_iter, ITER_DEST, orig_bvecs, orig_nr_vecs, bytes); + ret = copy_to_iter(bvec_virt(bounce_bvec), bytes, &orig_iter); WARN_ON_ONCE(ret != bytes); - bio_integrity_unpin_bvec(copy, nr_vecs, true); + bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs, true); } /** From fe8f4ca7107e968b0eb7328155c8811f2a19424a Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:33 +0530 Subject: [PATCH 21/56] block: modify bio_integrity_map_user to accept iov_iter as argument This patch refactors bio_integrity_map_user to accept iov_iter as argument. This is a prep patch. 
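One way to see why the iov_iter form is needed (a sketch based on bio_integrity_map_iter(), added later in this series, so "meta" and "integrity_bytes" are names from that patch): a caller holding a larger metadata iterator can map just the slice covering the current bio by truncating a cheap copy, and only advance the original iterator on success — something a bare user pointer plus length cannot express per split bio:

	struct iov_iter it = meta->iter;	/* copy; the original stays put */
	it.count = integrity_bytes;		/* only this bio's worth of PI */
	ret = bio_integrity_map_user(bio, &it);
	if (!ret)
		iov_iter_advance(&meta->iter, integrity_bytes);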
Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-4-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 12 +++++------- block/blk-integrity.c | 10 +++++++++- include/linux/bio-integrity.h | 5 ++--- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4341b0d4efa1..f56d01cec689 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -302,16 +302,15 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, return nr_bvecs; } -int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes) +int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits); struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; + size_t offset, bytes = iter->count; unsigned int direction, nr_bvecs; - struct iov_iter iter; int ret, nr_vecs; - size_t offset; bool copy; if (bio_integrity(bio)) @@ -324,8 +323,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes) else direction = ITER_SOURCE; - iov_iter_ubuf(&iter, direction, ubuf, bytes); - nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1); + nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS + 1); if (nr_vecs > BIO_MAX_VECS) return -E2BIG; if (nr_vecs > UIO_FASTIOV) { @@ -335,8 +333,8 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes) pages = NULL; } - copy = !iov_iter_is_aligned(&iter, align, align); - ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset); + copy = !iov_iter_is_aligned(iter, align, align); + ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset); if (unlikely(ret < 0)) goto free_bvec; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index b180cac61a9d..4a29754f1bc2 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -115,8 +115,16 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) { - int ret = bio_integrity_map_user(rq->bio, ubuf, bytes); + int ret; + struct iov_iter iter; + unsigned int direction; + if (op_is_write(req_op(rq))) + direction = ITER_DEST; + else + direction = ITER_SOURCE; + iov_iter_ubuf(&iter, direction, ubuf, bytes); + ret = bio_integrity_map_user(rq->bio, &iter); if (ret) return ret; diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 0f0cf10222e8..58ff9988433a 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -75,7 +75,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); -int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len); +int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); @@ -101,8 +101,7 @@ static inline void bioset_integrity_free(struct bio_set *bs) { } -static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, - ssize_t len) +static int bio_integrity_map_user(struct bio *bio, struct 
iov_iter *iter) { return -EINVAL; } From 10783d0ba0d7731ec81d88c54f83cf0ff89d1c2a Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:34 +0530 Subject: [PATCH 22/56] fs, iov_iter: define meta io descriptor Add flags to describe checks for integrity meta buffer. Also, introduce a new 'uio_meta' structure that upper layer can use to pass the meta/integrity information. Signed-off-by: Kanchan Joshi Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241128112240.8867-5-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/uio.h | 9 +++++++++ include/uapi/linux/fs.h | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/include/linux/uio.h b/include/linux/uio.h index 853f9de5aa05..8ada84e85447 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -82,6 +82,15 @@ struct iov_iter { }; }; +typedef __u16 uio_meta_flags_t; + +struct uio_meta { + uio_meta_flags_t flags; + u16 app_tag; + u64 seed; + struct iov_iter iter; +}; + static inline const struct iovec *iter_iov(const struct iov_iter *iter) { if (iter->iter_type == ITER_UBUF) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 753971770733..9070ef19f0a3 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -40,6 +40,15 @@ #define BLOCK_SIZE_BITS 10 #define BLOCK_SIZE (1< Date: Thu, 28 Nov 2024 16:52:35 +0530 Subject: [PATCH 23/56] fs: introduce IOCB_HAS_METADATA for metadata Introduce an IOCB_HAS_METADATA flag for the kiocb struct, for handling requests containing meta payload. Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241128112240.8867-6-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..2cc3d45da7b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -348,6 +348,7 @@ struct readahead_control; #define IOCB_DIO_CALLER_COMP (1 << 22) /* kiocb is a read or write operation submitted by fs/aio.c. */ #define IOCB_AIO_RW (1 << 23) +#define IOCB_HAS_METADATA (1 << 24) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ From 59a7d12a7fb5ab24efa893e6980a00ffc090c777 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:36 +0530 Subject: [PATCH 24/56] io_uring: introduce attributes for read/write and PI support Add the ability to pass additional attributes along with read/write. Application can prepare attibute specific information and pass its address using the SQE field: __u64 attr_ptr; Along with setting a mask indicating attributes being passed: __u64 attr_type_mask; Overall 64 attributes are allowed and currently one attribute 'IORING_RW_ATTR_FLAG_PI' is supported. With PI attribute, userspace can pass following information: - flags: integrity check flags IO_INTEGRITY_CHK_{GUARD/APPTAG/REFTAG} - len: length of PI/metadata buffer - addr: address of metadata buffer - seed: seed value for reftag remapping - app_tag: application defined 16b value Process this information to prepare uio_meta_descriptor and pass it down using kiocb->private. PI attribute is supported only for direct IO. 
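A rough userspace sketch (not part of the patch) of attaching a PI attribute to a read: pi_buf/pi_len, lba, the O_DIRECT fd and the data buffer are assumed to exist, and liburing's io_uring_prep_read() is used only for brevity:

	struct io_uring_attr_pi pi = {
		.flags		= IO_INTEGRITY_CHK_GUARD | IO_INTEGRITY_CHK_REFTAG,
		.addr		= (__u64)(uintptr_t)pi_buf,	/* metadata buffer */
		.len		= pi_len,
		.seed		= lba,				/* initial reference tag */
	};

	io_uring_prep_read(sqe, fd, data_buf, data_len, offset);
	sqe->attr_ptr = (__u64)(uintptr_t)&pi;
	sqe->attr_type_mask = IORING_RW_ATTR_FLAG_PI;

The rsvd field must remain zero, which the designated initializer above guarantees.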
Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20241128112240.8867-7-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 16 +++++++ io_uring/io_uring.c | 2 + io_uring/rw.c | 83 ++++++++++++++++++++++++++++++++++- io_uring/rw.h | 14 +++++- 4 files changed, 112 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index aac9a4f8fa9a..38f0d6b10eaf 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -98,6 +98,10 @@ struct io_uring_sqe { __u64 addr3; __u64 __pad2[1]; }; + struct { + __u64 attr_ptr; /* pointer to attribute information */ + __u64 attr_type_mask; /* bit mask of attributes */ + }; __u64 optval; /* * If the ring is initialized with IORING_SETUP_SQE128, then @@ -107,6 +111,18 @@ struct io_uring_sqe { }; }; +/* sqe->attr_type_mask flags */ +#define IORING_RW_ATTR_FLAG_PI (1U << 0) +/* PI attribute information */ +struct io_uring_attr_pi { + __u16 flags; + __u16 app_tag; + __u32 len; + __u64 addr; + __u64 seed; + __u64 rsvd; +}; + /* * If sqe->file_index is set to this for opcodes that instantiate a new * direct descriptor (like openat/openat2/accept), then io_uring will allocate diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8dd3fa9be993..2e6891aadecc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3876,6 +3876,8 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]); BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd); + BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr); + BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask); BUILD_BUG_SQE_ELEM(56, __u64, __pad2); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != diff --git a/io_uring/rw.c b/io_uring/rw.c index 0bcb83e4ce3c..04e4467ab0ee 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -257,11 +257,53 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) return 0; } +static inline void io_meta_save_state(struct io_async_rw *io) +{ + io->meta_state.seed = io->meta.seed; + iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta); +} + +static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb) +{ + if (kiocb->ki_flags & IOCB_HAS_METADATA) { + io->meta.seed = io->meta_state.seed; + iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta); + } +} + +static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, + u64 attr_ptr, u64 attr_type_mask) +{ + struct io_uring_attr_pi pi_attr; + struct io_async_rw *io; + int ret; + + if (copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr), + sizeof(pi_attr))) + return -EFAULT; + + if (pi_attr.rsvd) + return -EINVAL; + + io = req->async_data; + io->meta.flags = pi_attr.flags; + io->meta.app_tag = pi_attr.app_tag; + io->meta.seed = pi_attr.seed; + ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.addr), + pi_attr.len, &io->meta.iter); + if (unlikely(ret < 0)) + return ret; + rw->kiocb.ki_flags |= IOCB_HAS_METADATA; + io_meta_save_state(io); + return ret; +} + static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, int ddir, bool do_import) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned ioprio; + u64 attr_type_mask; int ret; rw->kiocb.ki_pos = READ_ONCE(sqe->off); @@ -279,11 +321,28 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, rw->kiocb.ki_ioprio = get_current_ioprio(); } rw->kiocb.dio_complete = NULL; + rw->kiocb.ki_flags = 0; rw->addr = 
READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); - return io_prep_rw_setup(req, ddir, do_import); + ret = io_prep_rw_setup(req, ddir, do_import); + + if (unlikely(ret)) + return ret; + + attr_type_mask = READ_ONCE(sqe->attr_type_mask); + if (attr_type_mask) { + u64 attr_ptr; + + /* only PI attribute is supported currently */ + if (attr_type_mask != IORING_RW_ATTR_FLAG_PI) + return -EINVAL; + + attr_ptr = READ_ONCE(sqe->attr_ptr); + ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); + } + return ret; } int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -409,7 +468,9 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) static void io_resubmit_prep(struct io_kiocb *req) { struct io_async_rw *io = req->async_data; + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + io_meta_restore(io, &rw->kiocb); iov_iter_restore(&io->iter, &io->iter_state); } @@ -744,6 +805,10 @@ static bool io_rw_should_retry(struct io_kiocb *req) if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) return false; + /* never retry for meta io */ + if (kiocb->ki_flags & IOCB_HAS_METADATA) + return false; + /* * just use poll if we can, and don't attempt if the fs doesn't * support callback based unlocks @@ -794,7 +859,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (!(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(file); - kiocb->ki_flags = file->f_iocb_flags; + kiocb->ki_flags |= file->f_iocb_flags; ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); if (unlikely(ret)) return ret; @@ -828,6 +893,18 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) kiocb->ki_complete = io_complete_rw; } + if (kiocb->ki_flags & IOCB_HAS_METADATA) { + struct io_async_rw *io = req->async_data; + + /* + * We have a union of meta fields with wpq used for buffered-io + * in io_async_rw, so fail it here. + */ + if (!(req->file->f_flags & O_DIRECT)) + return -EOPNOTSUPP; + kiocb->private = &io->meta; + } + return 0; } @@ -902,6 +979,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * manually if we need to. */ iov_iter_restore(&io->iter, &io->iter_state); + io_meta_restore(io, kiocb); do { /* @@ -1125,6 +1203,7 @@ done: } else { ret_eagain: iov_iter_restore(&io->iter, &io->iter_state); + io_meta_restore(io, kiocb); if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); return -EAGAIN; diff --git a/io_uring/rw.h b/io_uring/rw.h index 3f432dc75441..2d7656bd268d 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -2,6 +2,11 @@ #include +struct io_meta_state { + u32 seed; + struct iov_iter_state iter_meta; +}; + struct io_async_rw { size_t bytes_done; struct iov_iter iter; @@ -9,7 +14,14 @@ struct io_async_rw { struct iovec fast_iov; struct iovec *free_iovec; int free_iov_nr; - struct wait_page_queue wpq; + /* wpq is for buffered io, while meta fields are used with direct io */ + union { + struct wait_page_queue wpq; + struct { + struct uio_meta meta; + struct io_meta_state meta_state; + }; + }; }; int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); From 2c0487d8b1f1351d48a13b77b254a2bb6de49eb3 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:37 +0530 Subject: [PATCH 25/56] block: introduce BIP_CHECK_GUARD/REFTAG/APPTAG bip_flags This patch introduces BIP_CHECK_GUARD/REFTAG/APPTAG bip_flags which indicate how the hardware should check the integrity payload. 
BIP_CHECK_GUARD/REFTAG are conversion of existing semantics, while BIP_CHECK_APPTAG is a new flag. The driver can now just rely on block layer flags, and doesn't need to know the integrity source. Submitter of PI decides which tags to check. This would also give us a unified interface for user and kernel generated integrity. Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-8-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 5 +++++ drivers/nvme/host/core.c | 11 +++-------- include/linux/bio-integrity.h | 6 +++++- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index f56d01cec689..3bee43b87001 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -434,6 +434,11 @@ bool bio_integrity_prep(struct bio *bio) if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) bip->bip_flags |= BIP_IP_CHECKSUM; + /* describe what tags to check in payload */ + if (bi->csum_type) + bip->bip_flags |= BIP_CHECK_GUARD; + if (bi->flags & BLK_INTEGRITY_REF_TAG) + bip->bip_flags |= BIP_CHECK_REFTAG; if (bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)) < len) { printk(KERN_ERR "could not attach integrity payload\n"); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a970168a3014..766df130cd82 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1017,18 +1017,13 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, control |= NVME_RW_PRINFO_PRACT; } - switch (ns->head->pi_type) { - case NVME_NS_DPS_PI_TYPE3: + if (bio_integrity_flagged(req->bio, BIP_CHECK_GUARD)) control |= NVME_RW_PRINFO_PRCHK_GUARD; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - control |= NVME_RW_PRINFO_PRCHK_GUARD | - NVME_RW_PRINFO_PRCHK_REF; + if (bio_integrity_flagged(req->bio, BIP_CHECK_REFTAG)) { + control |= NVME_RW_PRINFO_PRCHK_REF; if (op == nvme_cmd_zone_append) control |= NVME_RW_APPEND_PIREMAP; nvme_set_ref_tag(ns, cmnd, req); - break; } } diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 58ff9988433a..fe2bfe122db2 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -11,6 +11,9 @@ enum bip_flags { BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */ + BIP_CHECK_GUARD = 1 << 6, /* guard check */ + BIP_CHECK_REFTAG = 1 << 7, /* reftag check */ + BIP_CHECK_APPTAG = 1 << 8, /* apptag check */ }; struct bio_integrity_payload { @@ -31,7 +34,8 @@ struct bio_integrity_payload { }; #define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_CTRL_NOCHECK | \ - BIP_DISK_NOCHECK | BIP_IP_CHECKSUM) + BIP_DISK_NOCHECK | BIP_IP_CHECKSUM | \ + BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) #ifdef CONFIG_BLK_DEV_INTEGRITY From 472292cd8cfcfb9f2e7731c3c54196c35b8d283d Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Thu, 28 Nov 2024 16:52:38 +0530 Subject: [PATCH 26/56] nvme: add support for passing on the application tag With user integrity buffer, there is a way to specify the app_tag. Set the corresponding protocol specific flags and send the app_tag down. 
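For illustration, through the io_uring PI attribute added earlier in this series an application would request the app tag check roughly like this (pi_buf/pi_len are assumed; 0x1234 is an arbitrary example value):

	struct io_uring_attr_pi pi = {
		.flags		= IO_INTEGRITY_CHK_APPTAG,
		.app_tag	= 0x1234,	/* value the device verifies in each PI tuple */
		.addr		= (__u64)(uintptr_t)pi_buf,
		.len		= pi_len,
	};

The driver side below then sets NVME_RW_PRINFO_PRCHK_APP and copies the tag into the command, with the mask (lbatm) set to 0xffff so all app tag bits take part in the check.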
Reviewed-by: Christoph Hellwig Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-9-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 766df130cd82..0983482b3ea2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -885,6 +885,12 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, return BLK_STS_OK; } +static void nvme_set_app_tag(struct request *req, struct nvme_command *cmnd) +{ + cmnd->rw.lbat = cpu_to_le16(bio_integrity(req->bio)->app_tag); + cmnd->rw.lbatm = cpu_to_le16(0xffff); +} + static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd, struct request *req) { @@ -1025,6 +1031,10 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, control |= NVME_RW_APPEND_PIREMAP; nvme_set_ref_tag(ns, cmnd, req); } + if (bio_integrity_flagged(req->bio, BIP_CHECK_APPTAG)) { + control |= NVME_RW_PRINFO_PRCHK_APP; + nvme_set_app_tag(req, cmnd); + } } cmnd->rw.control = cpu_to_le16(control); From 18623503a3a514780214850bf8ba8b03ea0f3a4b Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:39 +0530 Subject: [PATCH 27/56] scsi: add support for user-meta interface Add support for sending user-meta buffer. Set tags to be checked using flags specified by user/block-layer. With this change, BIP_CTRL_NOCHECK becomes unused. Remove it. Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241128112240.8867-10-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 4 ++-- include/linux/bio-integrity.h | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 8947dab132d7..cb7ac8736b91 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -814,14 +814,14 @@ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM)) scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM; - if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) + if (bio_integrity_flagged(bio, BIP_CHECK_GUARD)) scmd->prot_flags |= SCSI_PROT_GUARD_CHECK; } if (dif != T10_PI_TYPE3_PROTECTION) { /* DIX/DIF Type 0, 1, 2 */ scmd->prot_flags |= SCSI_PROT_REF_INCREMENT; - if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) + if (bio_integrity_flagged(bio, BIP_CHECK_REFTAG)) scmd->prot_flags |= SCSI_PROT_REF_CHECK; } diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index fe2bfe122db2..2195bc06dcde 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -7,13 +7,12 @@ enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ - BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ - BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ - BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ - BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */ - BIP_CHECK_GUARD = 1 << 6, /* guard check */ - BIP_CHECK_REFTAG = 1 << 7, /* reftag check */ - BIP_CHECK_APPTAG = 1 << 8, /* apptag check */ + BIP_DISK_NOCHECK = 1 << 2, /* disable disk integrity checking */ + BIP_IP_CHECKSUM = 1 << 3, /* IP checksum */ + BIP_COPY_USER = 1 << 4, /* Kernel bounce buffer in use */ + BIP_CHECK_GUARD = 1 << 5, /* guard check */ + 
BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ + BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ }; struct bio_integrity_payload { @@ -33,8 +32,7 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; -#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_CTRL_NOCHECK | \ - BIP_DISK_NOCHECK | BIP_IP_CHECKSUM | \ +#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \ BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) #ifdef CONFIG_BLK_DEV_INTEGRITY From 3d8b5a22d40435b4a7e58f06ae2cd3506b222898 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Thu, 28 Nov 2024 16:52:40 +0530 Subject: [PATCH 28/56] block: add support to pass user meta buffer If an iocb contains metadata, extract that and prepare the bip. Based on flags specified by the user, set corresponding guard/app/ref tags to be checked in bip. Reviewed-by: Christoph Hellwig Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-11-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 50 +++++++++++++++++++++++++++++++++++ block/fops.c | 45 ++++++++++++++++++++++++------- include/linux/bio-integrity.h | 7 +++++ 3 files changed, 92 insertions(+), 10 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 3bee43b87001..5d81ad9a3d20 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -364,6 +364,55 @@ free_bvec: return ret; } +static void bio_uio_meta_to_bip(struct bio *bio, struct uio_meta *meta) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + + if (meta->flags & IO_INTEGRITY_CHK_GUARD) + bip->bip_flags |= BIP_CHECK_GUARD; + if (meta->flags & IO_INTEGRITY_CHK_APPTAG) + bip->bip_flags |= BIP_CHECK_APPTAG; + if (meta->flags & IO_INTEGRITY_CHK_REFTAG) + bip->bip_flags |= BIP_CHECK_REFTAG; + + bip->app_tag = meta->app_tag; +} + +int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + unsigned int integrity_bytes; + int ret; + struct iov_iter it; + + if (!bi) + return -EINVAL; + /* + * original meta iterator can be bigger. + * process integrity info corresponding to current data buffer only. 
+ */ + it = meta->iter; + integrity_bytes = bio_integrity_bytes(bi, bio_sectors(bio)); + if (it.count < integrity_bytes) + return -EINVAL; + + /* should fit into two bytes */ + BUILD_BUG_ON(IO_INTEGRITY_VALID_FLAGS >= (1 << 16)); + + if (meta->flags && (meta->flags & ~IO_INTEGRITY_VALID_FLAGS)) + return -EINVAL; + + it.count = integrity_bytes; + ret = bio_integrity_map_user(bio, &it); + if (!ret) { + bio_uio_meta_to_bip(bio, meta); + bip_set_seed(bio_integrity(bio), meta->seed); + iov_iter_advance(&meta->iter, integrity_bytes); + meta->seed += bio_integrity_intervals(bi, bio_sectors(bio)); + } + return ret; +} + /** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare @@ -564,6 +613,7 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, bip->bip_vec = bip_src->bip_vec; bip->bip_iter = bip_src->bip_iter; bip->bip_flags = bip_src->bip_flags & BIP_CLONE_FLAGS; + bip->app_tag = bip_src->app_tag; return 0; } diff --git a/block/fops.c b/block/fops.c index 13a67940d040..6d5c4fc5a216 100644 --- a/block/fops.c +++ b/block/fops.c @@ -54,6 +54,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct bio bio; ssize_t ret; + WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA); if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { @@ -124,12 +125,16 @@ static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->flags & DIO_SHOULD_DIRTY; + bool is_sync = dio->flags & DIO_IS_SYNC; if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; + if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA)) + bio_integrity_unmap_user(bio); + if (atomic_dec_and_test(&dio->ref)) { - if (!(dio->flags & DIO_IS_SYNC)) { + if (!is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; @@ -221,14 +226,16 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * a retry of this from blocking context. 
*/ if (unlikely(iov_iter_count(iter))) { - bio_release_pages(bio, false); - bio_clear_flag(bio, BIO_REFFED); - bio_put(bio); - blk_finish_plug(&plug); - return -EAGAIN; + ret = -EAGAIN; + goto fail; } bio->bi_opf |= REQ_NOWAIT; } + if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) { + ret = bio_integrity_map_iter(bio, iocb->private); + if (unlikely(ret)) + goto fail; + } if (is_read) { if (dio->flags & DIO_SHOULD_DIRTY) @@ -269,6 +276,12 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio_put(&dio->bio); return ret; +fail: + bio_release_pages(bio, false); + bio_clear_flag(bio, BIO_REFFED); + bio_put(bio); + blk_finish_plug(&plug); + return ret; } static void blkdev_bio_end_io_async(struct bio *bio) @@ -286,6 +299,9 @@ static void blkdev_bio_end_io_async(struct bio *bio) ret = blk_status_to_errno(bio->bi_status); } + if (iocb->ki_flags & IOCB_HAS_METADATA) + bio_integrity_unmap_user(bio); + iocb->ki_complete(iocb, ret); if (dio->flags & DIO_SHOULD_DIRTY) { @@ -330,10 +346,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, bio_iov_bvec_set(bio, iter); } else { ret = bio_iov_iter_get_pages(bio, iter); - if (unlikely(ret)) { - bio_put(bio); - return ret; - } + if (unlikely(ret)) + goto out_bio_put; } dio->size = bio->bi_iter.bi_size; @@ -346,6 +360,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, task_io_account_write(bio->bi_iter.bi_size); } + if (iocb->ki_flags & IOCB_HAS_METADATA) { + ret = bio_integrity_map_iter(bio, iocb->private); + WRITE_ONCE(iocb->private, NULL); + if (unlikely(ret)) + goto out_bio_put; + } + if (iocb->ki_flags & IOCB_ATOMIC) bio->bi_opf |= REQ_ATOMIC; @@ -360,6 +381,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, submit_bio(bio); } return -EIOCBQUEUED; + +out_bio_put: + bio_put(bio); + return ret; } static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 2195bc06dcde..de0a6c9de4d1 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -23,6 +23,7 @@ struct bio_integrity_payload { unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ + u16 app_tag; /* application tag value */ struct bvec_iter bio_iter; /* for rewinding parent bio */ @@ -78,6 +79,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); +int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta); void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); @@ -108,6 +110,11 @@ static int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) return -EINVAL; } +static inline int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) +{ + return -EINVAL; +} + static inline void bio_integrity_unmap_user(struct bio *bio) { } From 546d191427cf5cf3215529744c2ea8558f0279db Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Nov 2024 15:53:58 -0700 Subject: [PATCH 29/56] block: make bio_integrity_map_user() static inline If CONFIG_BLK_DEV_INTEGRITY isn't set, then the dummy helper must be static inline to avoid complaints about the function being unused. 
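A minimal illustration of the warning this avoids (not from the patch): a plain static function defined in a header trips -Wunused-function in every file that includes the header without calling it, whereas a static inline stub stays silent:

	/* in a widely included header */
	static int stub(void) { return -EINVAL; }		/* warns when unused */
	static inline int stub_inline(void) { return -EINVAL; }	/* no warning */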
Fixes: fe8f4ca7107e ("block: modify bio_integrity_map_user to accept iov_iter as argument") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202411300229.y7h60mDg-lkp@intel.com/ Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index de0a6c9de4d1..802f52e38efd 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -105,7 +105,7 @@ static inline void bioset_integrity_free(struct bio_set *bs) { } -static int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) +static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; } From febfbf767174b8a027efb7aa434f9a9416adc23d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 4 Dec 2024 15:39:23 +0000 Subject: [PATCH 30/56] io_uring/kbuf: fix unintentional sign extension on shift of reg.bgid Shifting reg.bgid << IORING_OFF_PBUF_SHIFT results in a promotion from __u16 to a 32 bit signed integer, this is then sign extended to a 64 bit unsigned long on 64 bit architectures. If reg.bgid is greater than 0x7fff then this leads to a sign extended result where all the upper 32 bits of mmap_offset are set to 1. Fix this by casting reg.bgid to the same type as mmap_offset before performing the shift. Fixes: ef62de3c4ad5 ("io_uring/kbuf: use region api for pbuf rings") Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20241204153923.401674-1-colin.i.king@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index e91260a6156b..15e5e6ec5968 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -640,7 +640,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -ENOMEM; } - mmap_offset = reg.bgid << IORING_OFF_PBUF_SHIFT; + mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT; ring_size = flex_array_size(br, bufs, reg.ring_entries); memset(&rd, 0, sizeof(rd)); From 2e6406a20a3999cc761a5a697a9afa7de40713e6 Mon Sep 17 00:00:00 2001 From: David Wei Date: Fri, 6 Dec 2024 16:41:44 -0800 Subject: [PATCH 31/56] io_uring: clean up io_prep_rw_setup() Remove unnecessary call to iov_iter_save_state() in io_prep_rw_setup() as io_import_iovec() already does this. Then the result from io_import_iovec() can be returned directly. 
Signed-off-by: David Wei Reviewed-by: Anuj Gupta Tested-by: Li Zetao Link: https://lore.kernel.org/r/20241207004144.783631-1-dw@davidwei.uk Signed-off-by: Jens Axboe --- io_uring/rw.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 04e4467ab0ee..5b24fd8b69f6 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -240,7 +240,6 @@ done: static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) { struct io_async_rw *rw; - int ret; if (io_rw_alloc_async(req)) return -ENOMEM; @@ -249,12 +248,7 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) return 0; rw = req->async_data; - ret = io_import_iovec(ddir, req, rw, 0); - if (unlikely(ret < 0)) - return ret; - - iov_iter_save_state(&rw->iter, &rw->iter_state); - return 0; + return io_import_iovec(ddir, req, rw, 0); } static inline void io_meta_save_state(struct io_async_rw *io) From de3b9e2e48190be28473ea84c384bc64931facf7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 8 Dec 2024 21:46:01 +0000 Subject: [PATCH 32/56] io_uring: don't vmap single page regions When io_check_coalesce_buffer() meets a single page buffer it bails out and tells that it can be coalesced. That's fine for registered buffers as io_coalesce_buffer() wouldn't change anything, but the region code now uses the function to decided on whether to vmap the buffer or not. Report that a single page buffer is trivially coalescable and let io_sqe_buffer_register() to filter them. Fixes: c4d0ac1c1567 ("io_uring/memmap: optimise single folio regions") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cb83e053f318857068447d40c95becebcd8aeced.1733689833.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 2d2333970eb5..f2ff108485c8 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -675,14 +675,9 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, unsigned int count = 1, nr_folios = 1; int i; - if (nr_pages <= 1) - return false; - data->nr_pages_mid = folio_nr_pages(folio); - if (data->nr_pages_mid == 1) - return false; - data->folio_shift = folio_shift(folio); + /* * Check if pages are contiguous inside a folio, and all folios have * the same page count except for the head and tail. @@ -750,8 +745,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, } /* If it's huge page(s), try to coalesce them into fewer bvec entries */ - if (io_check_coalesce_buffer(pages, nr_pages, &data)) - coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); + if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) { + if (data.nr_pages_mid != 1) + coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); + } imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); if (!imu) From 29b95ac917927ce9f95bf38797e16333ecb489b1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 8 Dec 2024 21:43:22 +0000 Subject: [PATCH 33/56] io_uring: prevent reg-wait speculations With *ENTER_EXT_ARG_REG instead of passing a user pointer with arguments for the waiting loop the user can specify an offset into a pre-mapped region of memory, in which case the [offset, offset + sizeof(io_uring_reg_wait)) will be intepreted as the argument. As we address a kernel array using a user given index, it'd be a subject to speculation type of exploits. Use array_index_nospec() to prevent that. 
Make sure to pass not the full region size but truncate by the maximum offset allowed considering the structure size. Fixes: d617b3147d54c ("io_uring: restore back registered wait arguments") Fixes: aa00f67adc2c0 ("io_uring: add support for fixed wait regions") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1e3d9da7c43d619de7bcf41d1cd277ab2688c443.1733694126.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2e6891aadecc..66a93170ad68 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3211,6 +3211,7 @@ static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, end > ctx->cq_wait_size)) return ERR_PTR(-EFAULT); + offset = array_index_nospec(offset, ctx->cq_wait_size - size); return ctx->cq_wait_arg + offset; } From 479b2f4590bebd4a5a5ac9cb4c4803f4edf86768 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:07 -0500 Subject: [PATCH 34/56] io_uring: Fold allocation into alloc_cache helper The allocation paths that use alloc_cache duplicate the same code pattern, sometimes in a quite convoluted way. Fold the allocation into the cache code itself, making it just an allocator function, and keeping the cache policy invisible to callers. Another justification for doing this, beyond code simplicity, is that it makes it trivial to test the impact of disabling the cache and using slab directly, which I've used for slab improvement experiments. One relevant detail is that we provide a callback to optionally initialize memory only when we actually reach slab. This allows us to avoid blindly executing the allocation with GFP_ZERO and only clean fields when they matter. Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-2-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index b7a38a2069cf..a3a8cfec32ce 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -30,6 +30,19 @@ static inline void *io_alloc_cache_get(struct io_alloc_cache *cache) return NULL; } +static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp, + void (*init_once)(void *obj)) +{ + if (unlikely(!cache->nr_cached)) { + void *obj = kmalloc(cache->elem_size, gfp); + + if (obj && init_once) + init_once(obj); + return obj; + } + return io_alloc_cache_get(cache); +} + /* returns false if the cache was initialized properly */ static inline bool io_alloc_cache_init(struct io_alloc_cache *cache, unsigned max_nr, size_t size) From 49f7a3098cc2406c9cf60d595f4a148d6780bce6 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:08 -0500 Subject: [PATCH 35/56] io_uring: Add generic helper to allocate async data This helper replaces io_alloc_async_data by using the folded allocation. Do it in a header to allow the compiler to decide whether to inline. 
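A hypothetical call site, sketching how an opcode's prep path would use the new helper (the cache name and async type are illustrative only; the following patches convert the real users):

	struct io_foo_async *ia;

	ia = io_uring_alloc_async_data(&req->ctx->foo_cache, req, NULL);
	if (unlikely(!ia))
		return -ENOMEM;
	/* req->async_data now points at ia and REQ_F_ASYNC_DATA is set */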
Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-3-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 12abee607e4a..e43e9194dd0a 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -8,6 +8,7 @@ #include #include #include +#include "alloc_cache.h" #include "io-wq.h" #include "slist.h" #include "filetable.h" @@ -222,6 +223,16 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) req->cqe.flags = cflags; } +static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, + struct io_kiocb *req, + void (*init_once)(void *obj)) +{ + req->async_data = io_cache_alloc(cache, GFP_KERNEL, init_once); + if (req->async_data) + req->flags |= REQ_F_ASYNC_DATA; + return req->async_data; +} + static inline bool req_has_async_data(struct io_kiocb *req) { return req->flags & REQ_F_ASYNC_DATA; From b2846567060638a85724579781fc4a3ca6f137e4 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:09 -0500 Subject: [PATCH 36/56] io_uring/futex: Allocate ifd with generic alloc_cache helper Instead of open-coding the allocation, use the generic alloc_cache helper. Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-4-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/futex.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/io_uring/futex.c b/io_uring/futex.c index e29662f039e1..30139cc150f2 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -251,17 +251,6 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q) io_req_task_work_add(req); } -static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx) -{ - struct io_futex_data *ifd; - - ifd = io_alloc_cache_get(&ctx->futex_cache); - if (ifd) - return ifd; - - return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT); -} - int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); @@ -331,7 +320,7 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags) } io_ring_submit_lock(ctx, issue_flags); - ifd = io_alloc_ifd(ctx); + ifd = io_cache_alloc(&ctx->futex_cache, GFP_NOWAIT, NULL); if (!ifd) { ret = -ENOMEM; goto done_unlock; From 1210872918ef9feff60c9f5a4d3246372b732eae Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:10 -0500 Subject: [PATCH 37/56] io_uring/poll: Allocate apoll with generic alloc_cache helper This abstracts away the cache details to simplify the code. 
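One detail these conversions preserve: the gfp mask handed to io_cache_alloc() is only consulted on a cache miss, when kmalloc() actually runs. A hedged sketch shaped after the futex call site above (demo_get_ifd() is illustrative):

/*
 * A cache hit just pops a previously used object; the gfp mask below only
 * matters when the cache is empty. GFP_NOWAIT keeps the non-sleeping
 * allocation behaviour the open-coded helper had before.
 */
static struct io_futex_data *demo_get_ifd(struct io_ring_ctx *ctx)
{
        return io_cache_alloc(&ctx->futex_cache, GFP_NOWAIT, NULL);
}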
Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-5-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/poll.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index bced9edd5233..cc01c40b43d3 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -648,15 +648,12 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, if (req->flags & REQ_F_POLLED) { apoll = req->apoll; kfree(apoll->double_poll); - } else if (!(issue_flags & IO_URING_F_UNLOCKED)) { - apoll = io_alloc_cache_get(&ctx->apoll_cache); - if (!apoll) - goto alloc_apoll; - apoll->poll.retries = APOLL_MAX_RETRY; } else { -alloc_apoll: - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); - if (unlikely(!apoll)) + if (!(issue_flags & IO_URING_F_UNLOCKED)) + apoll = io_cache_alloc(&ctx->apoll_cache, GFP_ATOMIC, NULL); + else + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + if (!apoll) return NULL; apoll->poll.retries = APOLL_MAX_RETRY; } From e9447dc0b18db40f7c9807f8d0a218f0fa07517f Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:11 -0500 Subject: [PATCH 38/56] io_uring/uring_cmd: Allocate async data through generic helper This abstracts away the cache details and simplify the code. Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-6-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index af842e9b4eb9..d6ff803dbbe1 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -16,22 +16,6 @@ #include "rsrc.h" #include "uring_cmd.h" -static struct uring_cache *io_uring_async_get(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct uring_cache *cache; - - cache = io_alloc_cache_get(&ctx->uring_cache); - if (cache) { - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = cache; - return cache; - } - if (!io_alloc_async_data(req)) - return req->async_data; - return NULL; -} - static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); @@ -185,8 +169,8 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct uring_cache *cache; - cache = io_uring_async_get(req); - if (unlikely(!cache)) + cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req, NULL); + if (!cache) return -ENOMEM; if (!(req->flags & REQ_F_FORCE_ASYNC)) { From f49a85371d8c1b44650ce660652a2fe305d3ddf6 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:12 -0500 Subject: [PATCH 39/56] io_uring/net: Allocate msghdr async data through helper This abstracts away the cache details. 
Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-7-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/net.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index df1f7dc6f1c8..8457408194e7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -155,30 +155,31 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) } } +static void io_msg_async_data_init(void *obj) +{ + struct io_async_msghdr *hdr = (struct io_async_msghdr *)obj; + + hdr->free_iov = NULL; + hdr->free_iov_nr = 0; +} + static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_async_msghdr *hdr; - hdr = io_alloc_cache_get(&ctx->netmsg_cache); - if (hdr) { - if (hdr->free_iov) { - kasan_mempool_unpoison_object(hdr->free_iov, - hdr->free_iov_nr * sizeof(struct iovec)); - req->flags |= REQ_F_NEED_CLEANUP; - } - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = hdr; - return hdr; - } + hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req, + io_msg_async_data_init); + if (!hdr) + return NULL; - if (!io_alloc_async_data(req)) { - hdr = req->async_data; - hdr->free_iov_nr = 0; - hdr->free_iov = NULL; - return hdr; + /* If the async data was cached, we might have an iov cached inside. */ + if (hdr->free_iov) { + kasan_mempool_unpoison_object(hdr->free_iov, + hdr->free_iov_nr * sizeof(struct iovec)); + req->flags |= REQ_F_NEED_CLEANUP; } - return NULL; + return hdr; } /* assign new iovec to kmsg, if we need to */ From d7f11616edf59b255f1302040604f584535876c7 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:13 -0500 Subject: [PATCH 40/56] io_uring/rw: Allocate async data through helper This abstract away the cache details. 
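Both the net conversion above and the rw conversion that follows keep the existing KASAN mempool handshake for the cached iovec. A simplified, hedged sketch of that convention (the demo_* helpers are illustrative):

/* While the iovec sits in the cache it stays poisoned so stray accesses
 * are reported; it is unpoisoned again when handed back to a request. */
static void demo_recycle_iov(struct io_async_msghdr *hdr)
{
        if (hdr->free_iov)
                kasan_mempool_poison_object(hdr->free_iov);
}

static void demo_reuse_iov(struct io_async_msghdr *hdr)
{
        if (hdr->free_iov)
                kasan_mempool_unpoison_object(hdr->free_iov,
                                hdr->free_iov_nr * sizeof(struct iovec));
}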
Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-8-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/rw.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 5b24fd8b69f6..bdfc3faef85d 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -208,33 +208,29 @@ static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) } } +static void io_rw_async_data_init(void *obj) +{ + struct io_async_rw *rw = (struct io_async_rw *)obj; + + rw->free_iovec = 0; + rw->bytes_done = 0; +} + static int io_rw_alloc_async(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_async_rw *rw; - rw = io_alloc_cache_get(&ctx->rw_cache); - if (rw) { - if (rw->free_iovec) { - kasan_mempool_unpoison_object(rw->free_iovec, - rw->free_iov_nr * sizeof(struct iovec)); - req->flags |= REQ_F_NEED_CLEANUP; - } - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = rw; - goto done; - } - - if (!io_alloc_async_data(req)) { - rw = req->async_data; - rw->free_iovec = NULL; - rw->free_iov_nr = 0; -done: + rw = io_uring_alloc_async_data(&ctx->rw_cache, req, io_rw_async_data_init); + if (!rw) + return -ENOMEM; + if (rw->free_iovec) { + kasan_mempool_unpoison_object(rw->free_iovec, + rw->free_iov_nr * sizeof(struct iovec)); + req->flags |= REQ_F_NEED_CLEANUP; rw->bytes_done = 0; - return 0; } - - return -ENOMEM; + return 0; } static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) From ef623a647f423c0d96aa75797cec182e3c5ba47d Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:14 -0500 Subject: [PATCH 41/56] io_uring: Move old async data allocation helper to header There are two remaining uses of the old async data allocator that do not rely on the alloc cache. I don't want to make them use the new allocator helper because that would require a if(cache) check, which will result in dead code for the cached case (for callers passing a cache, gcc can't prove the cache isn't NULL, and will therefore preserve the check. Since this is an inline function and just a few lines long, keep a second helper to deal with cases where we don't have an async data cache. No functional change intended here. This is just moving the helper around and making it inline. 
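A toy illustration of the dead-branch concern described above (demo_alloc() is not code from the tree): folding both paths into one helper taking an optional cache would leave the NULL check on the cached path as well.

/*
 * Per the reasoning above, the compiler tends to keep the if (cache)
 * check even for callers that always pass a cache, so the branch would
 * sit on the hot, cached path. Two small inline helpers avoid that.
 */
static inline void *demo_alloc(struct io_alloc_cache *cache, size_t size)
{
        if (cache)
                return io_cache_alloc(cache, GFP_KERNEL, NULL);
        return kmalloc(size, GFP_KERNEL);
}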
Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-9-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 ------------- io_uring/io_uring.h | 12 ++++++++++++ io_uring/timeout.c | 5 ++--- io_uring/waitid.c | 4 ++-- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 66a93170ad68..e6d12104f8cb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1643,19 +1643,6 @@ io_req_flags_t io_file_get_flags(struct file *file) return res; } -bool io_alloc_async_data(struct io_kiocb *req) -{ - const struct io_issue_def *def = &io_issue_defs[req->opcode]; - - WARN_ON_ONCE(!def->async_size); - req->async_data = kmalloc(def->async_size, GFP_KERNEL); - if (req->async_data) { - req->flags |= REQ_F_ASYNC_DATA; - return false; - } - return true; -} - static u32 io_get_sequence(struct io_kiocb *req) { u32 seq = req->ctx->cached_sq_head; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e43e9194dd0a..032758b28d78 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -12,6 +12,7 @@ #include "io-wq.h" #include "slist.h" #include "filetable.h" +#include "opdef.h" #ifndef CREATE_TRACE_POINTS #include @@ -233,6 +234,17 @@ static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, return req->async_data; } +static inline void *io_uring_alloc_async_data_nocache(struct io_kiocb *req) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + + WARN_ON_ONCE(!def->async_size); + req->async_data = kmalloc(def->async_size, GFP_KERNEL); + if (req->async_data) + req->flags |= REQ_F_ASYNC_DATA; + return req->async_data; +} + static inline bool req_has_async_data(struct io_kiocb *req) { return req->flags & REQ_F_ASYNC_DATA; diff --git a/io_uring/timeout.c b/io_uring/timeout.c index bbe58638eca7..a166fd90667a 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -525,10 +525,9 @@ static int __io_timeout_prep(struct io_kiocb *req, if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; - if (io_alloc_async_data(req)) + data = io_uring_alloc_async_data_nocache(req); + if (!data) return -ENOMEM; - - data = req->async_data; data->req = req; data->flags = flags; diff --git a/io_uring/waitid.c b/io_uring/waitid.c index daef5dd644f0..6778c0ee76c4 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -303,10 +303,10 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags) struct io_waitid_async *iwa; int ret; - if (io_alloc_async_data(req)) + iwa = io_uring_alloc_async_data_nocache(req); + if (!iwa) return -ENOMEM; - iwa = req->async_data; iwa->req = req; ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info, From ce9464081d5168ee0f279d6932ba82260a5b97c4 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:15 -0500 Subject: [PATCH 42/56] io_uring/msg_ring: Drop custom destructor kfree can handle slab objects nowadays. Drop the extra callback and just use kfree. 
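A minimal sketch of what the cleanup relies on (demo_alloc_and_free() is illustrative): memory obtained from a dedicated kmem_cache no longer needs a matching kmem_cache_free() wrapper.

static void demo_alloc_and_free(void)
{
        struct io_kiocb *req = kmem_cache_alloc(req_cachep, GFP_KERNEL);

        kfree(req);     /* previously required kmem_cache_free(req_cachep, req) */
}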
Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-10-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/msg_ring.c | 7 ------- io_uring/msg_ring.h | 1 - 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e6d12104f8cb..42d4cc5da73b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -361,7 +361,7 @@ err: io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->uring_cache, kfree); - io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); + io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); @@ -2696,7 +2696,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->uring_cache, kfree); - io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); + io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); io_destroy_buffers(ctx); io_free_region(ctx, &ctx->param_region); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 333c220d322a..bd3cd78d2dba 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -354,10 +354,3 @@ int io_uring_sync_msg_ring(struct io_uring_sqe *sqe) return __io_msg_ring_data(fd_file(f)->private_data, &io_msg, IO_URING_F_UNLOCKED); } - -void io_msg_cache_free(const void *entry) -{ - struct io_kiocb *req = (struct io_kiocb *) entry; - - kmem_cache_free(req_cachep, req); -} diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h index 38e7f8f0c944..32236d2fb778 100644 --- a/io_uring/msg_ring.h +++ b/io_uring/msg_ring.h @@ -4,4 +4,3 @@ int io_uring_sync_msg_ring(struct io_uring_sqe *sqe); int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags); void io_msg_ring_cleanup(struct io_kiocb *req); -void io_msg_cache_free(const void *entry); From 1143be17d7acb02a7c4dba6169a33534983f4960 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2024 07:44:25 -0700 Subject: [PATCH 43/56] io_uring/rw: don't mask in f_iocb_flags A previous commit changed overwriting kiocb->ki_flags with ->f_iocb_flags with masking it in. This breaks for retry situations, where we don't necessarily want to retain previously set flags, like IOCB_NOWAIT. The use case needs IOCB_HAS_METADATA to be persistent, but the change makes all flags persistent, which is an issue. Add a request flag to track whether the request has metadata or not, as that is persistent across issues. 
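A toy illustration of the flag leak being fixed (demo_init_flags() is illustrative, not the kernel code): OR-ing the file's flags in on every (re)issue keeps whatever per-issue bits were set last time, while plain assignment starts from a clean slate, and per-request state such as metadata lives in req->flags instead.

static void demo_init_flags(struct kiocb *kiocb, struct file *file)
{
        /*
         * With "|=", an IOCB_NOWAIT set only for the first, non-blocking
         * attempt would still be present when the request is reissued.
         */
        kiocb->ki_flags = file->f_iocb_flags;   /* not |= */
}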
Fixes: 59a7d12a7fb5 ("io_uring: introduce attributes for read/write and PI support") Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ io_uring/rw.c | 18 +++++++++--------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 73575d545d3c..623d8e798a11 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -480,6 +480,7 @@ enum { REQ_F_BL_NO_RECYCLE_BIT, REQ_F_BUFFERS_COMMIT_BIT, REQ_F_BUF_NODE_BIT, + REQ_F_HAS_METADATA_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -560,6 +561,8 @@ enum { REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT), /* buf node is valid */ REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT), + /* request has read/write metadata assigned */ + REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/rw.c b/io_uring/rw.c index bdfc3faef85d..d0ac4a51420e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -283,7 +283,7 @@ static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, pi_attr.len, &io->meta.iter); if (unlikely(ret < 0)) return ret; - rw->kiocb.ki_flags |= IOCB_HAS_METADATA; + req->flags |= REQ_F_HAS_METADATA; io_meta_save_state(io); return ret; } @@ -787,18 +787,17 @@ static bool io_rw_should_retry(struct io_kiocb *req) struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; - /* never retry for NOWAIT, we just complete with -EAGAIN */ - if (req->flags & REQ_F_NOWAIT) + /* + * Never retry for NOWAIT or a request with metadata, we just complete + * with -EAGAIN. + */ + if (req->flags & (REQ_F_NOWAIT | REQ_F_HAS_METADATA)) return false; /* Only for buffered IO */ if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) return false; - /* never retry for meta io */ - if (kiocb->ki_flags & IOCB_HAS_METADATA) - return false; - /* * just use poll if we can, and don't attempt if the fs doesn't * support callback based unlocks @@ -849,7 +848,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (!(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(file); - kiocb->ki_flags |= file->f_iocb_flags; + kiocb->ki_flags = file->f_iocb_flags; ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); if (unlikely(ret)) return ret; @@ -883,7 +882,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) kiocb->ki_complete = io_complete_rw; } - if (kiocb->ki_flags & IOCB_HAS_METADATA) { + if (req->flags & REQ_F_HAS_METADATA) { struct io_async_rw *io = req->async_data; /* @@ -892,6 +891,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) */ if (!(req->file->f_flags & O_DIRECT)) return -EOPNOTSUPP; + kiocb->ki_flags |= IOCB_HAS_METADATA; kiocb->private = &io->meta; } From 21adbcaa8007f5e584d26bebb46ec46bfbbbd330 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2024 12:43:49 -0700 Subject: [PATCH 44/56] io_uring/rw: use NULL for rw->free_iovec assigment It's a pointer, don't use 0 for that. sparse throws a warning for that, as the kernel test robot noticed. 
Fixes: d7f11616edf5 ("io_uring/rw: Allocate async data through helper") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412180253.YML3qN4d-lkp@intel.com/ Signed-off-by: Jens Axboe --- io_uring/rw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index d0ac4a51420e..75f70935ccf4 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -212,7 +212,7 @@ static void io_rw_async_data_init(void *obj) { struct io_async_rw *rw = (struct io_async_rw *)obj; - rw->free_iovec = 0; + rw->free_iovec = NULL; rw->bytes_done = 0; } From c5f71916146033f9aba108075ff7087022075fd6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Dec 2024 09:49:32 -0700 Subject: [PATCH 45/56] io_uring/rw: always clear ->bytes_done on io_async_rw setup A previous commit mistakenly moved the clearing of the in-progress byte count into the section that's dependent on having a cached iovec or not, but it should be cleared for any IO. If not, then extra bytes may be added at IO completion time, causing potentially weird behavior like over-reporting the amount of IO done. Fixes: d7f11616edf5 ("io_uring/rw: Allocate async data through helper") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202412271132.a09c3500-lkp@intel.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 75f70935ccf4..ca1b19d3d142 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -228,8 +228,8 @@ static int io_rw_alloc_async(struct io_kiocb *req) kasan_mempool_unpoison_object(rw->free_iovec, rw->free_iov_nr * sizeof(struct iovec)); req->flags |= REQ_F_NEED_CLEANUP; - rw->bytes_done = 0; } + rw->bytes_done = 0; return 0; } From d62c2f0d82753a05133411b1e242baf31f4ef68e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Dec 2024 17:34:41 -0700 Subject: [PATCH 46/56] io_uring: ensure io_queue_deferred() is out-of-line This is not the hot path, it's a slow path. Yet the locking for it is in the hot path, and __cold does not prevent it from being inlined. Move the locking to the function itself, and mark it noinline as well to avoid it polluting the icache of the hot path. 
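As a generic sketch of the attribute combination (demo_slow_path() is illustrative): __cold affects section placement and branch weighting but, as noted above, does not forbid inlining; noinline is what actually keeps the body out of the caller.

static __cold noinline void demo_slow_path(spinlock_t *lock)
{
        /*
         * __cold puts this in the unlikely-text section and marks call
         * sites as unlikely; noinline guarantees the slow path and its
         * locking never land in the hot caller's instruction stream.
         */
        spin_lock(lock);
        /* rarely executed housekeeping */
        spin_unlock(lock);
}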
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 42d4cc5da73b..db198bd435b5 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -550,8 +550,9 @@ void io_req_queue_iowq(struct io_kiocb *req) io_req_task_work_add(req); } -static __cold void io_queue_deferred(struct io_ring_ctx *ctx) +static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) { + spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); @@ -562,6 +563,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) io_req_task_queue(de->req); kfree(de); } + spin_unlock(&ctx->completion_lock); } void __io_commit_cqring_flush(struct io_ring_ctx *ctx) @@ -570,11 +572,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); - if (ctx->drain_active) { - spin_lock(&ctx->completion_lock); + if (ctx->drain_active) io_queue_deferred(ctx); - spin_unlock(&ctx->completion_lock); - } if (ctx->has_evfd) io_eventfd_flush_signal(ctx); } From 2a51c327d4a4a2eb62d67f4ea13a17efd0f25c5c Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Fri, 3 Jan 2025 22:04:11 +0700 Subject: [PATCH 47/56] io_uring/rsrc: simplify the bvec iter count calculation As we don't use iov_iter_advance() but our own logic in io_import_fixed(), we can remove the logic that over-sets the iter's count to len + offset then adjusts it later to len. This helps to make the code cleaner. Signed-off-by: Bui Quang Minh Link: https://lore.kernel.org/r/20250103150412.12549-1-minhquangbui99@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f2ff108485c8..964a47c8d85e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -882,7 +882,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter, * and advance us to the beginning. */ offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len); if (offset) { /* @@ -904,7 +904,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter, const struct bio_vec *bvec = imu->bvec; if (offset < bvec->bv_len) { - iter->count -= offset; iter->iov_offset = offset; } else { unsigned long seg_skip; @@ -915,7 +914,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter, iter->bvec += seg_skip; iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); } } From 9ac273ae3dc296905b4d61e4c8e7a25592f6d183 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 7 Jan 2025 09:52:54 -0700 Subject: [PATCH 48/56] io_uring/rw: use io_rw_recycle() from cleanup path Cleanup should always have the uring lock held, it's safe to recycle from here. 
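The hunk below encodes that assumption with lockdep_assert_held(); a short, generic sketch of what the annotation buys (demo_cleanup() is illustrative):

static void demo_cleanup(struct io_ring_ctx *ctx)
{
        /*
         * Documents the locking contract and, on lockdep-enabled kernels,
         * splats at runtime if a caller gets here without the lock,
         * instead of racing silently.
         */
        lockdep_assert_held(&ctx->uring_lock);
        /* ctx-wide caches may be touched safely from this point */
}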
Signed-off-by: Jens Axboe --- io_uring/rw.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index ca1b19d3d142..afc669048c5d 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -434,7 +434,8 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) void io_readv_writev_cleanup(struct io_kiocb *req) { - io_rw_iovec_free(req->async_data); + lockdep_assert_held(&req->ctx->uring_lock); + io_rw_recycle(req, 0); } static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) From d803d123948feffbd992213e144df224097f82b0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 7 Jan 2025 10:59:35 -0700 Subject: [PATCH 49/56] io_uring/rw: handle -EAGAIN retry at IO completion time Rather than try and have io_read/io_write turn REQ_F_REISSUE into -EAGAIN, catch the REQ_F_REISSUE when the request is otherwise considered as done. This is saner as we know this isn't happening during an actual submission, and it removes the need to randomly check REQ_F_REISSUE after read/write submission. If REQ_F_REISSUE is set, __io_submit_flush_completions() will skip over this request in terms of posting a CQE, and the regular request cleaning will ensure that it gets reissued via io-wq. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 15 +++++++-- io_uring/rw.c | 80 ++++++++++++++------------------------------- 2 files changed, 38 insertions(+), 57 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index db198bd435b5..92ba2fdcd087 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -115,7 +115,7 @@ REQ_F_ASYNC_DATA) #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ - IO_REQ_CLEAN_FLAGS) + REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS) #define IO_TCTX_REFS_CACHE_NR (1U << 10) @@ -1403,6 +1403,12 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, comp_list); if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { + if (req->flags & REQ_F_REISSUE) { + node = req->comp_list.next; + req->flags &= ~REQ_F_REISSUE; + io_queue_iowq(req); + continue; + } if (req->flags & REQ_F_REFCOUNT) { node = req->comp_list.next; if (!req_ref_put_and_test(req)) @@ -1442,7 +1448,12 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (!(req->flags & REQ_F_CQE_SKIP) && + /* + * Requests marked with REQUEUE should not post a CQE, they + * will go through the io-wq retry machinery and post one + * later. + */ + if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); diff --git a/io_uring/rw.c b/io_uring/rw.c index afc669048c5d..c52c0515f0a2 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -202,7 +202,7 @@ static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) * mean that the underlying data can be gone at any time. But that * should be fixed seperately, and then this check could be killed. 
*/ - if (!(req->flags & REQ_F_REFCOUNT)) { + if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) { req->flags &= ~REQ_F_NEED_CLEANUP; io_rw_recycle(req, issue_flags); } @@ -455,19 +455,12 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) return NULL; } -#ifdef CONFIG_BLOCK -static void io_resubmit_prep(struct io_kiocb *req) -{ - struct io_async_rw *io = req->async_data; - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - - io_meta_restore(io, &rw->kiocb); - iov_iter_restore(&io->iter, &io->iter_state); -} - static bool io_rw_should_reissue(struct io_kiocb *req) { +#ifdef CONFIG_BLOCK + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); umode_t mode = file_inode(req->file)->i_mode; + struct io_async_rw *io = req->async_data; struct io_ring_ctx *ctx = req->ctx; if (!S_ISBLK(mode) && !S_ISREG(mode)) @@ -488,17 +481,14 @@ static bool io_rw_should_reissue(struct io_kiocb *req) */ if (!same_thread_group(req->tctx->task, current) || !in_task()) return false; + + io_meta_restore(io, &rw->kiocb); + iov_iter_restore(&io->iter, &io->iter_state); return true; -} #else -static void io_resubmit_prep(struct io_kiocb *req) -{ -} -static bool io_rw_should_reissue(struct io_kiocb *req) -{ return false; -} #endif +} static void io_req_end_write(struct io_kiocb *req) { @@ -525,22 +515,16 @@ static void io_req_io_end(struct io_kiocb *req) } } -static bool __io_complete_rw_common(struct io_kiocb *req, long res) +static void __io_complete_rw_common(struct io_kiocb *req, long res) { - if (unlikely(res != req->cqe.res)) { - if (res == -EAGAIN && io_rw_should_reissue(req)) { - /* - * Reissue will start accounting again, finish the - * current cycle. - */ - io_req_io_end(req); - req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; - return true; - } + if (res == req->cqe.res) + return; + if (res == -EAGAIN && io_rw_should_reissue(req)) { + req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; + } else { req_set_fail(req); req->cqe.res = res; } - return false; } static inline int io_fixup_rw_res(struct io_kiocb *req, long res) @@ -583,8 +567,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) struct io_kiocb *req = cmd_to_io_kiocb(rw); if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) { - if (__io_complete_rw_common(req, res)) - return; + __io_complete_rw_common(req, res); io_req_set_res(req, io_fixup_rw_res(req, res), 0); } req->io_task_work.func = io_req_rw_complete; @@ -646,26 +629,19 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (ret >= 0 && req->flags & REQ_F_CUR_POS) req->file->f_pos = rw->kiocb.ki_pos; if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) { - if (!__io_complete_rw_common(req, ret)) { - /* - * Safe to call io_end from here as we're inline - * from the submission path. - */ - io_req_io_end(req); - io_req_set_res(req, final_ret, - io_put_kbuf(req, ret, issue_flags)); - io_req_rw_cleanup(req, issue_flags); - return IOU_OK; - } + __io_complete_rw_common(req, ret); + /* + * Safe to call io_end from here as we're inline + * from the submission path. 
+ */ + io_req_io_end(req); + io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags)); + io_req_rw_cleanup(req, issue_flags); + return IOU_OK; } else { io_rw_done(&rw->kiocb, ret); } - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - io_resubmit_prep(req); - return -EAGAIN; - } return IOU_ISSUE_SKIP_COMPLETE; } @@ -944,8 +920,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) if (ret == -EOPNOTSUPP && force_nonblock) ret = -EAGAIN; - if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { - req->flags &= ~REQ_F_REISSUE; + if (ret == -EAGAIN) { /* If we can poll, just do that. */ if (io_file_can_poll(req)) return -EAGAIN; @@ -1154,11 +1129,6 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) else ret2 = -EINVAL; - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - ret2 = -EAGAIN; - } - /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just * retry them without IOCB_NOWAIT. From b08e02045002e0412441d5243b0ee1a9bfc3952b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 7 Jan 2025 11:07:47 -0700 Subject: [PATCH 50/56] io_uring/rw: don't gate retry on completion context nvme multipath reports that they see spurious -EAGAIN bubbling back to userspace, which is caused by how they handle retries internally through a kworker. However, any data that needs preserving or importing for a read/write request has always been done so at prep time, and we can sanely skip this check. Reported-by: "Haeuptle, Michael" Link: https://lore.kernel.org/io-uring/DS7PR84MB31105C2C63CFA47BE8CBD6EE95102@DS7PR84MB3110.NAMPRD84.PROD.OUTLOOK.COM/ Signed-off-by: Jens Axboe --- io_uring/rw.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index c52c0515f0a2..ee5d38db9b48 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -475,12 +475,6 @@ static bool io_rw_should_reissue(struct io_kiocb *req) */ if (percpu_ref_is_dying(&ctx->refs)) return false; - /* - * Play it safe and assume not safe to re-import and reissue if we're - * not in the original thread group (or in task context). - */ - if (!same_thread_group(req->tctx->task, current) || !in_task()) - return false; io_meta_restore(io, &rw->kiocb); iov_iter_restore(&io->iter, &io->iter_state); From 94d57442e56d2ad2ca20d096040b8ae6f216a921 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 5 Dec 2024 11:51:09 +0530 Subject: [PATCH 51/56] io_uring: expose read/write attribute capability After commit 9a213d3b80c0, we can pass additional attributes along with read/write. However, userspace doesn't know that. Add a new feature flag IORING_FEAT_RW_ATTR, to notify the userspace that the kernel has this ability. Signed-off-by: Anuj Gupta Reviewed-by: Li Zetao Reviewed-by: Martin K. Petersen Tested-by: Martin K. 
Petersen Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20241205062109.1788-1-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/io_uring.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 38f0d6b10eaf..e11c82638527 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -577,6 +577,7 @@ struct io_uring_params { #define IORING_FEAT_REG_REG_RING (1U << 13) #define IORING_FEAT_RECVSEND_BUNDLE (1U << 14) #define IORING_FEAT_MIN_TIMEOUT (1U << 15) +#define IORING_FEAT_RW_ATTR (1U << 16) /* * io_uring_register(2) opcodes and arguments diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 92ba2fdcd087..af03e9973b58 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3713,7 +3713,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | - IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT; + IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT | + IORING_FEAT_RW_ATTR; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; From a13030fd194c88961be4679f87a1380f1bda0ebe Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Mon, 13 Jan 2025 23:03:31 +0700 Subject: [PATCH 52/56] io_uring: simplify the SQPOLL thread check when cancelling requests In io_uring_try_cancel_requests, we check whether sq_data->thread == current to determine if the function is called by the SQPOLL thread to do iopoll when IORING_SETUP_SQPOLL is set. This check can race with the SQPOLL thread termination. io_uring_cancel_generic is used in 2 places: io_uring_cancel_generic and io_ring_exit_work. In io_uring_cancel_generic, we have the information whether the current is SQPOLL thread already. And the SQPOLL thread never reaches io_ring_exit_work. So to avoid the racy check, this commit adds a boolean flag to io_uring_try_cancel_requests to determine if the caller is SQPOLL thread. 
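Returning to the feature bit added in the previous patch: userspace can detect it at ring setup time. A hedged liburing-style sketch (ring_supports_rw_attr() is illustrative, and IORING_FEAT_RW_ATTR plus io_uring_queue_init_params() are assumed to come from sufficiently recent kernel headers and liburing):

#include <liburing.h>
#include <stdbool.h>

static bool ring_supports_rw_attr(void)
{
        struct io_uring_params p = { };
        struct io_uring ring;
        bool supported;

        if (io_uring_queue_init_params(8, &ring, &p) < 0)
                return false;
        supported = p.features & IORING_FEAT_RW_ATTR;
        io_uring_queue_exit(&ring);
        return supported;
}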
Reported-by: syzbot+3c750be01dab672c513d@syzkaller.appspotmail.com Reported-by: Li Zetao Reviewed-by: Li Zetao Signed-off-by: Bui Quang Minh Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20250113160331.44057-1-minhquangbui99@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index af03e9973b58..20a46bc671ea 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -143,7 +143,8 @@ struct io_defer_entry { static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct io_uring_task *tctx, - bool cancel_all); + bool cancel_all, + bool is_sqpoll_thread); static void io_queue_sqe(struct io_kiocb *req); @@ -2869,7 +2870,8 @@ static __cold void io_ring_exit_work(struct work_struct *work) if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); - while (io_uring_try_cancel_requests(ctx, NULL, true)) + /* The SQPOLL thread never reaches this path */ + while (io_uring_try_cancel_requests(ctx, NULL, true, false)) cond_resched(); if (ctx->sq_data) { @@ -3037,7 +3039,8 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct io_uring_task *tctx, - bool cancel_all) + bool cancel_all, + bool is_sqpoll_thread) { struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; enum io_wq_cancel cret; @@ -3067,7 +3070,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, /* SQPOLL thread does its own polling */ if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || - (ctx->sq_data && ctx->sq_data->thread == current)) { + is_sqpoll_thread) { while (!wq_list_empty(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; @@ -3140,13 +3143,15 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) continue; loop |= io_uring_try_cancel_requests(node->ctx, current->io_uring, - cancel_all); + cancel_all, + false); } } else { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) loop |= io_uring_try_cancel_requests(ctx, current->io_uring, - cancel_all); + cancel_all, + true); } if (loop) { From 19d340a2988d4f3e673cded9dde405d727d7e248 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 14 Jan 2025 18:49:00 +0100 Subject: [PATCH 53/56] io_uring/rsrc: require cloned buffers to share accounting contexts When IORING_REGISTER_CLONE_BUFFERS is used to clone buffers from uring instance A to uring instance B, where A and B use different MMs for accounting, the accounting can go wrong: If uring instance A is closed before uring instance B, the pinned memory counters for uring instance B will be decremented, even though the pinned memory was originally accounted through uring instance A; so the MM of uring instance B can end up with negative locked memory. 
Cc: stable@vger.kernel.org Closes: https://lore.kernel.org/r/CAG48ez1zez4bdhmeGLEFxtbFADY4Czn3CV0u9d_TMcbvRA01bg@mail.gmail.com Fixes: 7cc2a6eadcd7 ("io_uring: add IORING_REGISTER_COPY_BUFFERS method") Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20250114-uring-check-accounting-v1-1-42e4145aa743@google.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 964a47c8d85e..688e277f0335 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -928,6 +928,13 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx int i, ret, off, nr; unsigned int nbufs; + /* + * Accounting state is shared between the two rings; that only works if + * both rings are accounted towards the same counters. + */ + if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account) + return -EINVAL; + /* if offsets are given, must have nr specified too */ if (!arg->nr && (arg->dst_off || arg->src_off)) return -EINVAL; From 53745105efc3a865d07ca4973b12ecb63c977bb7 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 15 Jan 2025 11:14:33 +0200 Subject: [PATCH 54/56] io_uring: Factor out a function to parse restrictions Preparation for subsequent work on inherited restrictions. Signed-off-by: Josh Triplett Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/9bac2b4d1b9b9ab41c55ea3816021be847f354df.1736932318.git.josh@joshtriplett.org Signed-off-by: Jens Axboe --- io_uring/register.c | 64 +++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/io_uring/register.c b/io_uring/register.c index 5e48413706ac..4d507a0390e8 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -104,21 +104,13 @@ static int io_register_personality(struct io_ring_ctx *ctx) return id; } -static __cold int io_register_restrictions(struct io_ring_ctx *ctx, - void __user *arg, unsigned int nr_args) +static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args, + struct io_restriction *restrictions) { struct io_uring_restriction *res; size_t size; int i, ret; - /* Restrictions allowed only if rings started disabled */ - if (!(ctx->flags & IORING_SETUP_R_DISABLED)) - return -EBADFD; - - /* We allow only a single restrictions registration */ - if (ctx->restrictions.registered) - return -EBUSY; - if (!arg || nr_args > IORING_MAX_RESTRICTIONS) return -EINVAL; @@ -130,47 +122,57 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx, if (IS_ERR(res)) return PTR_ERR(res); - ret = 0; + ret = -EINVAL; for (i = 0; i < nr_args; i++) { switch (res[i].opcode) { case IORING_RESTRICTION_REGISTER_OP: - if (res[i].register_op >= IORING_REGISTER_LAST) { - ret = -EINVAL; - goto out; - } - - __set_bit(res[i].register_op, - ctx->restrictions.register_op); + if (res[i].register_op >= IORING_REGISTER_LAST) + goto err; + __set_bit(res[i].register_op, restrictions->register_op); break; case IORING_RESTRICTION_SQE_OP: - if (res[i].sqe_op >= IORING_OP_LAST) { - ret = -EINVAL; - goto out; - } - - __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); + if (res[i].sqe_op >= IORING_OP_LAST) + goto err; + __set_bit(res[i].sqe_op, restrictions->sqe_op); break; case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: - ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; + restrictions->sqe_flags_allowed = res[i].sqe_flags; break; case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: - ctx->restrictions.sqe_flags_required = res[i].sqe_flags; + 
restrictions->sqe_flags_required = res[i].sqe_flags; break; default: - ret = -EINVAL; - goto out; + goto err; } } -out: + ret = 0; + +err: + kfree(res); + return ret; +} + +static __cold int io_register_restrictions(struct io_ring_ctx *ctx, + void __user *arg, unsigned int nr_args) +{ + int ret; + + /* Restrictions allowed only if rings started disabled */ + if (!(ctx->flags & IORING_SETUP_R_DISABLED)) + return -EBADFD; + + /* We allow only a single restrictions registration */ + if (ctx->restrictions.registered) + return -EBUSY; + + ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions); /* Reset all restrictions if an error happened */ if (ret != 0) memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); else ctx->restrictions.registered = true; - - kfree(res); return ret; } From bab4b2cca027fbc4effc0ef60615a35bfee96ad0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jan 2025 15:40:48 +0000 Subject: [PATCH 55/56] io_uring: reuse io_should_terminate_tw() for cmds io_uring_cmd_work() rolled a hard coded version of io_should_terminate_tw() to avoid conflicts, but now it's time to converge them. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8a88dd6e4ed8e6c00c6552af0c20c9de02e458de.1736955455.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index d6ff803dbbe1..d235043db21e 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -105,7 +105,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); unsigned int flags = IO_URING_F_COMPLETE_DEFER; - if (current->flags & (PF_EXITING | PF_KTHREAD)) + if (io_should_terminate_tw()) flags |= IO_URING_F_TASK_DEAD; /* task_work executor checks the deffered list completion */ From 561e3a0c40dc7e3ab7b0b3647a2b89eca16215d9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 19 Jan 2025 03:26:49 +0000 Subject: [PATCH 56/56] io_uring/fdinfo: fix io_uring_show_fdinfo() misuse of ->d_iname Output of io_uring_show_fdinfo() has several problems: * racy use of ->d_iname * junk if the name is long - in that case it's not stored in ->d_iname at all * lack of quoting (names can contain newlines, etc. - or be equal to "", for that matter). * lines for empty slots are pointless noise - we already have the total amount, so having just the non-empty ones would carry the same information. Signed-off-by: Al Viro Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index b214e5a407b5..f60d0a9d505e 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -211,10 +211,11 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) if (ctx->file_table.data.nodes[i]) f = io_slot_file(ctx->file_table.data.nodes[i]); - if (f) - seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); - else - seq_printf(m, "%5u: \n", i); + if (f) { + seq_printf(m, "%5u: ", i); + seq_file_path(m, f, " \t\n\\"); + seq_puts(m, "\n"); + } } seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr); for (i = 0; has_lock && i < ctx->buf_table.nr; i++) {