mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-09-18 22:14:16 +00:00
block-6.16-20250614
-----BEGIN PGP SIGNATURE-----

iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmhNaUIQHGF4Ym9lQGtl
cm5lbC5kawAKCRD301j7KXHgpvUOD/0WlwBN8WxAA+rzUXo42QJ3W+XruQ+VhQdx
Hs/DBEH6KZji86ZVzoJOIwsdlSL2/6PxRIZVqwr3Q8aYnNedUnsjcD4frNBl76EA
wFfPttjL7DcaOvVhY0n37IrQNmaeQC1R1O2JxhiWTBzNoNf2iWj84vSSgbgcfDVR
trfhRvEwRgmAy037/72pUFYN+JRlv80D03SGfWTQtp6/qq+AA/z5XqWdg9I/opVM
7+H5GoWHfPSG0wQo+Dms3mHV4zm5tOOfMmGIR2o4DoueKgMNgnUXRT8dc7DDBsqV
0moKRHKbTbeN1fz3zqcko0Mp1gq+62hF/eXppQSeJMpMuAbcxaA+/ZFv7Ho9ZwYF
jJwcp0O5e8XbRHFqrYWysKGKSvYfvTjr08X70+QFzm9ZJaGCtJYd2ceUNmyO2p6s
m54gUnPq5d3nABbpCkAdP5sAv0yVV5idIoezCHIaBYQv8qPpKDrdHHXTQY/VX05x
VBGmg9hUZSDMiGkR1d4oKTBayehuWVIpyczhy65KbAfoBA62hAl+aAldkpvLRo1r
gKsrMSGP/H6zBU/IRaMGc/bnEnP6zFkn5vxnGwpDcD2tdJn0g+yEjIvJSXrmGJ0w
lwzqYd3/vhFPmaEDxE3PyOOGBVCOPqGic+Y6OEIuHA3p2HFO3bsh6+64+iqls/so
EmiHPp7n5g==
=N1zM
-----END PGP SIGNATURE-----

Merge tag 'block-6.16-20250614' of git://git.kernel.dk/linux

Pull block fixes from Jens Axboe:

 - Fix for a deadlock on queue freeze with zoned writes

 - Fix for zoned append emulation

 - Two bio folio fixes, for sparsemem and for very large folios

 - Fix for a performance regression introduced in 6.13 when plug
   insertion was changed

 - Fix for NVMe passthrough handling for polled IO

 - Document the ublk auto registration feature

 - loop lockdep warning fix

* tag 'block-6.16-20250614' of git://git.kernel.dk/linux:
  nvme: always punt polled uring_cmd end_io work to task_work
  Documentation: ublk: Separate UBLK_F_AUTO_BUF_REG fallback behavior sublists
  block: Fix bvec_set_folio() for very large folios
  bio: Fix bio_first_folio() for SPARSEMEM without VMEMMAP
  block: use plug request list tail for one-shot backmerge attempt
  block: don't use submit_bio_noacct_nocheck in blk_zone_wplug_bio_work
  block: Clear BIO_EMULATES_ZONE_APPEND flag on BIO completion
  ublk: document auto buffer registration(UBLK_F_AUTO_BUF_REG)
  loop: move lo_set_size() out of queue freeze
commit f713ffa363
7 changed files with 114 additions and 38 deletions
@@ -352,6 +352,83 @@ For reaching best IO performance, ublk server should align its segment

parameter of `struct ublk_param_segment` with backend for avoiding
unnecessary IO split, which usually hurts io_uring performance.

Auto Buffer Registration
------------------------

The ``UBLK_F_AUTO_BUF_REG`` feature automatically handles buffer registration
and unregistration for I/O requests, which simplifies buffer management and
reduces overhead in the ublk server implementation.

This is another feature flag for zero-copy I/O, and it is compatible with
``UBLK_F_SUPPORT_ZERO_COPY``.

Feature Overview
~~~~~~~~~~~~~~~~

This feature automatically registers request buffers with the io_uring context
before delivering I/O commands to the ublk server, and unregisters them when
the I/O commands are completed. This eliminates the need for manual buffer
registration and unregistration via the ``UBLK_IO_REGISTER_IO_BUF`` and
``UBLK_IO_UNREGISTER_IO_BUF`` commands, so I/O handling in the ublk server no
longer depends on those two uring_cmd operations.

I/Os cannot be issued concurrently to io_uring if there is any dependency
among them. Removing the dependency on the buffer registration and
unregistration commands therefore not only simplifies the ublk server
implementation, but also makes concurrent I/O handling possible.

Usage Requirements
~~~~~~~~~~~~~~~~~~

1. The ublk server must create a sparse buffer table on the same
   ``io_ring_ctx`` used for ``UBLK_IO_FETCH_REQ`` and
   ``UBLK_IO_COMMIT_AND_FETCH_REQ``. If the uring_cmd is issued on a
   different ``io_ring_ctx``, manual buffer unregistration is required.

2. Buffer registration data must be passed via the uring_cmd's ``sqe->addr``
   with the following structure::

       struct ublk_auto_buf_reg {
               __u16 index;      /* Buffer index for registration */
               __u8 flags;       /* Registration flags */
               __u8 reserved0;   /* Reserved for future use */
               __u32 reserved1;  /* Reserved for future use */
       };

   ublk_auto_buf_reg_to_sqe_addr() converts the above structure into
   ``sqe->addr`` (see the sketch after this list).

3. All reserved fields in ``ublk_auto_buf_reg`` must be zeroed.

4. Optional flags can be passed via ``ublk_auto_buf_reg.flags``.
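
As a rough illustration of requirements 2-4, a ublk server might fill in the
structure and store it in the SQE as below. This is a minimal sketch, not code
from the kernel tree: ``set_auto_buf_reg()`` and the one-slot-per-tag index
policy are illustrative assumptions, while the struct, flag and helper names
are the ones documented above, assumed to come from the ublk UAPI header
``<linux/ublk_cmd.h>``::

    #include <linux/io_uring.h>   /* struct io_uring_sqe */
    #include <linux/ublk_cmd.h>   /* struct ublk_auto_buf_reg, flags, helper */

    /*
     * Prepare the auto buffer registration descriptor for one I/O tag and
     * store it in the uring_cmd SQE that carries UBLK_IO_FETCH_REQ or
     * UBLK_IO_COMMIT_AND_FETCH_REQ for that tag.
     */
    static void set_auto_buf_reg(struct io_uring_sqe *sqe, __u16 tag)
    {
            struct ublk_auto_buf_reg reg = {
                    .index = tag,   /* one buffer table slot per tag */
                    .flags = UBLK_AUTO_BUF_REG_FALLBACK,    /* optional */
                    /* reserved0 and reserved1 stay zero, as required */
            };

            sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&reg);
    }

Once such a command completes, the request buffer occupies the chosen slot of
the ring's buffer table, so the backend I/O SQE can reference it by the same
index without any ``UBLK_IO_REGISTER_IO_BUF``/``UBLK_IO_UNREGISTER_IO_BUF``
commands linked around it.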

Fallback Behavior
~~~~~~~~~~~~~~~~~

If auto buffer registration fails:

1. When ``UBLK_AUTO_BUF_REG_FALLBACK`` is enabled:

   - The uring_cmd is completed
   - ``UBLK_IO_F_NEED_REG_BUF`` is set in ``ublksrv_io_desc.op_flags``
   - The ublk server must handle the failure itself, for example by
     registering the buffer manually or by using the user-copy feature to
     retrieve the data for the ublk I/O (see the sketch after this list)

2. If fallback is not enabled:

   - The ublk I/O request fails silently
   - The uring_cmd won't be completed
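
The fallback case can be detected from the I/O descriptor. A minimal sketch,
assuming only that ``struct ublksrv_io_desc``, its ``op_flags`` field and
``UBLK_IO_F_NEED_REG_BUF`` are available from the ublk UAPI header
``<linux/ublk_cmd.h>`` as referenced above::

    #include <stdbool.h>
    #include <linux/ublk_cmd.h>   /* struct ublksrv_io_desc, UBLK_IO_F_NEED_REG_BUF */

    /*
     * Returns true when auto buffer registration failed for this I/O and the
     * server has to register the buffer itself (or fall back to user copy).
     */
    static bool need_manual_buf_handling(const struct ublksrv_io_desc *iod)
    {
            return iod->op_flags & UBLK_IO_F_NEED_REG_BUF;
    }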

Limitations
~~~~~~~~~~~

- Requires the same ``io_ring_ctx`` for all operations
- May require manual buffer management in fallback cases
- The io_ring_ctx buffer table has a maximum size of 16K entries, which may
  not be enough when too many ublk devices are handled by a single
  io_ring_ctx and each has a very large queue depth (for example, 32 devices
  with a queue depth of 512 already need 16,384 slots)

References
==========
@@ -998,20 +998,20 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
         if (!plug || rq_list_empty(&plug->mq_list))
                 return false;
 
-        rq_list_for_each(&plug->mq_list, rq) {
-                if (rq->q == q) {
-                        if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
-                            BIO_MERGE_OK)
-                                return true;
-                        break;
-                }
+        rq = plug->mq_list.tail;
+        if (rq->q == q)
+                return blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+                       BIO_MERGE_OK;
+        else if (!plug->multiple_queues)
+                return false;
 
-                /*
-                 * Only keep iterating plug list for merges if we have multiple
-                 * queues
-                 */
-                if (!plug->multiple_queues)
-                        break;
+        rq_list_for_each(&plug->mq_list, rq) {
+                if (rq->q != q)
+                        continue;
+                if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+                    BIO_MERGE_OK)
+                        return true;
+                break;
         }
         return false;
 }
@@ -1225,6 +1225,7 @@ void blk_zone_write_plug_bio_endio(struct bio *bio)
         if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
                 bio->bi_opf &= ~REQ_OP_MASK;
                 bio->bi_opf |= REQ_OP_ZONE_APPEND;
+                bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
         }
 
         /*
@@ -1306,7 +1307,6 @@ again:
         spin_unlock_irqrestore(&zwplug->lock, flags);
 
         bdev = bio->bi_bdev;
-        submit_bio_noacct_nocheck(bio);
 
         /*
          * blk-mq devices will reuse the extra reference on the request queue
@@ -1314,8 +1314,12 @@ again:
          * path for BIO-based devices will not do that. So drop this extra
          * reference here.
          */
-        if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO))
+        if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
+                bdev->bd_disk->fops->submit_bio(bio);
                 blk_queue_exit(bdev->bd_disk->queue);
+        } else {
+                blk_mq_submit_bio(bio);
+        }
 
 put_zwplug:
         /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
@@ -1248,12 +1248,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
         lo->lo_flags &= ~LOOP_SET_STATUS_CLEARABLE_FLAGS;
         lo->lo_flags |= (info->lo_flags & LOOP_SET_STATUS_SETTABLE_FLAGS);
 
-        if (size_changed) {
-                loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit,
-                                           lo->lo_backing_file);
-                loop_set_size(lo, new_size);
-        }
-
         /* update the direct I/O flag if lo_offset changed */
         loop_update_dio(lo);
 
@@ -1261,6 +1255,11 @@ out_unfreeze:
         blk_mq_unfreeze_queue(lo->lo_queue, memflags);
         if (partscan)
                 clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);
+        if (!err && size_changed) {
+                loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit,
+                                           lo->lo_backing_file);
+                loop_set_size(lo, new_size);
+        }
 out_unlock:
         mutex_unlock(&lo->lo_mutex);
         if (partscan)
@@ -429,21 +429,14 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
         pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
 
         /*
-         * For iopoll, complete it directly. Note that using the uring_cmd
-         * helper for this is safe only because we check blk_rq_is_poll().
-         * As that returns false if we're NOT on a polled queue, then it's
-         * safe to use the polled completion helper.
-         *
-         * Otherwise, move the completion to task work.
+         * IOPOLL could potentially complete this request directly, but
+         * if multiple rings are polling on the same queue, then it's possible
+         * for one ring to find completions for another ring. Punting the
+         * completion via task_work will always direct it to the right
+         * location, rather than potentially complete requests for ringA
+         * under iopoll invocations from ringB.
          */
-        if (blk_rq_is_poll(req)) {
-                if (pdu->bio)
-                        blk_rq_unmap_user(pdu->bio);
-                io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status);
-        } else {
-                io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
-        }
-
+        io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
         return RQ_END_IO_FREE;
 }
 
@@ -291,7 +291,7 @@ static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio,
 
         fi->folio = page_folio(bvec->bv_page);
         fi->offset = bvec->bv_offset +
-                        PAGE_SIZE * (bvec->bv_page - &fi->folio->page);
+                        PAGE_SIZE * folio_page_idx(fi->folio, bvec->bv_page);
         fi->_seg_count = bvec->bv_len;
         fi->length = min(folio_size(fi->folio) - fi->offset, fi->_seg_count);
         fi->_next = folio_next(fi->folio);
@@ -57,9 +57,12 @@ static inline void bvec_set_page(struct bio_vec *bv, struct page *page,
  * @offset: offset into the folio
  */
 static inline void bvec_set_folio(struct bio_vec *bv, struct folio *folio,
-                size_t len, size_t offset)
+                unsigned int len, unsigned int offset)
 {
-        bvec_set_page(bv, &folio->page, len, offset);
+        unsigned long nr = offset / PAGE_SIZE;
+
+        WARN_ON_ONCE(len > UINT_MAX);
+        bvec_set_page(bv, folio_page(folio, nr), len, offset % PAGE_SIZE);
 }
 
 /**