mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-04-13 09:59:31 +00:00

Use io_uring vectored fixed kernel buffer for handling stripe IO. Signed-off-by: Ming Lei <ming.lei@redhat.com> Link: https://lore.kernel.org/r/20250325135155.935398-5-ming.lei@redhat.com Signed-off-by: Jens Axboe <axboe@kernel.dk>
353 lines
8.9 KiB
C
353 lines
8.9 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include "kublk.h"
|
|
|
|
/* Size of the per-request stripe and SQE arrays; tgt init rejects more backing files than this. */
#define NR_STRIPE MAX_BACK_FILES
|
|
|
|
/* Per-device stripe geometry, allocated in tgt init and hung off dev->private_data. */
struct stripe_conf {
	unsigned int nr_files;	/* number of backing files striped over */
	unsigned int shift;	/* log2 of the chunk size in bytes */
};
|
|
|
|
struct stripe {
|
|
loff_t start;
|
|
unsigned nr_sects;
|
|
int seq;
|
|
|
|
struct iovec *vec;
|
|
unsigned nr_vec;
|
|
unsigned cap;
|
|
};
|
|
|
|
struct stripe_array {
|
|
struct stripe s[NR_STRIPE];
|
|
unsigned nr;
|
|
struct iovec _vec[];
|
|
};
|
|
|
|
static inline const struct stripe_conf *get_chunk_shift(const struct ublk_queue *q)
|
|
{
|
|
return (struct stripe_conf *)q->dev->private_data;
|
|
}
|
|
|
|
static inline unsigned calculate_nr_vec(const struct stripe_conf *conf,
|
|
const struct ublksrv_io_desc *iod)
|
|
{
|
|
const unsigned shift = conf->shift - 9;
|
|
const unsigned unit_sects = conf->nr_files << shift;
|
|
loff_t start = iod->start_sector;
|
|
loff_t end = start + iod->nr_sectors;
|
|
|
|
return (end / unit_sects) - (start / unit_sects) + 1;
|
|
}
|
|
|
|
static struct stripe_array *alloc_stripe_array(const struct stripe_conf *conf,
|
|
const struct ublksrv_io_desc *iod)
|
|
{
|
|
unsigned nr_vecs = calculate_nr_vec(conf, iod);
|
|
unsigned total = nr_vecs * conf->nr_files;
|
|
struct stripe_array *s;
|
|
int i;
|
|
|
|
s = malloc(sizeof(*s) + total * sizeof(struct iovec));
|
|
|
|
s->nr = 0;
|
|
for (i = 0; i < conf->nr_files; i++) {
|
|
struct stripe *t = &s->s[i];
|
|
|
|
t->nr_vec = 0;
|
|
t->vec = &s->_vec[i * nr_vecs];
|
|
t->nr_sects = 0;
|
|
t->cap = nr_vecs;
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
/*
 * Release an array obtained from alloc_stripe_array().  Header and iovecs
 * live in one allocation, so a single free() is enough.
 */
static void free_stripe_array(struct stripe_array *s)
{
	free(s);
}
|
|
|
|
static void calculate_stripe_array(const struct stripe_conf *conf,
|
|
const struct ublksrv_io_desc *iod, struct stripe_array *s)
|
|
{
|
|
const unsigned shift = conf->shift - 9;
|
|
const unsigned chunk_sects = 1 << shift;
|
|
const unsigned unit_sects = conf->nr_files << shift;
|
|
off64_t start = iod->start_sector;
|
|
off64_t end = start + iod->nr_sectors;
|
|
unsigned long done = 0;
|
|
unsigned idx = 0;
|
|
|
|
while (start < end) {
|
|
unsigned nr_sects = chunk_sects - (start & (chunk_sects - 1));
|
|
loff_t unit_off = (start / unit_sects) * unit_sects;
|
|
unsigned seq = (start - unit_off) >> shift;
|
|
struct stripe *this = &s->s[idx];
|
|
loff_t stripe_off = (unit_off / conf->nr_files) +
|
|
(start & (chunk_sects - 1));
|
|
|
|
if (nr_sects > end - start)
|
|
nr_sects = end - start;
|
|
if (this->nr_sects == 0) {
|
|
this->nr_sects = nr_sects;
|
|
this->start = stripe_off;
|
|
this->seq = seq;
|
|
s->nr += 1;
|
|
} else {
|
|
assert(seq == this->seq);
|
|
assert(this->start + this->nr_sects == stripe_off);
|
|
this->nr_sects += nr_sects;
|
|
}
|
|
|
|
assert(this->nr_vec < this->cap);
|
|
this->vec[this->nr_vec].iov_base = (void *)(iod->addr + done);
|
|
this->vec[this->nr_vec++].iov_len = nr_sects << 9;
|
|
|
|
start += nr_sects;
|
|
done += nr_sects << 9;
|
|
idx = (idx + 1) % conf->nr_files;
|
|
}
|
|
}
|
|
|
|
static inline enum io_uring_op stripe_to_uring_op(
|
|
const struct ublksrv_io_desc *iod, int zc)
|
|
{
|
|
unsigned ublk_op = ublksrv_get_op(iod);
|
|
|
|
if (ublk_op == UBLK_IO_OP_READ)
|
|
return zc ? IORING_OP_READV_FIXED : IORING_OP_READV;
|
|
else if (ublk_op == UBLK_IO_OP_WRITE)
|
|
return zc ? IORING_OP_WRITEV_FIXED : IORING_OP_WRITEV;
|
|
assert(0);
|
|
}
|
|
|
|
static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag)
|
|
{
|
|
const struct stripe_conf *conf = get_chunk_shift(q);
|
|
int zc = !!(ublk_queue_use_zc(q) != 0);
|
|
enum io_uring_op op = stripe_to_uring_op(iod, zc);
|
|
struct io_uring_sqe *sqe[NR_STRIPE];
|
|
struct stripe_array *s = alloc_stripe_array(conf, iod);
|
|
struct ublk_io *io = ublk_get_io(q, tag);
|
|
int i, extra = zc ? 2 : 0;
|
|
|
|
io->private_data = s;
|
|
calculate_stripe_array(conf, iod, s);
|
|
|
|
ublk_queue_alloc_sqes(q, sqe, s->nr + extra);
|
|
|
|
if (zc) {
|
|
io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag);
|
|
sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
|
|
sqe[0]->user_data = build_user_data(tag,
|
|
ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1);
|
|
}
|
|
|
|
for (i = zc; i < s->nr + extra - zc; i++) {
|
|
struct stripe *t = &s->s[i - zc];
|
|
|
|
io_uring_prep_rw(op, sqe[i],
|
|
t->seq + 1,
|
|
(void *)t->vec,
|
|
t->nr_vec,
|
|
t->start << 9);
|
|
if (zc) {
|
|
sqe[i]->buf_index = tag;
|
|
io_uring_sqe_set_flags(sqe[i],
|
|
IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK);
|
|
} else {
|
|
io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE);
|
|
}
|
|
/* bit63 marks us as tgt io */
|
|
sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i - zc, 1);
|
|
}
|
|
if (zc) {
|
|
struct io_uring_sqe *unreg = sqe[s->nr + 1];
|
|
|
|
io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, tag);
|
|
unreg->user_data = build_user_data(tag, ublk_cmd_op_nr(unreg->cmd_op), 0, 1);
|
|
}
|
|
|
|
/* register buffer is skip_success */
|
|
return s->nr + zc;
|
|
}
|
|
|
|
static int handle_flush(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag)
|
|
{
|
|
const struct stripe_conf *conf = get_chunk_shift(q);
|
|
struct io_uring_sqe *sqe[NR_STRIPE];
|
|
int i;
|
|
|
|
ublk_queue_alloc_sqes(q, sqe, conf->nr_files);
|
|
for (i = 0; i < conf->nr_files; i++) {
|
|
io_uring_prep_fsync(sqe[i], i + 1, IORING_FSYNC_DATASYNC);
|
|
io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE);
|
|
sqe[i]->user_data = build_user_data(tag, UBLK_IO_OP_FLUSH, 0, 1);
|
|
}
|
|
return conf->nr_files;
|
|
}
|
|
|
|
static int stripe_queue_tgt_io(struct ublk_queue *q, int tag)
|
|
{
|
|
const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
|
|
unsigned ublk_op = ublksrv_get_op(iod);
|
|
int ret = 0;
|
|
|
|
switch (ublk_op) {
|
|
case UBLK_IO_OP_FLUSH:
|
|
ret = handle_flush(q, iod, tag);
|
|
break;
|
|
case UBLK_IO_OP_WRITE_ZEROES:
|
|
case UBLK_IO_OP_DISCARD:
|
|
ret = -ENOTSUP;
|
|
break;
|
|
case UBLK_IO_OP_READ:
|
|
case UBLK_IO_OP_WRITE:
|
|
ret = stripe_queue_tgt_rw_io(q, iod, tag);
|
|
break;
|
|
default:
|
|
ret = -EINVAL;
|
|
break;
|
|
}
|
|
ublk_dbg(UBLK_DBG_IO, "%s: tag %d ublk io %x %llx %u ret %d\n", __func__, tag,
|
|
iod->op_flags, iod->start_sector, iod->nr_sectors << 9, ret);
|
|
return ret;
|
|
}
|
|
|
|
/*
 * .queue_io callback: queue the target IO for @tag and pass the outcome
 * (count of queued IOs, or a negative errno) to ublk_queued_tgt_io().
 * Always returns 0.
 */
static int ublk_stripe_queue_io(struct ublk_queue *q, int tag)
{
	int nr_queued;

	nr_queued = stripe_queue_tgt_io(q, tag);
	ublk_queued_tgt_io(q, tag, nr_queued);

	return 0;
}
|
|
|
|
static void ublk_stripe_io_done(struct ublk_queue *q, int tag,
|
|
const struct io_uring_cqe *cqe)
|
|
{
|
|
const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
|
|
unsigned op = user_data_to_op(cqe->user_data);
|
|
struct ublk_io *io = ublk_get_io(q, tag);
|
|
int res = cqe->res;
|
|
|
|
if (res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) {
|
|
if (!io->result)
|
|
io->result = res;
|
|
if (res < 0)
|
|
ublk_err("%s: io failure %d tag %u\n", __func__, res, tag);
|
|
}
|
|
|
|
/* buffer register op is IOSQE_CQE_SKIP_SUCCESS */
|
|
if (op == ublk_cmd_op_nr(UBLK_U_IO_REGISTER_IO_BUF))
|
|
io->tgt_ios += 1;
|
|
|
|
/* fail short READ/WRITE simply */
|
|
if (op == UBLK_IO_OP_READ || op == UBLK_IO_OP_WRITE) {
|
|
unsigned seq = user_data_to_tgt_data(cqe->user_data);
|
|
struct stripe_array *s = io->private_data;
|
|
|
|
if (res < s->s[seq].nr_sects << 9) {
|
|
io->result = -EIO;
|
|
ublk_err("%s: short rw op %u res %d exp %u tag %u\n",
|
|
__func__, op, res, s->s[seq].vec->iov_len, tag);
|
|
}
|
|
}
|
|
|
|
if (ublk_completed_tgt_io(q, tag)) {
|
|
int res = io->result;
|
|
|
|
if (!res)
|
|
res = iod->nr_sectors << 9;
|
|
|
|
ublk_complete_io(q, tag, res);
|
|
|
|
free_stripe_array(io->private_data);
|
|
io->private_data = NULL;
|
|
}
|
|
}
|
|
|
|
static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
|
|
{
|
|
struct ublk_params p = {
|
|
.types = UBLK_PARAM_TYPE_BASIC,
|
|
.basic = {
|
|
.attrs = UBLK_ATTR_VOLATILE_CACHE,
|
|
.logical_bs_shift = 9,
|
|
.physical_bs_shift = 12,
|
|
.io_opt_shift = 12,
|
|
.io_min_shift = 9,
|
|
.max_sectors = dev->dev_info.max_io_buf_bytes >> 9,
|
|
},
|
|
};
|
|
unsigned chunk_size = ctx->chunk_size;
|
|
struct stripe_conf *conf;
|
|
unsigned chunk_shift;
|
|
loff_t bytes = 0;
|
|
int ret, i, mul = 1;
|
|
|
|
if ((chunk_size & (chunk_size - 1)) || !chunk_size) {
|
|
ublk_err("invalid chunk size %u\n", chunk_size);
|
|
return -EINVAL;
|
|
}
|
|
|
|
if (chunk_size < 4096 || chunk_size > 512 * 1024) {
|
|
ublk_err("invalid chunk size %u\n", chunk_size);
|
|
return -EINVAL;
|
|
}
|
|
|
|
chunk_shift = ilog2(chunk_size);
|
|
|
|
ret = backing_file_tgt_init(dev);
|
|
if (ret)
|
|
return ret;
|
|
|
|
if (!dev->tgt.nr_backing_files || dev->tgt.nr_backing_files > NR_STRIPE)
|
|
return -EINVAL;
|
|
|
|
assert(dev->nr_fds == dev->tgt.nr_backing_files + 1);
|
|
|
|
for (i = 0; i < dev->tgt.nr_backing_files; i++)
|
|
dev->tgt.backing_file_size[i] &= ~((1 << chunk_shift) - 1);
|
|
|
|
for (i = 0; i < dev->tgt.nr_backing_files; i++) {
|
|
unsigned long size = dev->tgt.backing_file_size[i];
|
|
|
|
if (size != dev->tgt.backing_file_size[0])
|
|
return -EINVAL;
|
|
bytes += size;
|
|
}
|
|
|
|
conf = malloc(sizeof(*conf));
|
|
conf->shift = chunk_shift;
|
|
conf->nr_files = dev->tgt.nr_backing_files;
|
|
|
|
dev->private_data = conf;
|
|
dev->tgt.dev_size = bytes;
|
|
p.basic.dev_sectors = bytes >> 9;
|
|
dev->tgt.params = p;
|
|
|
|
if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY)
|
|
mul = 2;
|
|
dev->tgt.sq_depth = mul * dev->dev_info.queue_depth * conf->nr_files;
|
|
dev->tgt.cq_depth = mul * dev->dev_info.queue_depth * conf->nr_files;
|
|
|
|
printf("%s: shift %u files %u\n", __func__, conf->shift, conf->nr_files);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void ublk_stripe_tgt_deinit(struct ublk_dev *dev)
|
|
{
|
|
free(dev->private_data);
|
|
backing_file_tgt_deinit(dev);
|
|
}
|
|
|
|
const struct ublk_tgt_ops stripe_tgt_ops = {
|
|
.name = "stripe",
|
|
.init_tgt = ublk_stripe_tgt_init,
|
|
.deinit_tgt = ublk_stripe_tgt_deinit,
|
|
.queue_io = ublk_stripe_queue_io,
|
|
.tgt_io_done = ublk_stripe_io_done,
|
|
};
|