mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00

syzkaller reported recursion with a loop of three calls (netfs_rreq_assess,
netfs_retry_reads and netfs_rreq_terminated) hitting the limit of the stack
during an unbuffered or direct I/O read.
There are a number of issues:
(1) There is no limit on the number of retries.
(2) A subrequest is supposed to be abandoned if it does not transfer
anything (NETFS_SREQ_NO_PROGRESS), but that isn't checked under all
circumstances.
(3) The actual root cause, which is this:
if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_terminated(rreq, ...);
When we do a retry, we bump the rreq->nr_outstanding counter to
prevent the final cleanup phase running before we've finished
dispatching the retries. The problem is if we hit 0, we have to do
the cleanup phase - but we're in the cleanup phase and end up
repeating the retry cycle, hence the recursion.
Work around the problem by limiting the number of retries. This is based
on Lizhi Xu's patch[1], and makes the following changes:
(1) Replace NETFS_SREQ_NO_PROGRESS with NETFS_SREQ_MADE_PROGRESS and make
the filesystem set it if it managed to read or write at least one byte
of data. Clear this bit before issuing a subrequest.
(2) Add a ->retry_count member to the subrequest and increment it any time
we do a retry.
(3) Remove the NETFS_SREQ_RETRYING flag as it is superfluous with
->retry_count. If the latter is non-zero, we're doing a retry.
(4) Abandon a subrequest if retry_count is non-zero and we made no
progress.
(5) Use ->retry_count in both the write-side and the read-side.
[?] Question: Should I set a hard limit on retry_count in both read and
write? Say it hits 50, we always abandon it. The problem is that
these changes only mitigate the issue. As long as it made at least one
byte of progress, the recursion is still an issue. This patch
mitigates the problem, but does not fix the underlying cause. I have
patches that will do that, but it's an intrusive fix that's currently
pending for the next merge window.
The oops generated by KASAN looks something like:
BUG: TASK stack guard page was hit at ffffc9000482ff48 (stack is ffffc90004830000..ffffc90004838000)
Oops: stack guard page: 0000 [#1] PREEMPT SMP KASAN NOPTI
...
RIP: 0010:mark_lock+0x25/0xc60 kernel/locking/lockdep.c:4686
...
mark_usage kernel/locking/lockdep.c:4646 [inline]
__lock_acquire+0x906/0x3ce0 kernel/locking/lockdep.c:5156
lock_acquire.part.0+0x11b/0x380 kernel/locking/lockdep.c:5825
local_lock_acquire include/linux/local_lock_internal.h:29 [inline]
___slab_alloc+0x123/0x1880 mm/slub.c:3695
__slab_alloc.constprop.0+0x56/0xb0 mm/slub.c:3908
__slab_alloc_node mm/slub.c:3961 [inline]
slab_alloc_node mm/slub.c:4122 [inline]
kmem_cache_alloc_noprof+0x2a7/0x2f0 mm/slub.c:4141
radix_tree_node_alloc.constprop.0+0x1e8/0x350 lib/radix-tree.c:253
idr_get_free+0x528/0xa40 lib/radix-tree.c:1506
idr_alloc_u32+0x191/0x2f0 lib/idr.c:46
idr_alloc+0xc1/0x130 lib/idr.c:87
p9_tag_alloc+0x394/0x870 net/9p/client.c:321
p9_client_prepare_req+0x19f/0x4d0 net/9p/client.c:644
p9_client_zc_rpc.constprop.0+0x105/0x880 net/9p/client.c:793
p9_client_read_once+0x443/0x820 net/9p/client.c:1570
p9_client_read+0x13f/0x1b0 net/9p/client.c:1534
v9fs_issue_read+0x115/0x310 fs/9p/vfs_addr.c:74
netfs_retry_read_subrequests fs/netfs/read_retry.c:60 [inline]
netfs_retry_reads+0x153a/0x1d00 fs/netfs/read_retry.c:232
netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371
netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407
netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235
netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371
netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407
netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235
netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371
...
netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407
netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235
netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371
netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407
netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235
netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371
netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407
netfs_dispatch_unbuffered_reads fs/netfs/direct_read.c:103 [inline]
netfs_unbuffered_read fs/netfs/direct_read.c:127 [inline]
netfs_unbuffered_read_iter_locked+0x12f6/0x19b0 fs/netfs/direct_read.c:221
netfs_unbuffered_read_iter+0xc5/0x100 fs/netfs/direct_read.c:256
v9fs_file_read_iter+0xbf/0x100 fs/9p/vfs_file.c:361
do_iter_readv_writev+0x614/0x7f0 fs/read_write.c:832
vfs_readv+0x4cf/0x890 fs/read_write.c:1025
do_preadv fs/read_write.c:1142 [inline]
__do_sys_preadv fs/read_write.c:1192 [inline]
__se_sys_preadv fs/read_write.c:1187 [inline]
__x64_sys_preadv+0x22d/0x310 fs/read_write.c:1187
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83
Fixes: ee4cdf7ba8 ("netfs: Speed up buffered reading")
Closes: https://syzkaller.appspot.com/bug?extid=1fc6f64c40a9d143cfb6
Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20241108034020.3695718-1-lizhi.xu@windriver.com/ [1]
Link: https://lore.kernel.org/r/20241213135013.2964079-9-dhowells@redhat.com
Tested-by: syzbot+885c03ad650731743489@syzkaller.appspotmail.com
Suggested-by: Lizhi Xu <lizhi.xu@windriver.com>
cc: Dominique Martinet <asmadeus@codewreck.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: v9fs@lists.linux.dev
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Reported-by: syzbot+885c03ad650731743489@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
296 lines
8 KiB
C
296 lines
8 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
/* handling of writes to regular files and writing back to the server
|
|
*
|
|
* Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
|
|
* Written by David Howells (dhowells@redhat.com)
|
|
*/
|
|
|
|
#include <linux/backing-dev.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/writeback.h>
|
|
#include <linux/pagevec.h>
|
|
#include <linux/netfs.h>
|
|
#include <trace/events/netfs.h>
|
|
#include "internal.h"
|
|
|
|
/*
 * Completion of a write to the server.  Once the data is safely stored we can
 * prune any writeback keys that are no longer needed.
 */
static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len)
{
	_enter("{%llx:%llu},{%x @%llx}",
	       vnode->fid.vid, vnode->fid.vnode, len, start);

	/* start/len only feed the tracepoint above; the real work here is
	 * discarding cached writeback keys that have fallen out of use. */
	afs_prune_wb_keys(vnode);
	_leave("");
}
|
|
|
|
/*
 * Find a key to use for the writeback.  We cached the keys used to author the
 * writes on the vnode.  wreq->netfs_priv2 will contain the last writeback key
 * record used or NULL and we need to start from there if it's set.
 * wreq->netfs_priv will be set to the key itself or NULL.
 */
static void afs_get_writeback_key(struct netfs_io_request *wreq)
{
	struct afs_wb_key *wbk, *old = wreq->netfs_priv2;
	struct afs_vnode *vnode = AFS_FS_I(wreq->inode);

	/* Drop the previously selected key; both fields are replaced below if
	 * a usable successor is found on the list. */
	key_put(wreq->netfs_priv);
	wreq->netfs_priv = NULL;
	wreq->netfs_priv2 = NULL;

	spin_lock(&vnode->wb_lock);
	/* Resume the scan just after the last record tried, or start from the
	 * head of the list on the first call (old == NULL). */
	if (old)
		wbk = list_next_entry(old, vnode_link);
	else
		wbk = list_first_entry(&vnode->wb_keys, struct afs_wb_key, vnode_link);

	list_for_each_entry_from(wbk, &vnode->wb_keys, vnode_link) {
		_debug("wbk %u", key_serial(wbk->key));
		/* Skip keys that fail validation (e.g. revoked/expired). */
		if (key_validate(wbk->key) == 0) {
			/* Pin both the wb-key record and the key itself for
			 * the duration of the request. */
			refcount_inc(&wbk->usage);
			wreq->netfs_priv = key_get(wbk->key);
			wreq->netfs_priv2 = wbk;
			_debug("USE WB KEY %u", key_serial(wbk->key));
			break;
		}
	}

	spin_unlock(&vnode->wb_lock);

	/* Release the reference held on the previously used record. */
	afs_put_wb_key(old);
}
|
|
|
|
/*
 * Success handler for a StoreData operation: commit the returned file status
 * and, if no error occurred, account the written-back range and update the
 * store statistics.
 */
static void afs_store_data_success(struct afs_operation *op)
{
	struct afs_vnode *vnode = op->file[0].vnode;

	/* Propagate the client-side mtime returned in the status block. */
	op->ctime = op->file[0].scb.status.mtime_client;
	afs_vnode_commit_status(op, &op->file[0]);
	if (!afs_op_error(op)) {
		afs_pages_written_back(vnode, op->store.pos, op->store.size);
		afs_stat_v(vnode, n_stores);
		atomic_long_add(op->store.size, &afs_v2net(vnode)->n_store_bytes);
	}
}
|
|
|
|
/* Operation table for storing data back to the server, covering both the AFS
 * and YFS RPC variants. */
static const struct afs_operation_ops afs_store_data_operation = {
	.issue_afs_rpc = afs_fs_store_data,
	.issue_yfs_rpc = yfs_fs_store_data,
	.success = afs_store_data_success,
};
|
|
|
|
/*
|
|
* Prepare a subrequest to write to the server. This sets the max_len
|
|
* parameter.
|
|
*/
|
|
void afs_prepare_write(struct netfs_io_subrequest *subreq)
|
|
{
|
|
struct netfs_io_stream *stream = &subreq->rreq->io_streams[subreq->stream_nr];
|
|
|
|
//if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags))
|
|
// subreq->max_len = 512 * 1024;
|
|
//else
|
|
stream->sreq_max_len = 256 * 1024 * 1024;
|
|
}
|
|
|
|
/*
 * Issue a subrequest to write to the server, running in workqueue context.
 * Builds and dispatches a StoreData operation for the as-yet untransferred
 * portion of the subrequest, then reports the outcome to netfslib.
 */
static void afs_issue_write_worker(struct work_struct *work)
{
	struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work);
	struct netfs_io_request *wreq = subreq->rreq;
	struct afs_operation *op;
	struct afs_vnode *vnode = AFS_FS_I(wreq->inode);
	/* Resume from however much has already been transferred (relevant on
	 * retry after partial progress). */
	unsigned long long pos = subreq->start + subreq->transferred;
	size_t len = subreq->len - subreq->transferred;
	int ret = -ENOKEY;

	_enter("R=%x[%x],%s{%llx:%llu.%u},%llx,%zx",
	       wreq->debug_id, subreq->debug_index,
	       vnode->volume->name,
	       vnode->fid.vid,
	       vnode->fid.vnode,
	       vnode->fid.unique,
	       pos, len);

#if 0 // Error injection
	if (subreq->debug_index == 3)
		return netfs_write_subrequest_terminated(subreq, -ENOANO, false);

	/* Force a retry the first time through to exercise the retry path. */
	if (!subreq->retry_count) {
		set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
		return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
	}
#endif

	/* Author the store with the writeback key selected on the request;
	 * allocation failure is reported as -EAGAIN so netfslib may retry. */
	op = afs_alloc_operation(wreq->netfs_priv, vnode->volume);
	if (IS_ERR(op))
		return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);

	afs_op_set_vnode(op, 0, vnode);
	op->file[0].dv_delta = 1;
	op->file[0].modification = true;
	op->store.pos = pos;
	op->store.size = len;
	/* Stores must not be interrupted once started. */
	op->flags |= AFS_OPERATION_UNINTR;
	op->ops = &afs_store_data_operation;

	afs_begin_vnode_operation(op);

	op->store.write_iter = &subreq->io_iter;
	/* The file may be extended by this write. */
	op->store.i_size = umax(pos + len, vnode->netfs.remote_i_size);
	op->mtime = inode_get_mtime(&vnode->netfs.inode);

	afs_wait_for_operation(op);
	ret = afs_put_operation(op);
	switch (ret) {
	case 0:
		/* Mark that at least one byte was written so a retry isn't
		 * abandoned as having made no progress. */
		__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
		break;
	case -EACCES:
	case -EPERM:
	case -ENOKEY:
	case -EKEYEXPIRED:
	case -EKEYREJECTED:
	case -EKEYREVOKED:
		/* If there are more keys we can try, use the retry algorithm
		 * to rotate the keys.
		 */
		if (wreq->netfs_priv2)
			set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
		break;
	}

	netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, false);
}
|
|
|
|
void afs_issue_write(struct netfs_io_subrequest *subreq)
|
|
{
|
|
subreq->work.func = afs_issue_write_worker;
|
|
if (!queue_work(system_unbound_wq, &subreq->work))
|
|
WARN_ON_ONCE(1);
|
|
}
|
|
|
|
/*
 * Writeback calls this when it finds a folio that needs uploading.  This isn't
 * called if writeback only has copy-to-cache to deal with.
 */
void afs_begin_writeback(struct netfs_io_request *wreq)
{
	/* Select the first usable authoring key; if none is found the stores
	 * will fail with key errors and key rotation will be attempted. */
	afs_get_writeback_key(wreq);
	/* Enable upload on stream 0. */
	wreq->io_streams[0].avail = true;
}
|
|
|
|
/*
 * Prepare to retry the writes in request.  Use this to try rotating the
 * available writeback keys.
 */
void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream)
{
	struct netfs_io_subrequest *subreq =
		list_first_entry(&stream->subrequests,
				 struct netfs_io_subrequest, rreq_link);

	/* Only permission/key errors warrant rotating to the next cached
	 * writeback key; other errors are left for netfslib to handle. */
	switch (subreq->error) {
	case -EACCES:
	case -EPERM:
	case -ENOKEY:
	case -EKEYEXPIRED:
	case -EKEYREJECTED:
	case -EKEYREVOKED:
		afs_get_writeback_key(wreq);
		/* No further usable key: give up on this stream. */
		if (!wreq->netfs_priv)
			stream->failed = true;
		break;
	}
}
|
|
|
|
/*
|
|
* write some of the pending data back to the server
|
|
*/
|
|
int afs_writepages(struct address_space *mapping, struct writeback_control *wbc)
|
|
{
|
|
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
|
|
int ret;
|
|
|
|
/* We have to be careful as we can end up racing with setattr()
|
|
* truncating the pagecache since the caller doesn't take a lock here
|
|
* to prevent it.
|
|
*/
|
|
if (wbc->sync_mode == WB_SYNC_ALL)
|
|
down_read(&vnode->validate_lock);
|
|
else if (!down_read_trylock(&vnode->validate_lock))
|
|
return 0;
|
|
|
|
ret = netfs_writepages(mapping, wbc);
|
|
up_read(&vnode->validate_lock);
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* flush any dirty pages for this process, and check for write errors.
|
|
* - the return status from this call provides a reliable indication of
|
|
* whether any write errors occurred for this process.
|
|
*/
|
|
int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
|
|
{
|
|
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
|
|
struct afs_file *af = file->private_data;
|
|
int ret;
|
|
|
|
_enter("{%llx:%llu},{n=%pD},%d",
|
|
vnode->fid.vid, vnode->fid.vnode, file,
|
|
datasync);
|
|
|
|
ret = afs_validate(vnode, af->key);
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
return file_write_and_wait_range(file, start, end);
|
|
}
|
|
|
|
/*
|
|
* notification that a previously read-only page is about to become writable
|
|
* - if it returns an error, the caller will deliver a bus error signal
|
|
*/
|
|
vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
|
|
{
|
|
struct file *file = vmf->vma->vm_file;
|
|
|
|
if (afs_validate(AFS_FS_I(file_inode(file)), afs_file_key(file)) < 0)
|
|
return VM_FAULT_SIGBUS;
|
|
return netfs_page_mkwrite(vmf, NULL);
|
|
}
|
|
|
|
/*
 * Prune the keys cached for writeback.  The caller must hold vnode->wb_lock.
 *
 * NOTE(review): the lock is actually taken here, not by the caller — the
 * header comment above appears stale; confirm against callers.
 */
void afs_prune_wb_keys(struct afs_vnode *vnode)
{
	LIST_HEAD(graveyard);
	struct afs_wb_key *wbk, *tmp;

	/* Discard unused keys */
	spin_lock(&vnode->wb_lock);

	/* Only prune when nothing is dirty or under writeback, since pending
	 * writes may still need one of the cached keys. */
	if (!mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_WRITEBACK) &&
	    !mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_DIRTY)) {
		list_for_each_entry_safe(wbk, tmp, &vnode->wb_keys, vnode_link) {
			/* usage == 1 means only the list holds a reference. */
			if (refcount_read(&wbk->usage) == 1)
				list_move(&wbk->vnode_link, &graveyard);
		}
	}

	spin_unlock(&vnode->wb_lock);

	/* Dispose of the culled keys outside the lock. */
	while (!list_empty(&graveyard)) {
		wbk = list_entry(graveyard.next, struct afs_wb_key, vnode_link);
		list_del(&wbk->vnode_link);
		afs_put_wb_key(wbk);
	}
}
|