for-6.16-rc4-tag

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmhmwFQACgkQxWXV+ddt
 WDsVMA/+NuSth71V0AfiDnFyqjgDMqIlZL2+dqBiTYHXQQHKbqiUlKvYkWICCT6T
 1YgDV+95XJYy4TDBoA49Ndd/l+CiDcMLbOYeneIfbJy13ts84jVANPkl4n03gPkF
 ktibCw15h0MENVctTCPc71dX2X0cV9WPf4iDmoxUZiukDA376akGTArZKwH4tVVg
 4qVpzUtDdNOf848D+8DZKGd+ot/RWgEdLkFCZES27BMg/OFemxBK1MU6K8VjxiKF
 VoaSVJRDXuug8oVBAGNl86XpiSgd4gHyoNNA5b4mhdSWMSBMxUAaILsONT9pNQZA
 CFyHA1Jp2gLOIzQIzeXwWgXaAOQDtco8YWYaXhf0v0mySs89tweXjOibfj2mU9pS
 wPaJyeD+nyRDMwPa4VWEws64D3vXX6aKwiThUENuDmxBvrRXjrkGYH9tf0LNzDDe
 OKv/vOCfeyutxbjKhP+qElMhdh73BZnJ4UCxxYRRDq2v1Mg+k06swl+6uL6xenme
 a2KLJlwEoG6LAlkpZzV66ZEaIHDyGBZNdVYtuA/G3dDtmlt0aLXDdp1eq7NivS1j
 aV7cd0JMX89lAUtqKT932ZOw8RoDrUPPjsnXzCaZJ69mMVyEkxyCV+iYHTTJPDga
 W5Vg8Tq3d1gwxMebZHvyI6wwUhmGA0wUFG2eohYY/tcSrrUlrHQ=
 =Ke0p
 -----END PGP SIGNATURE-----

Merge tag 'for-6.16-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - tree-log fixes:
    - fixes of log tracking of directories and subvolumes
    - fix iteration and error handling of inode references
      during log replay

 - fix free space tree rebuild (reported by syzbot)

* tag 'for-6.16-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: use btrfs_record_snapshot_destroy() during rmdir
  btrfs: propagate last_unlink_trans earlier when doing a rmdir
  btrfs: record new subvolume in parent dir earlier to avoid dir logging races
  btrfs: fix inode lookup error handling during log replay
  btrfs: fix iteration of extrefs during log replay
  btrfs: fix missing error handling when searching for inode refs during log replay
  btrfs: fix failure to rebuild free space tree using multiple transactions
This commit is contained in:
Linus Torvalds 2025-07-03 13:29:56 -07:00
commit 4c06e63b92
5 changed files with 131 additions and 88 deletions

View file

@ -83,6 +83,8 @@ enum btrfs_block_group_flags {
BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
/* Does the block group need to be added to the free space tree? */ /* Does the block group need to be added to the free space tree? */
BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
/* Set after we add a new block group to the free space tree. */
BLOCK_GROUP_FLAG_FREE_SPACE_ADDED,
/* Indicate that the block group is placed on a sequential zone */ /* Indicate that the block group is placed on a sequential zone */
BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
/* /*

View file

@ -1241,6 +1241,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
{ {
BTRFS_PATH_AUTO_FREE(path); BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key; struct btrfs_key key;
struct rb_node *node;
int nr; int nr;
int ret; int ret;
@ -1269,6 +1270,16 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
btrfs_release_path(path); btrfs_release_path(path);
} }
node = rb_first_cached(&trans->fs_info->block_group_cache_tree);
while (node) {
struct btrfs_block_group *bg;
bg = rb_entry(node, struct btrfs_block_group, cache_node);
clear_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &bg->runtime_flags);
node = rb_next(node);
cond_resched();
}
return 0; return 0;
} }
@ -1358,12 +1369,18 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
block_group = rb_entry(node, struct btrfs_block_group, block_group = rb_entry(node, struct btrfs_block_group,
cache_node); cache_node);
if (test_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED,
&block_group->runtime_flags))
goto next;
ret = populate_free_space_tree(trans, block_group); ret = populate_free_space_tree(trans, block_group);
if (ret) { if (ret) {
btrfs_abort_transaction(trans, ret); btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans); btrfs_end_transaction(trans);
return ret; return ret;
} }
next:
if (btrfs_should_end_transaction(trans)) { if (btrfs_should_end_transaction(trans)) {
btrfs_end_transaction(trans); btrfs_end_transaction(trans);
trans = btrfs_start_transaction(free_space_root, 1); trans = btrfs_start_transaction(free_space_root, 1);
@ -1390,6 +1407,29 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags); clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags);
/*
* While rebuilding the free space tree we may allocate new metadata
* block groups while modifying the free space tree.
*
* Because during the rebuild (at btrfs_rebuild_free_space_tree()) we
* can use multiple transactions, every time btrfs_end_transaction() is
* called at btrfs_rebuild_free_space_tree() we finish the creation of
* new block groups by calling btrfs_create_pending_block_groups(), and
* that in turn calls us, through add_block_group_free_space(), to add
* a free space info item and a free space extent item for the block
* group.
*
* Then later btrfs_rebuild_free_space_tree() may find such new block
* groups and process them with populate_free_space_tree(), which can
* fail with EEXIST since there are already items for the block group in
* the free space tree. Notice that we say "may find" because a new
* block group may be added to the block groups rbtree in a node before
* or after the block group currently being processed by the rebuild
* process. So signal the rebuild process to skip such new block groups
* if it finds them.
*/
set_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &block_group->runtime_flags);
ret = add_new_free_space_info(trans, block_group, path); ret = add_new_free_space_info(trans, block_group, path);
if (ret) if (ret)
return ret; return ret;

View file

@ -4710,7 +4710,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
int ret = 0; int ret = 0;
struct btrfs_trans_handle *trans; struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
struct fscrypt_name fname; struct fscrypt_name fname;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
@ -4736,6 +4735,23 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
goto out_notrans; goto out_notrans;
} }
/*
* Propagate the last_unlink_trans value of the deleted dir to its
* parent directory. This is to prevent an unrecoverable log tree in the
* case we do something like this:
* 1) create dir foo
* 2) create snapshot under dir foo
* 3) delete the snapshot
* 4) rmdir foo
* 5) mkdir foo
* 6) fsync foo or some file inside foo
*
* This is because we can't unlink other roots when replaying the dir
* deletes for directory foo.
*/
if (BTRFS_I(inode)->last_unlink_trans >= trans->transid)
btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
goto out; goto out;
@ -4745,27 +4761,11 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (ret) if (ret)
goto out; goto out;
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
/* now the directory is empty */ /* now the directory is empty */
ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
&fname.disk_name); &fname.disk_name);
if (!ret) { if (!ret)
btrfs_i_size_write(BTRFS_I(inode), 0); btrfs_i_size_write(BTRFS_I(inode), 0);
/*
* Propagate the last_unlink_trans value of the deleted dir to
* its parent directory. This is to prevent an unrecoverable
* log tree in the case we do something like this:
* 1) create dir foo
* 2) create snapshot under dir foo
* 3) delete the snapshot
* 4) rmdir foo
* 5) mkdir foo
* 6) fsync foo or some file inside foo
*/
if (last_unlink_trans >= trans->transid)
BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
}
out: out:
btrfs_end_transaction(trans); btrfs_end_transaction(trans);
out_notrans: out_notrans:

View file

@ -666,14 +666,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
goto out; goto out;
} }
btrfs_record_new_subvolume(trans, BTRFS_I(dir));
ret = btrfs_create_new_inode(trans, &new_inode_args); ret = btrfs_create_new_inode(trans, &new_inode_args);
if (ret) { if (ret) {
btrfs_abort_transaction(trans, ret); btrfs_abort_transaction(trans, ret);
goto out; goto out;
} }
btrfs_record_new_subvolume(trans, BTRFS_I(dir));
d_instantiate_new(dentry, new_inode_args.inode); d_instantiate_new(dentry, new_inode_args.inode);
new_inode_args.inode = NULL; new_inode_args.inode = NULL;

View file

@ -143,6 +143,9 @@ static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *r
unsigned int nofs_flag; unsigned int nofs_flag;
struct btrfs_inode *inode; struct btrfs_inode *inode;
/* Only meant to be called for subvolume roots and not for log roots. */
ASSERT(is_fstree(btrfs_root_id(root)));
/* /*
* We're holding a transaction handle whether we are logging or * We're holding a transaction handle whether we are logging or
* replaying a log tree, so we must make sure NOFS semantics apply * replaying a log tree, so we must make sure NOFS semantics apply
@ -604,21 +607,6 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
return 0; return 0;
} }
/*
* simple helper to read an inode off the disk from a given root
* This can only be called for subvolume roots and not for the log
*/
static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root,
u64 objectid)
{
struct btrfs_inode *inode;
inode = btrfs_iget_logging(objectid, root);
if (IS_ERR(inode))
return NULL;
return inode;
}
/* replays a single extent in 'eb' at 'slot' with 'key' into the /* replays a single extent in 'eb' at 'slot' with 'key' into the
* subvolume 'root'. path is released on entry and should be released * subvolume 'root'. path is released on entry and should be released
* on exit. * on exit.
@ -674,9 +662,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
return -EUCLEAN; return -EUCLEAN;
} }
inode = read_one_inode(root, key->objectid); inode = btrfs_iget_logging(key->objectid, root);
if (!inode) if (IS_ERR(inode))
return -EIO; return PTR_ERR(inode);
/* /*
* first check to see if we already have this extent in the * first check to see if we already have this extent in the
@ -948,9 +936,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
btrfs_release_path(path); btrfs_release_path(path);
inode = read_one_inode(root, location.objectid); inode = btrfs_iget_logging(location.objectid, root);
if (!inode) { if (IS_ERR(inode)) {
ret = -EIO; ret = PTR_ERR(inode);
inode = NULL;
goto out; goto out;
} }
@ -1073,7 +1062,9 @@ again:
search_key.type = BTRFS_INODE_REF_KEY; search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = parent_objectid; search_key.offset = parent_objectid;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret == 0) { if (ret < 0) {
return ret;
} else if (ret == 0) {
struct btrfs_inode_ref *victim_ref; struct btrfs_inode_ref *victim_ref;
unsigned long ptr; unsigned long ptr;
unsigned long ptr_end; unsigned long ptr_end;
@ -1146,13 +1137,13 @@ again:
struct fscrypt_str victim_name; struct fscrypt_str victim_name;
extref = (struct btrfs_inode_extref *)(base + cur_offset); extref = (struct btrfs_inode_extref *)(base + cur_offset);
victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
goto next; goto next;
ret = read_alloc_one_name(leaf, &extref->name, ret = read_alloc_one_name(leaf, &extref->name,
btrfs_inode_extref_name_len(leaf, extref), victim_name.len, &victim_name);
&victim_name);
if (ret) if (ret)
return ret; return ret;
@ -1167,10 +1158,10 @@ again:
kfree(victim_name.name); kfree(victim_name.name);
return ret; return ret;
} else if (!ret) { } else if (!ret) {
ret = -ENOENT; victim_parent = btrfs_iget_logging(parent_objectid, root);
victim_parent = read_one_inode(root, if (IS_ERR(victim_parent)) {
parent_objectid); ret = PTR_ERR(victim_parent);
if (victim_parent) { } else {
inc_nlink(&inode->vfs_inode); inc_nlink(&inode->vfs_inode);
btrfs_release_path(path); btrfs_release_path(path);
@ -1315,9 +1306,9 @@ again:
struct btrfs_inode *dir; struct btrfs_inode *dir;
btrfs_release_path(path); btrfs_release_path(path);
dir = read_one_inode(root, parent_id); dir = btrfs_iget_logging(parent_id, root);
if (!dir) { if (IS_ERR(dir)) {
ret = -ENOENT; ret = PTR_ERR(dir);
kfree(name.name); kfree(name.name);
goto out; goto out;
} }
@ -1389,15 +1380,17 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* copy the back ref in. The link count fixup code will take * copy the back ref in. The link count fixup code will take
* care of the rest * care of the rest
*/ */
dir = read_one_inode(root, parent_objectid); dir = btrfs_iget_logging(parent_objectid, root);
if (!dir) { if (IS_ERR(dir)) {
ret = -ENOENT; ret = PTR_ERR(dir);
dir = NULL;
goto out; goto out;
} }
inode = read_one_inode(root, inode_objectid); inode = btrfs_iget_logging(inode_objectid, root);
if (!inode) { if (IS_ERR(inode)) {
ret = -EIO; ret = PTR_ERR(inode);
inode = NULL;
goto out; goto out;
} }
@ -1409,11 +1402,13 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* parent object can change from one array * parent object can change from one array
* item to another. * item to another.
*/ */
if (!dir)
dir = read_one_inode(root, parent_objectid);
if (!dir) { if (!dir) {
ret = -ENOENT; dir = btrfs_iget_logging(parent_objectid, root);
goto out; if (IS_ERR(dir)) {
ret = PTR_ERR(dir);
dir = NULL;
goto out;
}
} }
} else { } else {
ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
@ -1682,9 +1677,9 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break; break;
btrfs_release_path(path); btrfs_release_path(path);
inode = read_one_inode(root, key.offset); inode = btrfs_iget_logging(key.offset, root);
if (!inode) { if (IS_ERR(inode)) {
ret = -EIO; ret = PTR_ERR(inode);
break; break;
} }
@ -1720,9 +1715,9 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode; struct btrfs_inode *inode;
struct inode *vfs_inode; struct inode *vfs_inode;
inode = read_one_inode(root, objectid); inode = btrfs_iget_logging(objectid, root);
if (!inode) if (IS_ERR(inode))
return -EIO; return PTR_ERR(inode);
vfs_inode = &inode->vfs_inode; vfs_inode = &inode->vfs_inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
@ -1761,14 +1756,14 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir; struct btrfs_inode *dir;
int ret; int ret;
inode = read_one_inode(root, location->objectid); inode = btrfs_iget_logging(location->objectid, root);
if (!inode) if (IS_ERR(inode))
return -ENOENT; return PTR_ERR(inode);
dir = read_one_inode(root, dirid); dir = btrfs_iget_logging(dirid, root);
if (!dir) { if (IS_ERR(dir)) {
iput(&inode->vfs_inode); iput(&inode->vfs_inode);
return -EIO; return PTR_ERR(dir);
} }
ret = btrfs_add_link(trans, dir, inode, name, 1, index); ret = btrfs_add_link(trans, dir, inode, name, 1, index);
@ -1845,9 +1840,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
bool update_size = true; bool update_size = true;
bool name_added = false; bool name_added = false;
dir = read_one_inode(root, key->objectid); dir = btrfs_iget_logging(key->objectid, root);
if (!dir) if (IS_ERR(dir))
return -EIO; return PTR_ERR(dir);
ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
if (ret) if (ret)
@ -2147,9 +2142,10 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_dir_item_key_to_cpu(eb, di, &location);
btrfs_release_path(path); btrfs_release_path(path);
btrfs_release_path(log_path); btrfs_release_path(log_path);
inode = read_one_inode(root, location.objectid); inode = btrfs_iget_logging(location.objectid, root);
if (!inode) { if (IS_ERR(inode)) {
ret = -EIO; ret = PTR_ERR(inode);
inode = NULL;
goto out; goto out;
} }
@ -2301,14 +2297,17 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
if (!log_path) if (!log_path)
return -ENOMEM; return -ENOMEM;
dir = read_one_inode(root, dirid); dir = btrfs_iget_logging(dirid, root);
/* it isn't an error if the inode isn't there, that can happen /*
* because we replay the deletes before we copy in the inode item * It isn't an error if the inode isn't there, that can happen because
* from the log * we replay the deletes before we copy in the inode item from the log.
*/ */
if (!dir) { if (IS_ERR(dir)) {
btrfs_free_path(log_path); btrfs_free_path(log_path);
return 0; ret = PTR_ERR(dir);
if (ret == -ENOENT)
ret = 0;
return ret;
} }
range_start = 0; range_start = 0;
@ -2467,9 +2466,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
struct btrfs_inode *inode; struct btrfs_inode *inode;
u64 from; u64 from;
inode = read_one_inode(root, key.objectid); inode = btrfs_iget_logging(key.objectid, root);
if (!inode) { if (IS_ERR(inode)) {
ret = -EIO; ret = PTR_ERR(inode);
break; break;
} }
from = ALIGN(i_size_read(&inode->vfs_inode), from = ALIGN(i_size_read(&inode->vfs_inode),
@ -7448,6 +7447,8 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
* full log sync. * full log sync.
* Also we don't need to worry about renames, since btrfs_rename() marks the log * Also we don't need to worry about renames, since btrfs_rename() marks the log
* for full commit when renaming a subvolume. * for full commit when renaming a subvolume.
*
* Must be called before creating the subvolume entry in its parent directory.
*/ */
void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans, void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
struct btrfs_inode *dir) struct btrfs_inode *dir)