btrfs: convert the buffer_radix to an xarray

In order to fully utilize xarray tagging to improve writeback we need to
convert the buffer_radix to a proper xarray.  The conversion is
relatively straightforward, as the radix tree code already uses an
xarray underneath.  Using the xarray directly also results in quite a
lot less code.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Author:    Josef Bacik <josef@toxicpanda.com>
Date:      2025-04-28 10:52:55 -04:00
Committer: David Sterba
parent 656e9f51de
commit 19d7f65f03
5 changed files with 111 additions and 167 deletions
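For readers less familiar with the xarray API, the conversion below is
mostly a one-for-one swap of calls.  A rough sketch of the
correspondence (illustrative only, with index standing for
eb->start >> fs_info->sectorsize_bits):

	/* Lookup: radix_tree_lookup() becomes xa_load(), still RCU safe. */
	eb = xa_load(&fs_info->buffer_tree, index);

	/*
	 * Insert: radix_tree_preload() + radix_tree_insert() collapse into a
	 * single __xa_cmpxchg(NULL -> eb) under the xa_lock; the xarray
	 * allocates its own nodes, so no preload step is needed.
	 */
	exists = __xa_cmpxchg(&fs_info->buffer_tree, index, NULL, eb, GFP_NOFS);

	/*
	 * Delete: radix_tree_delete_item() becomes xa_cmpxchg_irq(eb -> NULL),
	 * which only clears the slot if it still holds this eb.
	 */
	xa_cmpxchg_irq(&fs_info->buffer_tree, index, eb, NULL, GFP_ATOMIC);

	/*
	 * Iterate: radix_tree_for_each_slot() and gang lookup loops become
	 * xa_for_each() and friends, which hide the deref/retry handling.
	 */
	xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
		/* ... */
	}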

fs/btrfs/disk-io.c

@@ -2761,10 +2761,21 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
 	return ret;
 }
 
+/*
+ * Lockdep gets confused between our buffer_tree which requires IRQ locking
+ * because we modify marks in the IRQ context, and our delayed inode xarray
+ * which doesn't have these requirements. Use a class key so lockdep doesn't
+ * get them mixed up.
+ */
+static struct lock_class_key buffer_xa_class;
+
 void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 {
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
-	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+
+	/* Use the same flags as mapping->i_pages. */
+	xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
+	lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class);
+
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -2776,7 +2787,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	spin_lock_init(&fs_info->delayed_iput_lock);
 	spin_lock_init(&fs_info->defrag_inodes_lock);
 	spin_lock_init(&fs_info->super_lock);
-	spin_lock_init(&fs_info->buffer_lock);
 	spin_lock_init(&fs_info->unused_bgs_lock);
 	spin_lock_init(&fs_info->treelog_bg_lock);
 	spin_lock_init(&fs_info->zone_active_bgs_lock);
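The XA_FLAGS_LOCK_IRQ flag and the private lockdep class are needed
because writeback marks on this tree will be updated from end-io
(interrupt) context, so any process context holder of the lock has to
disable interrupts.  A minimal sketch of the pattern the initialization
anticipates; the actual mark usage arrives with the follow-up writeback
patches, and XA_MARK_0 below is only a placeholder:

	unsigned long flags;

	/* Process context: take the xa_lock IRQ safely before modifying. */
	xa_lock_irqsave(&fs_info->buffer_tree, flags);
	/* __xa_set_mark() expects the xa_lock to already be held. */
	__xa_set_mark(&fs_info->buffer_tree, index, XA_MARK_0);
	xa_unlock_irqrestore(&fs_info->buffer_tree, flags);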

fs/btrfs/extent_io.c

@@ -1866,19 +1866,17 @@ static void set_btree_ioerr(struct extent_buffer *eb)
  * context.
  */
 static struct extent_buffer *find_extent_buffer_nolock(
-		const struct btrfs_fs_info *fs_info, u64 start)
+		struct btrfs_fs_info *fs_info, u64 start)
 {
 	struct extent_buffer *eb;
+	unsigned long index = (start >> fs_info->sectorsize_bits);
 
 	rcu_read_lock();
-	eb = radix_tree_lookup(&fs_info->buffer_radix,
-			       start >> fs_info->sectorsize_bits);
-	if (eb && atomic_inc_not_zero(&eb->refs)) {
-		rcu_read_unlock();
-		return eb;
-	}
+	eb = xa_load(&fs_info->buffer_tree, index);
+	if (eb && !atomic_inc_not_zero(&eb->refs))
+		eb = NULL;
 	rcu_read_unlock();
-	return NULL;
+	return eb;
 }
 
 static void end_bbio_meta_write(struct btrfs_bio *bbio)
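The lockless lookup stays correct for the same reason it did with the
radix tree: extent buffers are freed via RCU, so under rcu_read_lock()
the pointer returned by xa_load() remains valid long enough to attempt
atomic_inc_not_zero(), which refuses to resurrect a buffer whose
refcount has already hit zero.  From the caller's side the contract is
(sketch only):

	eb = find_extent_buffer_nolock(fs_info, start);
	if (eb) {
		/* We hold a reference, eb cannot be freed under us. */
		...
		/* Drop the reference the lookup took for us. */
		free_extent_buffer(eb);
	}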
@@ -2742,11 +2740,10 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio)
 	if (!btrfs_meta_is_subpage(fs_info)) {
 		/*
-		 * We do this since we'll remove the pages after we've
-		 * removed the eb from the radix tree, so we could race
-		 * and have this page now attached to the new eb.  So
-		 * only clear folio if it's still connected to
-		 * this eb.
+		 * We do this since we'll remove the pages after we've removed
+		 * the eb from the xarray, so we could race and have this page
+		 * now attached to the new eb.  So only clear folio if it's
+		 * still connected to this eb.
 		 */
 		if (folio_test_private(folio) && folio_get_private(folio) == eb) {
 			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
@@ -2911,9 +2908,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
 {
 	int refs;
 	/*
-	 * The TREE_REF bit is first set when the extent_buffer is added
-	 * to the radix tree. It is also reset, if unset, when a new reference
-	 * is created by find_extent_buffer.
+	 * The TREE_REF bit is first set when the extent_buffer is added to the
+	 * xarray. It is also reset, if unset, when a new reference is created
+	 * by find_extent_buffer.
 	 *
 	 * It is only cleared in two cases: freeing the last non-tree
 	 * reference to the extent_buffer when its STALE bit is set or
@@ -2925,13 +2922,12 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
 	 * conditions between the calls to check_buffer_tree_ref in those
 	 * codepaths and clearing TREE_REF in try_release_extent_buffer.
 	 *
-	 * The actual lifetime of the extent_buffer in the radix tree is
-	 * adequately protected by the refcount, but the TREE_REF bit and
-	 * its corresponding reference are not. To protect against this
-	 * class of races, we call check_buffer_tree_ref from the codepaths
-	 * which trigger io. Note that once io is initiated, TREE_REF can no
-	 * longer be cleared, so that is the moment at which any such race is
-	 * best fixed.
+	 * The actual lifetime of the extent_buffer in the xarray is adequately
+	 * protected by the refcount, but the TREE_REF bit and its corresponding
+	 * reference are not. To protect against this class of races, we call
+	 * check_buffer_tree_ref() from the code paths which trigger io. Note that
+	 * once io is initiated, TREE_REF can no longer be cleared, so that is
+	 * the moment at which any such race is best fixed.
 	 */
 	refs = atomic_read(&eb->refs);
 	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -2995,23 +2991,25 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
 		return ERR_PTR(-ENOMEM);
 	eb->fs_info = fs_info;
 again:
-	ret = radix_tree_preload(GFP_NOFS);
-	if (ret) {
-		exists = ERR_PTR(ret);
-		goto free_eb;
-	}
-	spin_lock(&fs_info->buffer_lock);
-	ret = radix_tree_insert(&fs_info->buffer_radix,
-				start >> fs_info->sectorsize_bits, eb);
-	spin_unlock(&fs_info->buffer_lock);
-	radix_tree_preload_end();
-	if (ret == -EEXIST) {
-		exists = find_extent_buffer(fs_info, start);
-		if (exists)
-			goto free_eb;
-		else
-			goto again;
-	}
+	xa_lock_irq(&fs_info->buffer_tree);
+	exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits,
+			      NULL, eb, GFP_NOFS);
+	if (xa_is_err(exists)) {
+		ret = xa_err(exists);
+		xa_unlock_irq(&fs_info->buffer_tree);
+		btrfs_release_extent_buffer(eb);
+		return ERR_PTR(ret);
+	}
+	if (exists) {
+		if (!atomic_inc_not_zero(&exists->refs)) {
+			/* The extent buffer is being freed, retry. */
+			xa_unlock_irq(&fs_info->buffer_tree);
+			goto again;
+		}
+		xa_unlock_irq(&fs_info->buffer_tree);
+		goto free_eb;
+	}
+	xa_unlock_irq(&fs_info->buffer_tree);
 	check_buffer_tree_ref(eb);
 
 	return eb;
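Note how __xa_cmpxchg() returning the old entry folds the radix tree's
insert-then-lookup-on-EEXIST dance into a single step: an error entry
means the allocation failed, NULL means we inserted, and anything else
is an existing eb that is either live (take a reference) or mid-teardown
(retry until release_extent_buffer() clears the slot).  The same
decision tree in miniature, as a sketch with declarations elided:

	old = __xa_cmpxchg(&xa, index, NULL, new_eb, GFP_NOFS);
	if (xa_is_err(old))
		return xa_err(old);	/* e.g. -ENOMEM */
	if (!old)
		return 0;		/* new_eb now owns the slot */
	if (!atomic_inc_not_zero(&old->refs))
		goto retry;		/* dying entry still occupies the slot */
	/* Found a live entry: use old, discard new_eb. */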
@@ -3032,9 +3030,9 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
 	lockdep_assert_held(&folio->mapping->i_private_lock);
 
 	/*
-	 * For subpage case, we completely rely on radix tree to ensure we
-	 * don't try to insert two ebs for the same bytenr.  So here we always
-	 * return NULL and just continue.
+	 * For subpage case, we completely rely on xarray to ensure we don't try
+	 * to insert two ebs for the same bytenr.  So here we always return NULL
+	 * and just continue.
 	 */
 	if (btrfs_meta_is_subpage(fs_info))
 		return NULL;
@@ -3165,7 +3163,7 @@ finish:
 	/*
 	 * To inform we have an extra eb under allocation, so that
 	 * detach_extent_buffer_page() won't release the folio private when the
-	 * eb hasn't been inserted into radix tree yet.
+	 * eb hasn't been inserted into the xarray yet.
 	 *
 	 * The ref will be decreased when the eb releases the page, in
 	 * detach_extent_buffer_page(). Thus needs no special handling in the
@@ -3299,10 +3297,9 @@ reallocate:
 		/*
 		 * We can't unlock the pages just yet since the extent buffer
-		 * hasn't been properly inserted in the radix tree, this
-		 * opens a race with btree_release_folio which can free a page
-		 * while we are still filling in all pages for the buffer and
-		 * we could crash.
+		 * hasn't been properly inserted into the xarray, this opens a
+		 * race with btree_release_folio() which can free a page while we
+		 * are still filling in all pages for the buffer and we could crash.
 		 */
 	}
 	if (uptodate)
@@ -3311,23 +3308,25 @@ reallocate:
 	if (page_contig)
 		eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
 again:
-	ret = radix_tree_preload(GFP_NOFS);
-	if (ret)
-		goto out;
-	spin_lock(&fs_info->buffer_lock);
-	ret = radix_tree_insert(&fs_info->buffer_radix,
-				start >> fs_info->sectorsize_bits, eb);
-	spin_unlock(&fs_info->buffer_lock);
-	radix_tree_preload_end();
-	if (ret == -EEXIST) {
-		ret = 0;
-		existing_eb = find_extent_buffer(fs_info, start);
-		if (existing_eb)
-			goto out;
-		else
-			goto again;
-	}
+	xa_lock_irq(&fs_info->buffer_tree);
+	existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
+				   start >> fs_info->sectorsize_bits, NULL, eb,
+				   GFP_NOFS);
+	if (xa_is_err(existing_eb)) {
+		ret = xa_err(existing_eb);
+		xa_unlock_irq(&fs_info->buffer_tree);
+		goto out;
+	}
+	if (existing_eb) {
+		if (!atomic_inc_not_zero(&existing_eb->refs)) {
+			xa_unlock_irq(&fs_info->buffer_tree);
+			goto again;
+		}
+		xa_unlock_irq(&fs_info->buffer_tree);
+		goto out;
+	}
+	xa_unlock_irq(&fs_info->buffer_tree);
 
 	/* add one reference for the tree */
 	check_buffer_tree_ref(eb);
@@ -3397,10 +3396,23 @@ static int release_extent_buffer(struct extent_buffer *eb)
 		spin_unlock(&eb->refs_lock);
 
-		spin_lock(&fs_info->buffer_lock);
-		radix_tree_delete_item(&fs_info->buffer_radix,
-				       eb->start >> fs_info->sectorsize_bits, eb);
-		spin_unlock(&fs_info->buffer_lock);
+		/*
+		 * We're erasing, theoretically there will be no allocations, so
+		 * just use GFP_ATOMIC.
+		 *
+		 * We use cmpxchg instead of erase because we do not know if
+		 * this eb is actually in the tree or not, we could be cleaning
+		 * up an eb that we allocated but never inserted into the tree.
+		 * Thus use cmpxchg to remove it from the tree if it is there,
+		 * or leave the other entry if this isn't in the tree.
+		 *
+		 * The documentation says that putting a NULL value is the same
+		 * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't
+		 * in this case.
+		 */
+		xa_cmpxchg_irq(&fs_info->buffer_tree,
+			       eb->start >> fs_info->sectorsize_bits, eb, NULL,
+			       GFP_ATOMIC);
 
 		btrfs_leak_debug_del_eb(eb);
 		/* Should be safe to release folios at this point. */
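The cmpxchg-on-delete is the subtle part of the conversion.  By the
time release_extent_buffer() gets here, the slot may already hold a
different eb for the same bytenr, because this eb may never have been
inserted at all, and a plain xa_erase() would wrongly remove the
newcomer.  Comparing against eb first makes the removal a no-op in that
case.  A sketch, with eb_old and eb_new as hypothetical stand-ins:

	/* eb_new owns the slot; eb_old never made it into the tree. */
	xa_store_irq(&xa, index, eb_new, GFP_NOFS);

	/* No-op: the slot does not hold eb_old, so nothing is removed. */
	xa_cmpxchg_irq(&xa, index, eb_old, NULL, GFP_ATOMIC);

	/* Matches: the slot holds eb_new, so this clears it. */
	xa_cmpxchg_irq(&xa, index, eb_new, NULL, GFP_ATOMIC);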
@@ -4231,71 +4243,17 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
 	}
 }
 
-#define GANG_LOOKUP_SIZE 16
-static struct extent_buffer *get_next_extent_buffer(
-		const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr)
-{
-	struct extent_buffer *gang[GANG_LOOKUP_SIZE];
-	struct extent_buffer *found = NULL;
-	u64 folio_start = folio_pos(folio);
-	u64 cur = folio_start;
-
-	ASSERT(in_range(bytenr, folio_start, PAGE_SIZE));
-	lockdep_assert_held(&fs_info->buffer_lock);
-
-	while (cur < folio_start + PAGE_SIZE) {
-		int ret;
-		int i;
-
-		ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
-				(void **)gang, cur >> fs_info->sectorsize_bits,
-				min_t(unsigned int, GANG_LOOKUP_SIZE,
-				      PAGE_SIZE / fs_info->nodesize));
-		if (ret == 0)
-			goto out;
-		for (i = 0; i < ret; i++) {
-			/* Already beyond page end */
-			if (gang[i]->start >= folio_start + PAGE_SIZE)
-				goto out;
-			/* Found one */
-			if (gang[i]->start >= bytenr) {
-				found = gang[i];
-				goto out;
-			}
-		}
-		cur = gang[ret - 1]->start + gang[ret - 1]->len;
-	}
-out:
-	return found;
-}
-
 static int try_release_subpage_extent_buffer(struct folio *folio)
 {
 	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
-	u64 cur = folio_pos(folio);
-	const u64 end = cur + PAGE_SIZE;
+	struct extent_buffer *eb;
+	unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits);
+	unsigned long index = start;
+	unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
 	int ret;
 
-	while (cur < end) {
-		struct extent_buffer *eb = NULL;
-
-		/*
-		 * Unlike try_release_extent_buffer() which uses folio private
-		 * to grab buffer, for subpage case we rely on radix tree, thus
-		 * we need to ensure radix tree consistency.
-		 *
-		 * We also want an atomic snapshot of the radix tree, thus go
-		 * with spinlock rather than RCU.
-		 */
-		spin_lock(&fs_info->buffer_lock);
-		eb = get_next_extent_buffer(fs_info, folio, cur);
-		if (!eb) {
-			/* No more eb in the page range after or at cur */
-			spin_unlock(&fs_info->buffer_lock);
-			break;
-		}
-		cur = eb->start + eb->len;
-
+	xa_lock_irq(&fs_info->buffer_tree);
+	xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) {
 		/*
 		 * The same as try_release_extent_buffer(), to ensure the eb
 		 * won't disappear out from under us.
@@ -4303,10 +4261,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
 		spin_lock(&eb->refs_lock);
 		if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
 			spin_unlock(&eb->refs_lock);
-			spin_unlock(&fs_info->buffer_lock);
-			break;
+			continue;
 		}
-		spin_unlock(&fs_info->buffer_lock);
+		xa_unlock_irq(&fs_info->buffer_tree);
 
 		/*
 		 * If tree ref isn't set then we know the ref on this eb is a
@@ -4324,7 +4281,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
 		 * release_extent_buffer() will release the refs_lock.
 		 */
 		release_extent_buffer(eb);
+		xa_lock_irq(&fs_info->buffer_tree);
 	}
+	xa_unlock_irq(&fs_info->buffer_tree);
 
 	/*
 	 * Finally to check if we have cleared folio private, as if we have
 	 * released all ebs in the page, the folio private should be cleared now.
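Dropping and retaking the xa_lock inside xa_for_each_range() is safe
because the iterator carries only an index, not a pointer into the
tree: every iteration re-walks from index, so entries erased in the
window (including by the release_extent_buffer() call above) are simply
skipped.  The shape of the pattern, with hypothetical helpers:

	xa_lock_irq(&xa);
	xa_for_each_range(&xa, index, entry, start, end) {
		if (!needs_work(entry))		/* hypothetical */
			continue;
		xa_unlock_irq(&xa);
		do_blocking_work(entry);	/* hypothetical, may sleep */
		/* The next iteration re-walks from index. */
		xa_lock_irq(&xa);
	}
	xa_unlock_irq(&xa);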

fs/btrfs/fs.h

@@ -777,10 +777,8 @@ struct btrfs_fs_info {
 	struct btrfs_delayed_root *delayed_root;
 
-	/* Extent buffer radix tree */
-	spinlock_t buffer_lock;
 	/* Entries are eb->start / sectorsize */
-	struct radix_tree_root buffer_radix;
+	struct xarray buffer_tree;
 
 	/* Next backup root to be overwritten */
 	int backup_root_index;

fs/btrfs/tests/btrfs-tests.c

@@ -157,9 +157,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
 
 void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
 {
-	struct radix_tree_iter iter;
-	void **slot;
 	struct btrfs_device *dev, *tmp;
+	struct extent_buffer *eb;
+	unsigned long index;
 
 	if (!fs_info)
 		return;
@@ -169,25 +169,13 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
 		test_mnt->mnt_sb->s_fs_info = NULL;
 
-	spin_lock(&fs_info->buffer_lock);
-	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
-		struct extent_buffer *eb;
-
-		eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
-		if (!eb)
-			continue;
-		/* Shouldn't happen but that kind of thinking creates CVE's */
-		if (radix_tree_exception(eb)) {
-			if (radix_tree_deref_retry(eb))
-				slot = radix_tree_iter_retry(&iter);
-			continue;
-		}
-		slot = radix_tree_iter_resume(slot, &iter);
-		spin_unlock(&fs_info->buffer_lock);
-		free_extent_buffer_stale(eb);
-		spin_lock(&fs_info->buffer_lock);
+	xa_lock_irq(&fs_info->buffer_tree);
+	xa_for_each(&fs_info->buffer_tree, index, eb) {
+		xa_unlock_irq(&fs_info->buffer_tree);
+		free_extent_buffer(eb);
+		xa_lock_irq(&fs_info->buffer_tree);
 	}
-	spin_unlock(&fs_info->buffer_lock);
+	xa_unlock_irq(&fs_info->buffer_tree);
 
 	btrfs_mapping_tree_free(fs_info);
 	list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,

fs/btrfs/zoned.c

@@ -2171,27 +2171,15 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
 {
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	const u64 end = block_group->start + block_group->length;
-	struct radix_tree_iter iter;
 	struct extent_buffer *eb;
-	void __rcu **slot;
+	unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits);
 
 	rcu_read_lock();
-	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
-				 block_group->start >> fs_info->sectorsize_bits) {
-		eb = radix_tree_deref_slot(slot);
-		if (!eb)
-			continue;
-		if (radix_tree_deref_retry(eb)) {
-			slot = radix_tree_iter_retry(&iter);
-			continue;
-		}
-
+	xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
 		if (eb->start < block_group->start)
 			continue;
 		if (eb->start >= end)
 			break;
-		slot = radix_tree_iter_resume(slot, &iter);
 		rcu_read_unlock();
 		wait_on_extent_buffer_writeback(eb);
 		rcu_read_lock();
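The same restart property lets this loop drop rcu_read_lock() around a
potentially long writeback wait: xa_for_each_start() resumes by
re-walking from the saved index, so no slot pointer is held across the
sleep.  This is also where most of the deleted lines in this patch come
from, since the old radix tree iteration needed explicit deref_slot(),
deref_retry() and iter_resume() handling that the xarray iterator does
internally.  In outline, with wait_for_entry() as a hypothetical
stand-in for wait_on_extent_buffer_writeback():

	rcu_read_lock();
	xa_for_each_start(&xa, index, entry, start) {
		rcu_read_unlock();
		wait_for_entry(entry);	/* hypothetical, may sleep */
		/* Iteration resumes from index on the next pass. */
		rcu_read_lock();
	}
	rcu_read_unlock();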