// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2015 Facebook. All rights reserved.
*/
#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "locking.h"
#include "free-space-tree.h"
#include "transaction.h"
#include "block-group.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path);
static struct btrfs_root *btrfs_free_space_root(
struct btrfs_block_group *block_group)
{
struct btrfs_key key = {
.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
.type = BTRFS_ROOT_ITEM_KEY,
.offset = 0,
};
if (btrfs_fs_incompat(block_group->fs_info, EXTENT_TREE_V2))
key.offset = block_group->global_root_id;
return btrfs_global_root(block_group->fs_info, &key);
}
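/*
* Compute the extent count thresholds at which a block group's free space is
* converted between plain extent items and bitmap items. Once the extent
* count goes above bitmap_high_thresh we switch to bitmaps; once it drops
* below bitmap_low_thresh we switch back to extents.
*/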
void btrfs_set_free_space_tree_thresholds(struct btrfs_block_group *cache)
{
u32 bitmap_range;
size_t bitmap_size;
u64 num_bitmaps, total_bitmap_size;
if (WARN_ON(cache->length == 0))
btrfs_warn(cache->fs_info, "block group %llu length is zero",
cache->start);
/*
* We convert to bitmaps when the disk space required for using extents
* exceeds that required for using bitmaps.
*/
bitmap_range = cache->fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
num_bitmaps = div_u64(cache->length + bitmap_range - 1, bitmap_range);
bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE;
total_bitmap_size = num_bitmaps * bitmap_size;
cache->bitmap_high_thresh = div_u64(total_bitmap_size,
sizeof(struct btrfs_item));
/*
* We allow for a small buffer between the high threshold and low
* threshold to avoid thrashing back and forth between the two formats.
*/
if (cache->bitmap_high_thresh > 100)
cache->bitmap_low_thresh = cache->bitmap_high_thresh - 100;
else
cache->bitmap_low_thresh = 0;
}
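/*
* Insert the FREE_SPACE_INFO item for a block group, keyed by the block
* group's start offset and length, with an extent count of zero and no flags.
*/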
static int add_new_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_free_space_info *info;
struct btrfs_key key;
struct extent_buffer *leaf;
int ret;
key.objectid = block_group->start;
key.type = BTRFS_FREE_SPACE_INFO_KEY;
key.offset = block_group->length;
ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info));
if (ret)
return ret;
leaf = path->nodes[0];
info = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_info);
btrfs_set_free_space_extent_count(leaf, info, 0);
btrfs_set_free_space_flags(leaf, info, 0);
btrfs_release_path(path);
return 0;
}
EXPORT_FOR_TESTS
struct btrfs_free_space_info *btrfs_search_free_space_info(
struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path, int cow)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key;
int ret;
key.objectid = block_group->start;
key.type = BTRFS_FREE_SPACE_INFO_KEY;
key.offset = block_group->length;
ret = btrfs_search_slot(trans, root, &key, path, 0, cow);
if (ret < 0)
return ERR_PTR(ret);
if (ret != 0) {
btrfs_warn(fs_info, "missing free space info for %llu",
block_group->start);
DEBUG_WARN();
return ERR_PTR(-ENOENT);
}
return btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_free_space_info);
}
/*
* btrfs_search_slot() but we're looking for the greatest key less than the
* passed key.
*/
static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_key *key, struct btrfs_path *p,
int ins_len, int cow)
{
int ret;
ret = btrfs_search_slot(trans, root, key, p, ins_len, cow);
if (ret < 0)
return ret;
if (ret == 0) {
DEBUG_WARN();
return -EIO;
}
if (p->slots[0] == 0) {
DEBUG_WARN("no previous slot found");
return -EIO;
}
p->slots[0]--;
return 0;
}
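/* Bytes of bitmap needed to cover 'size' bytes of a block group. */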
static inline u32 free_space_bitmap_size(const struct btrfs_fs_info *fs_info,
u64 size)
{
return DIV_ROUND_UP(size >> fs_info->sectorsize_bits, BITS_PER_BYTE);
}
static unsigned long *alloc_bitmap(u32 bitmap_size)
{
unsigned long *ret;
unsigned int nofs_flag;
u32 bitmap_rounded_size = round_up(bitmap_size, sizeof(unsigned long));
/*
* GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse
* into the filesystem as the free space bitmap can be modified in the
* critical section of a transaction commit.
*
* TODO: push the memalloc_nofs_{save,restore}() to the caller where we
* know that recursion is unsafe.
*/
nofs_flag = memalloc_nofs_save();
ret = kvzalloc(bitmap_rounded_size, GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
return ret;
}
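/* Set 'len' bits starting at 'start' in a little-endian (on-disk) bitmap. */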
static void le_bitmap_set(unsigned long *map, unsigned int start, int len)
{
u8 *p = ((u8 *)map) + BIT_BYTE(start);
const unsigned int size = start + len;
int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE);
u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start);
while (len - bits_to_set >= 0) {
*p |= mask_to_set;
len -= bits_to_set;
bits_to_set = BITS_PER_BYTE;
mask_to_set = ~0;
p++;
}
if (len) {
mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
*p |= mask_to_set;
}
}
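/*
* Convert a block group's free space from extent items to bitmaps: delete all
* FREE_SPACE_EXTENT items while recording them in an in-memory bitmap, set
* BTRFS_FREE_SPACE_USING_BITMAPS in the free space info item and write the
* bitmap back out as FREE_SPACE_BITMAP items.
*/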
EXPORT_FOR_TESTS
int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_free_space_info *info;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
unsigned long *bitmap;
char *bitmap_cursor;
u64 start, end;
u64 bitmap_range, i;
u32 bitmap_size, flags, expected_extent_count;
u32 extent_count = 0;
int done = 0, nr;
int ret;
bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
}
start = block_group->start;
end = block_group->start + block_group->length;
key.objectid = end - 1;
key.type = (u8)-1;
key.offset = (u64)-1;
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
leaf = path->nodes[0];
nr = 0;
path->slots[0]++;
while (path->slots[0] > 0) {
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
ASSERT(found_key.objectid == block_group->start);
ASSERT(found_key.offset == block_group->length);
done = 1;
break;
} else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
u64 first, last;
ASSERT(found_key.objectid >= start);
ASSERT(found_key.objectid < end);
ASSERT(found_key.objectid + found_key.offset <= end);
first = div_u64(found_key.objectid - start,
fs_info->sectorsize);
last = div_u64(found_key.objectid + found_key.offset - start,
fs_info->sectorsize);
le_bitmap_set(bitmap, first, last - first);
extent_count++;
nr++;
path->slots[0]--;
} else {
ASSERT(0);
}
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
}
info = btrfs_search_free_space_info(trans, block_group, path, 1);
if (IS_ERR(info)) {
ret = PTR_ERR(info);
btrfs_abort_transaction(trans, ret);
goto out;
}
leaf = path->nodes[0];
flags = btrfs_free_space_flags(leaf, info);
flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
block_group->using_free_space_bitmaps = true;
block_group->using_free_space_bitmaps_cached = true;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
btrfs_release_path(path);
if (extent_count != expected_extent_count) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
expected_extent_count);
ret = -EIO;
btrfs_abort_transaction(trans, ret);
goto out;
}
bitmap_cursor = (char *)bitmap;
bitmap_range = fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
i = start;
while (i < end) {
unsigned long ptr;
u64 extent_size;
u32 data_size;
extent_size = min(end - i, bitmap_range);
data_size = free_space_bitmap_size(fs_info, extent_size);
key.objectid = i;
key.type = BTRFS_FREE_SPACE_BITMAP_KEY;
key.offset = extent_size;
ret = btrfs_insert_empty_item(trans, root, path, &key,
data_size);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
leaf = path->nodes[0];
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
write_extent_buffer(leaf, bitmap_cursor, ptr,
data_size);
btrfs_release_path(path);
i += extent_size;
bitmap_cursor += data_size;
}
ret = 0;
out:
kvfree(bitmap);
return ret;
}
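/*
* The inverse of btrfs_convert_free_space_to_bitmaps(): read and delete all
* FREE_SPACE_BITMAP items, clear BTRFS_FREE_SPACE_USING_BITMAPS and insert a
* FREE_SPACE_EXTENT item for every contiguous run of set bits.
*/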
EXPORT_FOR_TESTS
int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_free_space_info *info;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
unsigned long *bitmap;
u64 start, end;
u32 bitmap_size, flags, expected_extent_count;
unsigned long nrbits, start_bit, end_bit;
u32 extent_count = 0;
int done = 0, nr;
int ret;
bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
}
start = block_group->start;
end = block_group->start + block_group->length;
key.objectid = end - 1;
key.type = (u8)-1;
key.offset = (u64)-1;
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
leaf = path->nodes[0];
nr = 0;
path->slots[0]++;
while (path->slots[0] > 0) {
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
ASSERT(found_key.objectid == block_group->start);
ASSERT(found_key.offset == block_group->length);
done = 1;
break;
} else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
unsigned long ptr;
char *bitmap_cursor;
u32 bitmap_pos, data_size;
ASSERT(found_key.objectid >= start);
ASSERT(found_key.objectid < end);
ASSERT(found_key.objectid + found_key.offset <= end);
bitmap_pos = div_u64(found_key.objectid - start,
fs_info->sectorsize *
BITS_PER_BYTE);
bitmap_cursor = ((char *)bitmap) + bitmap_pos;
data_size = free_space_bitmap_size(fs_info,
found_key.offset);
path->slots[0]--;
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
read_extent_buffer(leaf, bitmap_cursor, ptr,
data_size);
nr++;
} else {
ASSERT(0);
}
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
}
info = btrfs_search_free_space_info(trans, block_group, path, 1);
if (IS_ERR(info)) {
ret = PTR_ERR(info);
btrfs_abort_transaction(trans, ret);
goto out;
}
leaf = path->nodes[0];
flags = btrfs_free_space_flags(leaf, info);
flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
block_group->using_free_space_bitmaps = false;
block_group->using_free_space_bitmaps_cached = true;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
btrfs_release_path(path);
nrbits = block_group->length >> fs_info->sectorsize_bits;
start_bit = find_next_bit_le(bitmap, nrbits, 0);
while (start_bit < nrbits) {
end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit);
ASSERT(start_bit < end_bit);
key.objectid = start + start_bit * fs_info->sectorsize;
key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
key.offset = (end_bit - start_bit) * fs_info->sectorsize;
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
extent_count++;
start_bit = find_next_bit_le(bitmap, nrbits, end_bit);
}
if (extent_count != expected_extent_count) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
expected_extent_count);
ret = -EIO;
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = 0;
out:
kvfree(bitmap);
return ret;
}
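/*
* Apply a delta to the extent count stored in the free space info item and,
* if the new count crosses one of the thresholds set up by
* btrfs_set_free_space_tree_thresholds(), convert the block group between the
* extent and bitmap representations.
*/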
static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path,
int new_extents)
{
struct btrfs_free_space_info *info;
u32 flags;
u32 extent_count;
int ret = 0;
if (new_extents == 0)
return 0;
info = btrfs_search_free_space_info(trans, block_group, path, 1);
if (IS_ERR(info))
return PTR_ERR(info);
flags = btrfs_free_space_flags(path->nodes[0], info);
extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
extent_count += new_extents;
btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count);
btrfs_release_path(path);
if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
extent_count > block_group->bitmap_high_thresh) {
ret = btrfs_convert_free_space_to_bitmaps(trans, block_group, path);
} else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
extent_count < block_group->bitmap_low_thresh) {
ret = btrfs_convert_free_space_to_extents(trans, block_group, path);
}
return ret;
}
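/*
* Test whether the bit for the block at 'offset' is set in the bitmap item
* that the path currently points to.
*/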
EXPORT_FOR_TESTS
bool btrfs_free_space_test_bit(struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 offset)
{
struct extent_buffer *leaf;
struct btrfs_key key;
u64 found_start, found_end;
unsigned long ptr, i;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
found_start = key.objectid;
found_end = key.objectid + key.offset;
ASSERT(offset >= found_start && offset < found_end);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
i = div_u64(offset - found_start,
block_group->fs_info->sectorsize);
return extent_buffer_test_bit(leaf, ptr, i);
}
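/*
* Set or clear the bits covering [*start, *start + *size) within the bitmap
* item the path points to, then advance *start and shrink *size by the range
* that was covered so the caller can move on to the next bitmap item.
*/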
static void free_space_modify_bits(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 *start, u64 *size,
bool set_bits)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct extent_buffer *leaf;
struct btrfs_key key;
u64 end = *start + *size;
u64 found_start, found_end;
unsigned long ptr, first, last;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
found_start = key.objectid;
found_end = key.objectid + key.offset;
ASSERT(*start >= found_start && *start < found_end);
ASSERT(end > found_start);
if (end > found_end)
end = found_end;
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
first = (*start - found_start) >> fs_info->sectorsize_bits;
last = (end - found_start) >> fs_info->sectorsize_bits;
if (set_bits)
extent_buffer_bitmap_set(leaf, ptr, first, last - first);
else
extent_buffer_bitmap_clear(leaf, ptr, first, last - first);
btrfs_mark_buffer_dirty(trans, leaf);
*size -= end - *start;
*start = end;
}
/*
* We can't use btrfs_next_item() in modify_free_space_bitmap() because
* btrfs_next_leaf() doesn't get the path for writing. We can forgo the fancy
* tree walking in btrfs_next_leaf() anyways because we know exactly what we're
* looking for.
*/
static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *p)
{
struct btrfs_key key;
if (p->slots[0] + 1 < btrfs_header_nritems(p->nodes[0])) {
p->slots[0]++;
return 0;
}
btrfs_item_key_to_cpu(p->nodes[0], &key, p->slots[0]);
btrfs_release_path(p);
key.objectid += key.offset;
key.type = (u8)-1;
key.offset = (u64)-1;
return btrfs_search_prev_slot(trans, root, &key, p, 0, 1);
}
/*
* If remove is true, then we are removing free space, thus clearing bits in
* the bitmap. If remove is false, then we are adding free space, thus setting
* bits in the bitmap.
*/
static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path,
u64 start, u64 size, bool remove)
{
struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key;
u64 end = start + size;
u64 cur_start, cur_size;
bool prev_bit_set = false;
bool next_bit_set = false;
int new_extents;
int ret;
/*
* Read the bit for the block immediately before the extent of space if
* that block is within the block group.
*/
if (start > block_group->start) {
u64 prev_block = start - block_group->fs_info->sectorsize;
key.objectid = prev_block;
key.type = (u8)-1;
key.offset = (u64)-1;
ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
if (ret)
return ret;
prev_bit_set = btrfs_free_space_test_bit(block_group, path, prev_block);
/* The previous block may have been in the previous bitmap. */
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (start >= key.objectid + key.offset) {
ret = free_space_next_bitmap(trans, root, path);
if (ret)
return ret;
}
} else {
key.objectid = start;
key.type = (u8)-1;
key.offset = (u64)-1;
ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
if (ret)
return ret;
}
/*
* Iterate over all of the bitmaps overlapped by the extent of space,
* clearing/setting bits as required.
*/
cur_start = start;
cur_size = size;
while (1) {
free_space_modify_bits(trans, block_group, path, &cur_start,
&cur_size, !remove);
if (cur_size == 0)
break;
ret = free_space_next_bitmap(trans, root, path);
if (ret)
return ret;
}
/*
* Read the bit for the block immediately after the extent of space if
* that block is within the block group.
*/
if (end < block_group->start + block_group->length) {
/* The next block may be in the next bitmap. */
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (end >= key.objectid + key.offset) {
ret = free_space_next_bitmap(trans, root, path);
if (ret)
return ret;
}
next_bit_set = btrfs_free_space_test_bit(block_group, path, end);
}
if (remove) {
new_extents = -1;
if (prev_bit_set) {
/* Leftover on the left. */
new_extents++;
}
if (next_bit_set) {
/* Leftover on the right. */
new_extents++;
}
} else {
new_extents = 1;
if (prev_bit_set) {
/* Merging with neighbor on the left. */
new_extents--;
}
if (next_bit_set) {
/* Merging with neighbor on the right. */
new_extents--;
}
}
btrfs_release_path(path);
return update_free_space_extent_count(trans, block_group, path, new_extents);
}
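/*
* Remove free space from a block group using the extent representation,
* splitting the existing FREE_SPACE_EXTENT item when only part of it is
* consumed.
*/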
static int remove_free_space_extent(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path,
u64 start, u64 size)
{
struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key;
u64 found_start, found_end;
u64 end = start + size;
int new_extents = -1;
int ret;
key.objectid = start;
key.type = (u8)-1;
key.offset = (u64)-1;
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret)
return ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
found_start = key.objectid;
found_end = key.objectid + key.offset;
ASSERT(start >= found_start && end <= found_end);
/*
* Okay, now that we've found the free space extent which contains the
* free space that we are removing, there are four cases:
*
* 1. We're using the whole extent: delete the key we found and
* decrement the free space extent count.
* 2. We are using part of the extent starting at the beginning: delete
* the key we found and insert a new key representing the leftover at
* the end. There is no net change in the number of extents.
* 3. We are using part of the extent ending at the end: delete the key
* we found and insert a new key representing the leftover at the
* beginning. There is no net change in the number of extents.
* 4. We are using part of the extent in the middle: delete the key we
* found and insert two new keys representing the leftovers on each
* side. Where we used to have one extent, we now have two, so increment
* the extent count. We may need to convert the block group to bitmaps
* as a result.
*/
/* Delete the existing key (cases 1-4). */
ret = btrfs_del_item(trans, root, path);
if (ret)
return ret;
/* Add a key for leftovers at the beginning (cases 3 and 4). */
if (start > found_start) {
key.objectid = found_start;
key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
key.offset = start - found_start;
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
if (ret)
return ret;
new_extents++;
}
/* Add a key for leftovers at the end (cases 2 and 4). */
if (end < found_end) {
key.objectid = end;
key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
key.offset = found_end - end;
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
if (ret)
return ret;
new_extents++;
}
btrfs_release_path(path);
return update_free_space_extent_count(trans, block_group, path, new_extents);
}
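/*
* Return 1 if the block group stores its free space as bitmaps, 0 if it uses
* extent items, or a negative errno on failure. The answer is cached in the
* block group to avoid repeated lookups of the free space info item.
*/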
static int using_bitmaps(struct btrfs_block_group *bg, struct btrfs_path *path)
{
struct btrfs_free_space_info *info;
u32 flags;
if (bg->using_free_space_bitmaps_cached)
return bg->using_free_space_bitmaps;
info = btrfs_search_free_space_info(NULL, bg, path, 0);
if (IS_ERR(info))
return PTR_ERR(info);
flags = btrfs_free_space_flags(path->nodes[0], info);
btrfs_release_path(path);
bg->using_free_space_bitmaps = (flags & BTRFS_FREE_SPACE_USING_BITMAPS);
bg->using_free_space_bitmaps_cached = true;
return bg->using_free_space_bitmaps;
}
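/*
* Remove free space with the block group's free_space_lock already held by
* the caller, dispatching to the bitmap or extent implementation. The block
* group's free space is first added to the tree if that is still pending.
*/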
EXPORT_FOR_TESTS
int __btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 start, u64 size)
{
int ret;
ret = __add_block_group_free_space(trans, block_group, path);
if (ret)
return ret;
ret = using_bitmaps(block_group, path);
if (ret < 0)
return ret;
if (ret)
return modify_free_space_bitmap(trans, block_group, path,
start, size, true);
return remove_free_space_extent(trans, block_group, path, start, size);
}
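/*
* Remove the range [start, start + size) from the free space tree, looking up
* the block group containing 'start' and serializing on its free_space_lock.
*/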
int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
u64 start, u64 size)
{
struct btrfs_block_group *block_group;
struct btrfs_path *path;
int ret;
if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
}
block_group = btrfs_lookup_block_group(trans->fs_info, start);
if (!block_group) {
DEBUG_WARN("no block group found for start=%llu", start);
ret = -ENOENT;
btrfs_abort_transaction(trans, ret);
goto out;
}
mutex_lock(&block_group->free_space_lock);
ret = __btrfs_remove_from_free_space_tree(trans, block_group, path, start, size);
mutex_unlock(&block_group->free_space_lock);
if (ret)
btrfs_abort_transaction(trans, ret);
btrfs_put_block_group(block_group);
out:
btrfs_free_path(path);
return ret;
}
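/*
* Add free space to a block group using the extent representation, merging
* with any immediately adjacent FREE_SPACE_EXTENT items.
*/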
static int add_free_space_extent(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path,
u64 start, u64 size)
{
struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key, new_key;
u64 found_start, found_end;
u64 end = start + size;
int new_extents = 1;
int ret;
/*
* We are adding a new extent of free space, but we need to merge
* extents. There are four cases here:
*
* 1. The new extent does not have any immediate neighbors to merge
* with: add the new key and increment the free space extent count. We
* may need to convert the block group to bitmaps as a result.
* 2. The new extent has an immediate neighbor before it: remove the
* previous key and insert a new key combining both of them. There is no
* net change in the number of extents.
* 3. The new extent has an immediate neighbor after it: remove the next
* key and insert a new key combining both of them. There is no net
* change in the number of extents.
* 4. The new extent has immediate neighbors on both sides: remove both
* of the keys and insert a new key combining all of them. Where we used
* to have two extents, we now have one, so decrement the extent count.
*/
new_key.objectid = start;
new_key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
new_key.offset = size;
/* Search for a neighbor on the left. */
if (start == block_group->start)
goto right;
key.objectid = start - 1;
key.type = (u8)-1;
key.offset = (u64)-1;
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret)
return ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) {
ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY);
btrfs_release_path(path);
goto right;
}
found_start = key.objectid;
found_end = key.objectid + key.offset;
ASSERT(found_start >= block_group->start &&
found_end > block_group->start);
ASSERT(found_start < start && found_end <= start);
/*
* Delete the neighbor on the left and absorb it into the new key (cases
* 2 and 4).
*/
if (found_end == start) {
ret = btrfs_del_item(trans, root, path);
if (ret)
return ret;
new_key.objectid = found_start;
new_key.offset += key.offset;
new_extents--;
}
btrfs_release_path(path);
right:
/* Search for a neighbor on the right. */
if (end == block_group->start + block_group->length)
goto insert;
key.objectid = end;
key.type = (u8)-1;
key.offset = (u64)-1;
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret)
return ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) {
ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY);
btrfs_release_path(path);
goto insert;
}
found_start = key.objectid;
found_end = key.objectid + key.offset;
ASSERT(found_start >= block_group->start &&
found_end > block_group->start);
ASSERT((found_start < start && found_end <= start) ||
(found_start >= end && found_end > end));
/*
* Delete the neighbor on the right and absorb it into the new key
* (cases 3 and 4).
*/
if (found_start == end) {
ret = btrfs_del_item(trans, root, path);
if (ret)
return ret;
new_key.offset += key.offset;
new_extents--;
}
btrfs_release_path(path);
insert:
/* Insert the new key (cases 1-4). */
ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0);
if (ret)
return ret;
btrfs_release_path(path);
return update_free_space_extent_count(trans, block_group, path, new_extents);
}
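/*
* Add free space with the block group's free_space_lock already held by the
* caller, dispatching to the bitmap or extent implementation. The block
* group's free space is first added to the tree if that is still pending.
*/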
EXPORT_FOR_TESTS
int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 start, u64 size)
{
int ret;
ret = __add_block_group_free_space(trans, block_group, path);
if (ret)
return ret;
ret = using_bitmaps(block_group, path);
if (ret < 0)
return ret;
if (ret)
return modify_free_space_bitmap(trans, block_group, path,
start, size, false);
return add_free_space_extent(trans, block_group, path, start, size);
}
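/*
* Add the range [start, start + size) to the free space tree, looking up the
* block group containing 'start' and serializing on its free_space_lock.
*/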
int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
u64 start, u64 size)
{
struct btrfs_block_group *block_group;
struct btrfs_path *path;
int ret;
if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
}
block_group = btrfs_lookup_block_group(trans->fs_info, start);
if (!block_group) {
DEBUG_WARN("no block group found for start=%llu", start);
ret = -ENOENT;
btrfs_abort_transaction(trans, ret);
goto out;
}
mutex_lock(&block_group->free_space_lock);
ret = __btrfs_add_to_free_space_tree(trans, block_group, path, start, size);
mutex_unlock(&block_group->free_space_lock);
if (ret)
btrfs_abort_transaction(trans, ret);
btrfs_put_block_group(block_group);
out:
btrfs_free_path(path);
return ret;
}
/*
* Populate the free space tree by walking the extent tree. Operations on the
* extent tree that happen as a result of writes to the free space tree will go
* through the normal add/remove hooks.
*/
static int populate_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
struct btrfs_root *extent_root;
BTRFS_PATH_AUTO_FREE(path);
BTRFS_PATH_AUTO_FREE(path2);
struct btrfs_key key;
u64 start, end;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path2 = btrfs_alloc_path();
if (!path2)
return -ENOMEM;
path->reada = READA_FORWARD;
ret = add_new_free_space_info(trans, block_group, path2);
if (ret)
return ret;
mutex_lock(&block_group->free_space_lock);
/*
* Iterate through all of the extent and metadata items in this block
* group, adding the free space between them and the free space at the
* end. Note that EXTENT_ITEM and METADATA_ITEM are less than
* BLOCK_GROUP_ITEM, so an extent may precede the block group that it's
* contained in.
*/
key.objectid = block_group->start;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = 0;
extent_root = btrfs_extent_root(trans->fs_info, key.objectid);
ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
if (ret < 0)
goto out_locked;
/*
* If ret is 1 (no key found), it means this is an empty block group,
* without any extents allocated from it and there's no block group
* item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree
* because we are using the block group tree feature, so block group
* items are stored in the block group tree. It also means there are no
* extents allocated for block groups with a start offset beyond this
* block group's end offset (this is the last, highest, block group).
*/
if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE))
ASSERT(ret == 0);
start = block_group->start;
end = block_group->start + block_group->length;
while (ret == 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY) {
if (key.objectid >= end)
break;
if (start < key.objectid) {
ret = __btrfs_add_to_free_space_tree(trans,
block_group,
path2, start,
key.objectid -
start);
if (ret)
goto out_locked;
}
start = key.objectid;
if (key.type == BTRFS_METADATA_ITEM_KEY)
start += trans->fs_info->nodesize;
else
start += key.offset;
} else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
if (key.objectid != block_group->start)
break;
}
ret = btrfs_next_item(extent_root, path);
if (ret < 0)
goto out_locked;
}
if (start < end) {
ret = __btrfs_add_to_free_space_tree(trans, block_group, path2,
start, end - start);
if (ret)
goto out_locked;
}
ret = 0;
out_locked:
mutex_unlock(&block_group->free_space_lock);
return ret;
}
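/*
* Create the free space tree root and populate it from the extent tree for
* every existing block group, then set the compat_ro bits that mark the tree
* as present and valid.
*/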
int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *free_space_root;
struct btrfs_block_group *block_group;
struct rb_node *node;
int ret;
trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans))
return PTR_ERR(trans);
set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
free_space_root = btrfs_create_tree(trans,
BTRFS_FREE_SPACE_TREE_OBJECTID);
if (IS_ERR(free_space_root)) {
ret = PTR_ERR(free_space_root);
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out_clear;
}
ret = btrfs_global_root_insert(free_space_root);
if (ret) {
btrfs_put_root(free_space_root);
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out_clear;
}
node = rb_first_cached(&fs_info->block_group_cache_tree);
while (node) {
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
ret = populate_free_space_tree(trans, block_group);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out_clear;
}
node = rb_next(node);
}
btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);
btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
ret = btrfs_commit_transaction(trans);
/*
* Now that we've committed the transaction any reading of our commit
* root will be safe, so we can cache from the free space tree now.
*/
clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
return ret;
out_clear:
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
return ret;
}
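/*
 * Delete every item from the free space tree and clear the
 * BLOCK_GROUP_FLAG_FREE_SPACE_ADDED flag from all block groups, so that a
 * subsequent rebuild does not skip any of them.
 */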
static int clear_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct rb_node *node;
int nr;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = 0;
key.type = 0;
key.offset = 0;
while (1) {
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
return ret;
nr = btrfs_header_nritems(path->nodes[0]);
if (!nr)
break;
path->slots[0] = 0;
ret = btrfs_del_items(trans, root, path, 0, nr);
if (ret)
return ret;
btrfs_release_path(path);
}
node = rb_first_cached(&trans->fs_info->block_group_cache_tree);
while (node) {
struct btrfs_block_group *bg;
bg = rb_entry(node, struct btrfs_block_group, cache_node);
clear_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &bg->runtime_flags);
node = rb_next(node);
cond_resched();
}
return 0;
}
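/*
 * Remove the free space tree entirely: clear the compat_ro bits so the tree
 * is no longer used, delete all of its items, remove its root item and free
 * its root node, then commit the transaction.
 */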
int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_key key = {
.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
.type = BTRFS_ROOT_ITEM_KEY,
.offset = 0,
};
struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key);
int ret;
trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans))
return PTR_ERR(trans);
btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE);
btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
ret = clear_free_space_tree(trans, free_space_root);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
ret = btrfs_del_root(trans, &free_space_root->root_key);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
btrfs_global_root_delete(free_space_root);
spin_lock(&fs_info->trans_lock);
list_del(&free_space_root->dirty_list);
spin_unlock(&fs_info->trans_lock);
btrfs_tree_lock(free_space_root->node);
btrfs_clear_buffer_dirty(trans, free_space_root->node);
btrfs_tree_unlock(free_space_root->node);
ret = btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
free_space_root->node, 0, 1);
btrfs_put_root(free_space_root);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
return btrfs_commit_transaction(trans);
}
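/*
 * Rebuild the free space tree from scratch. The tree is flagged as untrusted
 * while the rebuild runs, all existing items are cleared and then every block
 * group is repopulated, ending and restarting the transaction as needed.
 * Block groups that already got their items added through pending block group
 * creation (BLOCK_GROUP_FLAG_FREE_SPACE_ADDED) are skipped.
 */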
int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_trans_handle *trans;
struct btrfs_key key = {
.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
.type = BTRFS_ROOT_ITEM_KEY,
.offset = 0,
};
struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key);
struct rb_node *node;
int ret;
trans = btrfs_start_transaction(free_space_root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);
set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
ret = clear_free_space_tree(trans, free_space_root);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
node = rb_first_cached(&fs_info->block_group_cache_tree);
while (node) {
struct btrfs_block_group *block_group;
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
if (test_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED,
&block_group->runtime_flags))
goto next;
ret = populate_free_space_tree(trans, block_group);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
next:
if (btrfs_should_end_transaction(trans)) {
btrfs_end_transaction(trans);
trans = btrfs_start_transaction(free_space_root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);
}
node = rb_next(node);
}
btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);
btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
ret = btrfs_commit_transaction(trans);
clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
return ret;
}
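/*
 * Add the initial items for a block group: a free space info item plus a
 * single free space extent covering the whole block group. This runs at most
 * once per block group, guarded by BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE.
 */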
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
bool own_path = false;
int ret;
if (!test_and_clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
&block_group->runtime_flags))
return 0;
/*
* While rebuilding the free space tree we may allocate new metadata
* block groups while modifying the free space tree.
*
* Because during the rebuild (at btrfs_rebuild_free_space_tree()) we
* can use multiple transactions, every time btrfs_end_transaction() is
* called at btrfs_rebuild_free_space_tree() we finish the creation of
* new block groups by calling btrfs_create_pending_block_groups(), and
* that in turn calls us, through add_block_group_free_space(), to add
* a free space info item and a free space extent item for the block
* group.
*
* Then later btrfs_rebuild_free_space_tree() may find such new block
* groups and process them with populate_free_space_tree(), which can
* fail with EEXIST since there are already items for the block group in
* the free space tree. Notice that we say "may find" because a new
* block group may be added to the block groups rbtree in a node before
* or after the block group currently being processed by the rebuild
* process. So signal the rebuild process to skip such new block groups
* if it finds them.
*/
set_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &block_group->runtime_flags);
if (!path) {
path = btrfs_alloc_path();
if (!path) {
btrfs_abort_transaction(trans, -ENOMEM);
return -ENOMEM;
}
own_path = true;
}
ret = add_new_free_space_info(trans, block_group, path);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = __btrfs_add_to_free_space_tree(trans, block_group, path,
block_group->start, block_group->length);
if (ret)
btrfs_abort_transaction(trans, ret);
out:
if (own_path)
btrfs_free_path(path);
return ret;
}
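/*
 * Add the free space tree items for a new block group, serialized by the
 * block group's free space mutex.
 */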
int btrfs_add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
int ret;
if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
mutex_lock(&block_group->free_space_lock);
ret = __add_block_group_free_space(trans, block_group, NULL);
mutex_unlock(&block_group->free_space_lock);
return ret;
}
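/*
 * Delete the free space info item and all free space extent and bitmap items
 * of a block group that is being removed, walking the tree backwards from the
 * end of the block group's range.
 */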
int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_path *path;
struct btrfs_key key, found_key;
struct extent_buffer *leaf;
u64 start, end;
int done = 0, nr;
int ret;
if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
/* We never added this block group to the free space tree. */
return 0;
}
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
}
start = block_group->start;
end = block_group->start + block_group->length;
key.objectid = end - 1;
key.type = (u8)-1;
key.offset = (u64)-1;
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
leaf = path->nodes[0];
nr = 0;
path->slots[0]++;
while (path->slots[0] > 0) {
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
ASSERT(found_key.objectid == block_group->start);
ASSERT(found_key.offset == block_group->length);
done = 1;
nr++;
path->slots[0]--;
break;
} else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY ||
found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
ASSERT(found_key.objectid >= start);
ASSERT(found_key.objectid < end);
ASSERT(found_key.objectid + found_key.offset <= end);
nr++;
path->slots[0]--;
} else {
ASSERT(0);
}
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
}
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
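/*
 * Load free space from a block group stored in bitmap format: walk the bitmap
 * items and turn every run of set bits into an in-memory free space extent,
 * waking up waiters on the caching control as progress is made.
 */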
static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
struct btrfs_path *path,
u32 expected_extent_count)
{
struct btrfs_block_group *block_group;
struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
struct btrfs_key key;
bool prev_bit_set = false;
/* Initialize to silence GCC. */
u64 extent_start = 0;
u64 end, offset;
u64 total_found = 0;
u32 extent_count = 0;
int ret;
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
root = btrfs_free_space_root(block_group);
end = block_group->start + block_group->length;
while (1) {
ret = btrfs_next_item(root, path);
if (ret < 0)
return ret;
if (ret)
break;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
break;
ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
ASSERT(key.objectid < end && key.objectid + key.offset <= end);
offset = key.objectid;
while (offset < key.objectid + key.offset) {
bool bit_set;
bit_set = btrfs_free_space_test_bit(block_group, path, offset);
if (!prev_bit_set && bit_set) {
extent_start = offset;
} else if (prev_bit_set && !bit_set) {
u64 space_added;
ret = btrfs_add_new_free_space(block_group,
extent_start,
offset,
&space_added);
if (ret)
return ret;
total_found += space_added;
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
wake_up(&caching_ctl->wait);
}
extent_count++;
}
prev_bit_set = bit_set;
offset += fs_info->sectorsize;
}
}
if (prev_bit_set) {
ret = btrfs_add_new_free_space(block_group, extent_start, end, NULL);
if (ret)
return ret;
extent_count++;
}
if (extent_count != expected_extent_count) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
expected_extent_count);
DEBUG_WARN();
return -EIO;
}
return 0;
}
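/*
 * Load free space from a block group stored in extent format: add each free
 * space extent item to the in-memory cache, waking up waiters on the caching
 * control as progress is made.
 */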
static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
struct btrfs_path *path,
u32 expected_extent_count)
{
struct btrfs_block_group *block_group;
struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
struct btrfs_key key;
u64 end;
u64 total_found = 0;
u32 extent_count = 0;
int ret;
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
root = btrfs_free_space_root(block_group);
end = block_group->start + block_group->length;
while (1) {
u64 space_added;
ret = btrfs_next_item(root, path);
if (ret < 0)
return ret;
if (ret)
break;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
break;
ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
ASSERT(key.objectid < end && key.objectid + key.offset <= end);
ret = btrfs_add_new_free_space(block_group, key.objectid,
key.objectid + key.offset,
&space_added);
if (ret)
return ret;
total_found += space_added;
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
wake_up(&caching_ctl->wait);
}
extent_count++;
}
if (extent_count != expected_extent_count) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
expected_extent_count);
DEBUG_WARN();
return -EIO;
}
return 0;
}
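/*
 * Cache a block group's free space from the free space tree. This reads the
 * commit root without locking, looks up the free space info item and then
 * loads either the bitmap or the extent representation.
 */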
int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group *block_group;
struct btrfs_free_space_info *info;
BTRFS_PATH_AUTO_FREE(path);
u32 extent_count, flags;
block_group = caching_ctl->block_group;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
/*
* Just like caching_thread() doesn't want to deadlock on the extent
* tree, we don't want to deadlock on the free space tree.
*/
path->skip_locking = 1;
path->search_commit_root = 1;
path->reada = READA_FORWARD;
info = btrfs_search_free_space_info(NULL, block_group, path, 0);
if (IS_ERR(info))
return PTR_ERR(info);
extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
flags = btrfs_free_space_flags(path->nodes[0], info);
/*
* We left path pointing to the free space info item, so now
* load_free_space_foo can just iterate through the free space tree from
* there.
*/
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS)
return load_free_space_bitmaps(caching_ctl, path, extent_count);
else
return load_free_space_extents(caching_ctl, path, extent_count);
}