mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-05-24 10:39:52 +00:00

Currently we're using btrfs_lock_and_flush_ordered_range() for both btrfs_read_folio() and btrfs_readahead(), but it has one critical problem for future subpage optimizations: - It will call btrfs_start_ordered_extent() to writeback the involved folios But remember we're calling btrfs_lock_and_flush_ordered_range() at read paths, meaning the folio is already locked by read path. If we really trigger writeback for those already locked folios, this will lead to a deadlock and writeback cannot get the folio lock. Such dead lock is prevented by the fact that btrfs always keeps a dirty folio also uptodate, by either dirtying all blocks of the folio, or by reading the whole folio before dirtying. To prepare for the incoming patch which allows btrfs to skip full folio read if the buffered write is block aligned, we have to start by solving the possible deadlock first. Instead of blindly calling btrfs_start_ordered_extent(), introduce a new helper, which is smarter in the following ways: - Only wait and flush the ordered extent if * The folio doesn't even have private bit set * Part of the blocks of the ordered extent are not uptodate This can happen by: * The folio writeback finished, then got invalidated. There are a lot of reasons that a folio can get invalidated, from memory pressure to direct IO (which invalidates all folios of the range). But OE not yet finished. We have to wait for the ordered extent, as the OE may contain to-be-inserted data checksum. Without waiting, our read can fail due to the missing checksum. But either way, the OE should not need any extra flush inside the locked folio range. - Skip the ordered extent completely if * All the blocks are dirty This happens when OE creation is caused by a folio writeback whose file offset is before our folio. E.g. 
16K page size and 4K block size 0 8K 16K 24K 32K |//////////////||///////| | The writeback of folio 0 created an OE for range [0, 24K), but since folio 16K is not fully uptodate, a read is triggered for folio 16K. The writeback will never happen (we're holding the folio lock for read), nor will the OE finish. Thus we must skip the range. * All the blocks are uptodate This happens when the writeback finished, but OE not yet finished. Since the blocks are already uptodate, we can skip the OE range. The new helper lock_extents_for_read() will do a loop for the target range by: 1) Lock the full range 2) If there is no ordered extent in the remaining range, exit 3) If there is an ordered extent that we can skip Skip to the end of the OE, and continue checking We do not trigger writeback nor wait for the OE. 4) If there is an ordered extent that we cannot skip Unlock the whole extent range and start the ordered extent. And also update btrfs_start_ordered_extent() to add two more parameters: @nowriteback_start and @nowriteback_len, to prevent triggering flush for a certain range. This will allow us to handle the following case properly in the future: 16K page size, 4K btrfs block size: 0 4K 8K 12K 16K 20K 24K 28K 32K |/////////////////////////////||////////////////| | | |<-------------------- OE 2 ------------------->| |< OE 1 >| The folio has been written back before, thus we have an OE at [28K, 32K). Although the OE 1 finished its IO, the OE is not yet removed from IO tree. The folio got invalidated after writeback completed and before the ordered extent finished. And [16K, 24K) range is dirty and uptodate, caused by a block aligned buffered write (and future enhancements allowing btrfs to skip full folio read for such case). But writeback for folio 0 has began, thus it generated OE 2, covering range [0, 24K). 
Since the full folio 16K is not uptodate, if we want to read the folio, the existing btrfs_lock_and_flush_ordered_range() will deadlock, by: btrfs_read_folio() | Folio 16K is already locked |- btrfs_lock_and_flush_ordered_range() |- btrfs_start_ordered_extent() for range [16K, 24K) |- filemap_fdatawrite_range() for range [16K, 24K) |- extent_write_cache_pages() folio_lock() on folio 16K, deadlock. But now we will have the following sequence: btrfs_read_folio() | Folio 16K is already locked |- lock_extents_for_read() |- can_skip_ordered_extent() for range [16K, 24K) | Returned true, the range [16K, 24K) will be skipped. |- can_skip_ordered_extent() for range [28K, 32K) | Returned false. |- btrfs_start_ordered_extent() for range [28K, 32K) with [16K, 32K) as no writeback range No writeback for folio 16K will be triggered. And there will be no more possible deadlock on the same folio. Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
228 lines
6.7 KiB
C
228 lines
6.7 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* Copyright (C) 2007 Oracle. All rights reserved.
|
|
*/
|
|
|
|
#ifndef BTRFS_ORDERED_DATA_H
|
|
#define BTRFS_ORDERED_DATA_H
|
|
|
|
#include <linux/types.h>
|
|
#include <linux/list.h>
|
|
#include <linux/refcount.h>
|
|
#include <linux/completion.h>
|
|
#include <linux/rbtree.h>
|
|
#include <linux/wait.h>
|
|
#include "async-thread.h"
|
|
|
|
struct inode;
|
|
struct page;
|
|
struct extent_state;
|
|
struct btrfs_block_group;
|
|
struct btrfs_inode;
|
|
struct btrfs_root;
|
|
struct btrfs_fs_info;
|
|
|
|
/*
 * Checksums for a contiguous range of data blocks, queued on an ordered
 * extent (via @list) for insertion once the extent's IO completes (see
 * btrfs_ordered_extent::list below).
 */
struct btrfs_ordered_sum {
	/*
	 * Logical start address and length of the blocks covered by
	 * the sums array.
	 */
	u64 logical;
	u32 len;

	/* Link into the owning ordered extent's checksum list. */
	struct list_head list;

	/* Last field is a variable length array of csums. */
	u8 sums[];
};
|
|
|
|
/*
 * Bits for btrfs_ordered_extent::flags.
 *
 * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
 * It is used to make sure metadata is inserted into the tree only once
 * per extent.
 *
 * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
 * rbtree, just before waking any waiters. It is used to indicate the
 * IO is done and any metadata is inserted into the tree.
 */
enum {
	/*
	 * Different types for ordered extents, one and only one of the 4 types
	 * need to be set when creating ordered extent.
	 *
	 * REGULAR:    For regular non-compressed COW write
	 * NOCOW:      For NOCOW write into existing non-hole extent
	 * PREALLOC:   For NOCOW write into preallocated extent
	 * COMPRESSED: For compressed COW write
	 */
	BTRFS_ORDERED_REGULAR,
	BTRFS_ORDERED_NOCOW,
	BTRFS_ORDERED_PREALLOC,
	BTRFS_ORDERED_COMPRESSED,

	/*
	 * Extra bit for direct io, can only be set for
	 * REGULAR/NOCOW/PREALLOC. No direct io for compressed extent.
	 */
	BTRFS_ORDERED_DIRECT,

	/* Extra status bits for ordered extents. */

	/* Set when all the pages are written. */
	BTRFS_ORDERED_IO_DONE,
	/* Set when removed from the tree. */
	BTRFS_ORDERED_COMPLETE,
	/* We had an io error when writing this out. */
	BTRFS_ORDERED_IOERR,
	/* Set when we have to truncate an extent. */
	BTRFS_ORDERED_TRUNCATED,
	/* Used during fsync to track already logged extents. */
	BTRFS_ORDERED_LOGGED,
	/* We have already logged all the csums of the ordered extent. */
	BTRFS_ORDERED_LOGGED_CSUM,
	/* We wait for this extent to complete in the current transaction. */
	BTRFS_ORDERED_PENDING,
	/* BTRFS_IOC_ENCODED_WRITE */
	BTRFS_ORDERED_ENCODED,
};
|
|
|
|
/*
 * BTRFS_ORDERED_* flags that specify the type of the extent (as opposed to
 * the status bits); mask built from the type members of the enum above.
 */
#define BTRFS_ORDERED_TYPE_FLAGS ((1UL << BTRFS_ORDERED_REGULAR) | \
				  (1UL << BTRFS_ORDERED_NOCOW) | \
				  (1UL << BTRFS_ORDERED_PREALLOC) | \
				  (1UL << BTRFS_ORDERED_COMPRESSED) | \
				  (1UL << BTRFS_ORDERED_DIRECT) | \
				  (1UL << BTRFS_ORDERED_ENCODED))
|
|
|
|
/*
 * In-memory record of a file range with in-flight ordered IO.  Entries live
 * in an rbtree (see @rb_node) and are removed once the IO is done and the
 * related metadata has been inserted (see the flag comments above).
 */
struct btrfs_ordered_extent {
	/* Logical offset in the file. */
	u64 file_offset;

	/*
	 * These fields directly correspond to the same fields in
	 * btrfs_file_extent_item.
	 */
	u64 num_bytes;
	u64 ram_bytes;
	u64 disk_bytenr;
	u64 disk_num_bytes;
	u64 offset;

	/* Number of bytes that still need writing. */
	u64 bytes_left;

	/*
	 * If we get truncated we need to adjust the file extent we enter for
	 * this ordered extent so that we do not expose stale data.
	 */
	u64 truncated_len;

	/* Flags (described above). */
	unsigned long flags;

	/* Compression algorithm. */
	int compress_type;

	/* Qgroup reserved space. */
	int qgroup_rsv;

	/* Reference count. */
	refcount_t refs;

	/* The inode we belong to. */
	struct btrfs_inode *inode;

	/* List of checksums for insertion when the extent io is done. */
	struct list_head list;

	/* Used for fast fsyncs. */
	struct list_head log_list;

	/* Used to wait for the BTRFS_ORDERED_COMPLETE bit. */
	wait_queue_head_t wait;

	/* Our friendly rbtree entry. */
	struct rb_node rb_node;

	/* A per root list of all the pending ordered extents. */
	struct list_head root_extent_list;

	struct btrfs_work work;

	struct completion completion;
	struct btrfs_work flush_work;
	struct list_head work_list;

	struct list_head bioc_list;
};
|
|
|
|
/* IO-completion side of the ordered extent machinery. */
int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent);
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);

/* Drop a reference on @entry (see btrfs_ordered_extent::refs). */
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
				 struct btrfs_ordered_extent *entry);
/*
 * NOTE(review): @uptodate presumably reports whether the finished IO for
 * [file_offset, file_offset + len) succeeded — confirm in ordered-data.c.
 */
void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
				 struct folio *folio, u64 file_offset, u64 len,
				 bool uptodate);
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
				    struct folio *folio, u64 file_offset,
				    u64 num_bytes, bool uptodate);
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
				    struct btrfs_ordered_extent **cached,
				    u64 file_offset, u64 io_size);
|
|
|
/*
 * This represents details about the target file extent item of a write
 * operation.  The fields mirror the identically named members of
 * struct btrfs_ordered_extent above, which in turn correspond to
 * btrfs_file_extent_item.
 */
struct btrfs_file_extent {
	u64 disk_bytenr;
	u64 disk_num_bytes;
	u64 num_bytes;
	u64 ram_bytes;
	u64 offset;
	u8 compression;
};
|
|
|
|
/* Creation / lookup / start side of the ordered extent machinery. */
struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
			struct btrfs_inode *inode, u64 file_offset,
			const struct btrfs_file_extent *file_extent, unsigned long flags);
/* Queue @sum on @entry for insertion when the extent IO completes. */
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
							 u64 file_offset);
/*
 * Start @entry, but never trigger writeback for the (folio-locked) range
 * [nowriteback_start, nowriteback_start + nowriteback_len) — this avoids
 * the read-path deadlock where flushing would need a folio lock the
 * caller already holds.
 */
void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry,
					    u64 nowriteback_start, u32 nowriteback_len);
|
|
/*
 * Start the ordered extent with no range exempted from writeback.
 *
 * Thin wrapper around btrfs_start_ordered_extent_nowriteback() passing an
 * empty (start 0, len 0) "no writeback" range.
 *
 * Note: the previous `return func();` form returned a void expression from a
 * void function, which is a constraint violation in strict ISO C (C11
 * 6.8.6.4p1); a plain call is equivalent and conforming.
 */
static inline void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
{
	btrfs_start_ordered_extent_nowriteback(entry, 0, 0);
}
|
|
|
|
/* Waiting / range-locking side of the ordered extent machinery. */
int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
		struct btrfs_inode *inode, u64 file_offset, u64 len);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
		struct btrfs_inode *inode,
		u64 file_offset,
		u64 len);
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
					   struct list_head *list);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
			       const struct btrfs_block_group *bg);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
			      const struct btrfs_block_group *bg);
/*
 * Lock [start, end] and flush/wait any ordered extents in the range.
 * NOTE(review): must not be used when the caller already holds folio locks
 * in the range — the flush can deadlock on those locks (see the commit
 * message above introducing the nowriteback variant).
 */
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
					u64 end,
					struct extent_state **cached_state);
bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
				  struct extent_state **cached_state);
struct btrfs_ordered_extent *btrfs_split_ordered_extent(
			struct btrfs_ordered_extent *ordered, u64 len);
void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered);
/* Module init/exit for the ordered data subsystem. */
int __init ordered_data_init(void);
void __cold ordered_data_exit(void);
|
|
|
|
#endif
|