mirror of
				git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
				synced 2025-09-18 22:14:16 +00:00 
			
		
		
		
	for-6.13-tag
-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmc0zT4ACgkQxWXV+ddt
 WDtThRAAhzSSiHcJqTfCL5nHh7w85MNEVw28o1ETgXSYJmx0JOWLE7Znlp2FV7jj
 IbYkFfF2gXJzYvRZkcXB/TAHV9KJG5yZIBZfccbM+9db9f8xkImVKMuqQRXPU41R
 ppSCmqZTeujtt8ucsaJkMpm6pzECKJCJaGOsMJ8fiqKpo89dKO3eGAVboSbpPF4C
 r0YmppiBwSP/cCXQCqWxZRbqPGN+lUgZpIGNRi157kehfmRHlVVJTO1pgqK8PCXb
 uIT09Kulppfez8+1A10CPcniDTyinLik/qLTNlzdWoDBL4iNJMg0A0wsA04AJVf0
 PdOS0REusiv3QcEIO6PefuRFRRfXcSLPpPDUceltJT5O0uM2gUqf2C7dEHXUGU3o
 TdgYlbQpsJWpZ7VGWQDZeGGV04lOPQvu0LGLPgEerUQd5H9ABa0dX8Fn0sPhKsa8
 whpAcdfE4rdNxB2OJFnqQeFq0z3cSjP/rvKlluCmAj97QYI+kiu3QyhemcT1YSC9
 U7n5Ya9IzIYCN3ml54q3hEgyD0IVGGG20GuUmqC9XSP9mrQRC8I1g7v26AiOTrrk
 VhgSdtMmphDxXudifsnYMaQ0Z1QqiUrW1SM/prAEOnBYCo75+HDsTgrq9ithgHoI
 4xz4YXJyMRs18qfTJctXC1wmGuz5plTdQrwarHdNsELN5HEyqX4=
 =aAcf
 -----END PGP SIGNATURE-----
Merge tag 'for-6.13-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
 "Changes outside of btrfs: add io_uring command flag to track a dying
  task (the rest will go via the block git tree).
  User visible changes:
   - wire encoded read (ioctl) to io_uring commands; this can be used on
     its own, and in the future it will allow 'send' to be asynchronous.
     As a consequence, the encoded read ioctl can also work in
     non-blocking mode
   - new ioctl to wait for cleaned subvolumes, no need to use the
     generic and root-only SEARCH_TREE ioctl, will be used by "btrfs
     subvol sync"
   - recognize different paths/symlinks for the same devices and don't
     report them during rescanning, this can be observed with LVM or DM
   - seeding device use case change, the sprout device (the one
     capturing new writes) will not clear the read-only status of the
     super block; this prevents accumulating space from deleted
     snapshots
  Performance improvements:
   - reduce lock contention when traversing extent buffers
   - reduce extent tree lock contention when searching for inline
     backref
   - switch from rb-trees to xarray for delayed ref tracking,
     improvements due to better cache locality, branching factors and
     more compact data structures
   - enable extent map shrinker again (prevent memory exhaustion under
     some types of IO load), reworked to run in a single worker thread
     (there used to be problems causing long stalls under memory
     pressure)
  Core changes:
   - raid-stripe-tree feature updates:
       - make device replace and scrub work
       - implement partial deletion of stripe extents
       - new selftests
   - split the config option BTRFS_DEBUG and add EXPERIMENTAL for
     features that are experimental or with known problems so we don't
     misuse debugging config for that
   - subpage mode updates (sector < page):
       - update compression implementations
       - update writepage, writeback
   - continued folio API conversions:
       - buffered writes
   - make buffered write copy one page at a time; this is preparatory work
     for future integration with large folios and may cause a performance drop
   - proper locking of root item regarding starting send
   - error handling improvements
   - code cleanups and refactoring:
       - dead code removal
       - unused parameter reduction
       - lockdep assertions"
* tag 'for-6.13-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (119 commits)
  btrfs: send: check for read-only send root under critical section
  btrfs: send: check for dead send root under critical section
  btrfs: remove check for NULL fs_info at btrfs_folio_end_lock_bitmap()
  btrfs: fix warning on PTR_ERR() against NULL device at btrfs_control_ioctl()
  btrfs: fix a typo in btrfs_use_zone_append
  btrfs: avoid superfluous calls to free_extent_map() in btrfs_encoded_read()
  btrfs: simplify logic to decrement snapshot counter at btrfs_mksnapshot()
  btrfs: remove hole from struct btrfs_delayed_node
  btrfs: update stale comment for struct btrfs_delayed_ref_node::add_list
  btrfs: add new ioctl to wait for cleaned subvolumes
  btrfs: simplify range tracking in cow_file_range()
  btrfs: remove conditional path allocation in btrfs_read_locked_inode()
  btrfs: push cleanup into btrfs_read_locked_inode()
  io_uring/cmd: let cmds to know about dying task
  btrfs: add struct io_btrfs_cmd as type for io_uring_cmd_to_pdu()
  btrfs: add io_uring command for encoded reads (ENCODED_READ ioctl)
  btrfs: move priv off stack in btrfs_encoded_read_regular_fill_pages()
  btrfs: don't sleep in btrfs_encoded_read() if IOCB_NOWAIT is set
  btrfs: change btrfs_encoded_read() so that reading of extent is done by caller
  btrfs: remove pointless iocb::ki_pos addition in btrfs_encoded_read()
  ...
			
			
This commit is contained in:
		
						commit
						c14a8a4c04
					
				
					 67 changed files with 2470 additions and 1432 deletions
				
			
		|  | @ -78,6 +78,32 @@ config BTRFS_ASSERT | |||
| 
 | ||||
| 	  If unsure, say N. | ||||
| 
 | ||||
| config BTRFS_EXPERIMENTAL | ||||
| 	bool "Btrfs experimental features" | ||||
| 	depends on BTRFS_FS | ||||
| 	default n | ||||
| 	help | ||||
| 	  Enable experimental features.  These features may not be stable enough | ||||
| 	  for end users.  This is meant for btrfs developers or users who wish | ||||
| 	  to test the functionality and report problems. | ||||
| 
 | ||||
| 	  Current list: | ||||
| 
 | ||||
| 	  - extent map shrinker - performance problems with too frequent shrinks | ||||
| 
 | ||||
| 	  - send stream protocol v3 - fs-verity support | ||||
| 
 | ||||
| 	  - checksum offload mode - sysfs knob to affect when checksums are | ||||
| 	                            calculated (at IO time, or in a thread) | ||||
| 
 | ||||
| 	  - raid-stripe-tree - additional mapping of extents to devices to | ||||
| 			       support RAID1* profiles on zoned devices, | ||||
| 			       RAID56 not yet supported | ||||
| 
 | ||||
| 	  - extent tree v2 - complex rework of extent tracking | ||||
| 
 | ||||
| 	  If unsure, say N. | ||||
| 
 | ||||
| config BTRFS_FS_REF_VERIFY | ||||
| 	bool "Btrfs with the ref verify tool compiled in" | ||||
| 	depends on BTRFS_FS | ||||
|  |  | |||
|  | @ -43,4 +43,5 @@ btrfs-$(CONFIG_FS_VERITY) += verity.o | |||
| btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
 | ||||
| 	tests/extent-buffer-tests.o tests/btrfs-tests.o \
 | ||||
| 	tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
 | ||||
| 	tests/free-space-tree-tests.o tests/extent-map-tests.o | ||||
| 	tests/free-space-tree-tests.o tests/extent-map-tests.o \
 | ||||
| 	tests/raid-stripe-tree-tests.o | ||||
|  |  | |||
|  | @ -1442,7 +1442,8 @@ again: | |||
| 		 */ | ||||
| 		delayed_refs = &ctx->trans->transaction->delayed_refs; | ||||
| 		spin_lock(&delayed_refs->lock); | ||||
| 		head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr); | ||||
| 		head = btrfs_find_delayed_ref_head(ctx->fs_info, delayed_refs, | ||||
| 						   ctx->bytenr); | ||||
| 		if (head) { | ||||
| 			if (!mutex_trylock(&head->mutex)) { | ||||
| 				refcount_inc(&head->refs); | ||||
|  |  | |||
|  | @ -587,7 +587,7 @@ static bool should_async_write(struct btrfs_bio *bbio) | |||
| { | ||||
| 	bool auto_csum_mode = true; | ||||
| 
 | ||||
| #ifdef CONFIG_BTRFS_DEBUG | ||||
| #ifdef CONFIG_BTRFS_EXPERIMENTAL | ||||
| 	struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices; | ||||
| 	enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); | ||||
| 
 | ||||
|  |  | |||
|  | @ -2797,7 +2797,7 @@ next: | |||
| 		 * uncompressed data size, because the compression is only done | ||||
| 		 * when writeback triggered and we don't know how much space we | ||||
| 		 * are actually going to need, so we reserve the uncompressed | ||||
| 		 * size because the data may be uncompressible in the worst case. | ||||
| 		 * size because the data may be incompressible in the worst case. | ||||
| 		 */ | ||||
| 		if (ret == 0) { | ||||
| 			bool used; | ||||
|  |  | |||
|  | @ -577,7 +577,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state | |||
| 				 struct extent_state *other); | ||||
| void btrfs_split_delalloc_extent(struct btrfs_inode *inode, | ||||
| 				 struct extent_state *orig, u64 split); | ||||
| void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); | ||||
| void btrfs_evict_inode(struct inode *inode); | ||||
| struct inode *btrfs_alloc_inode(struct super_block *sb); | ||||
| void btrfs_destroy_inode(struct inode *inode); | ||||
|  | @ -613,11 +612,17 @@ int btrfs_writepage_cow_fixup(struct folio *folio); | |||
| int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, | ||||
| 					     int compress_type); | ||||
| int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, | ||||
| 					  u64 file_offset, u64 disk_bytenr, | ||||
| 					  u64 disk_io_size, | ||||
| 					  struct page **pages); | ||||
| 					  u64 disk_bytenr, u64 disk_io_size, | ||||
| 					  struct page **pages, void *uring_ctx); | ||||
| ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, | ||||
| 			   struct btrfs_ioctl_encoded_io_args *encoded); | ||||
| 			   struct btrfs_ioctl_encoded_io_args *encoded, | ||||
| 			   struct extent_state **cached_state, | ||||
| 			   u64 *disk_bytenr, u64 *disk_io_size); | ||||
| ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, | ||||
| 				   u64 start, u64 lockend, | ||||
| 				   struct extent_state **cached_state, | ||||
| 				   u64 disk_bytenr, u64 disk_io_size, | ||||
| 				   size_t count, bool compressed, bool *unlocked); | ||||
| ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, | ||||
| 			       const struct btrfs_ioctl_encoded_io_args *encoded); | ||||
| 
 | ||||
|  |  | |||
|  | @ -453,7 +453,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, | |||
| 		if (pg_index > end_index) | ||||
| 			break; | ||||
| 
 | ||||
| 		folio = __filemap_get_folio(mapping, pg_index, 0, 0); | ||||
| 		folio = filemap_get_folio(mapping, pg_index); | ||||
| 		if (!IS_ERR(folio)) { | ||||
| 			u64 folio_sz = folio_size(folio); | ||||
| 			u64 offset = offset_in_folio(folio, cur); | ||||
|  | @ -545,8 +545,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, | |||
| 		 * subpage::readers and to unlock the page. | ||||
| 		 */ | ||||
| 		if (fs_info->sectorsize < PAGE_SIZE) | ||||
| 			btrfs_subpage_start_reader(fs_info, folio, cur, | ||||
| 						   add_size); | ||||
| 			btrfs_folio_set_lock(fs_info, folio, cur, add_size); | ||||
| 		folio_put(folio); | ||||
| 		cur += add_size; | ||||
| 	} | ||||
|  | @ -702,7 +701,7 @@ static void free_heuristic_ws(struct list_head *ws) | |||
| 	kfree(workspace); | ||||
| } | ||||
| 
 | ||||
| static struct list_head *alloc_heuristic_ws(unsigned int level) | ||||
| static struct list_head *alloc_heuristic_ws(void) | ||||
| { | ||||
| 	struct heuristic_ws *ws; | ||||
| 
 | ||||
|  | @ -744,9 +743,9 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { | |||
| static struct list_head *alloc_workspace(int type, unsigned int level) | ||||
| { | ||||
| 	switch (type) { | ||||
| 	case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level); | ||||
| 	case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(); | ||||
| 	case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level); | ||||
| 	case BTRFS_COMPRESS_LZO:  return lzo_alloc_workspace(level); | ||||
| 	case BTRFS_COMPRESS_LZO:  return lzo_alloc_workspace(); | ||||
| 	case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level); | ||||
| 	default: | ||||
| 		/*
 | ||||
|  | @ -1030,6 +1029,7 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping | |||
| { | ||||
| 	int type = btrfs_compress_type(type_level); | ||||
| 	int level = btrfs_compress_level(type_level); | ||||
| 	const unsigned long orig_len = *total_out; | ||||
| 	struct list_head *workspace; | ||||
| 	int ret; | ||||
| 
 | ||||
|  | @ -1037,6 +1037,8 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping | |||
| 	workspace = get_workspace(type, level); | ||||
| 	ret = compression_compress_pages(type, workspace, mapping, start, folios, | ||||
| 					 out_folios, total_in, total_out); | ||||
| 	/* The total read-in bytes should be no larger than the input. */ | ||||
| 	ASSERT(*total_in <= orig_len); | ||||
| 	put_workspace(type, workspace); | ||||
| 	return ret; | ||||
| } | ||||
|  |  | |||
|  | @ -175,7 +175,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); | |||
| int lzo_decompress(struct list_head *ws, const u8 *data_in, | ||||
| 		struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, | ||||
| 		size_t destlen); | ||||
| struct list_head *lzo_alloc_workspace(unsigned int level); | ||||
| struct list_head *lzo_alloc_workspace(void); | ||||
| void lzo_free_workspace(struct list_head *ws); | ||||
| 
 | ||||
| int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, | ||||
|  |  | |||
							
								
								
									
										148
									
								
								fs/btrfs/ctree.c
									
										
									
									
									
								
							
							
						
						
									
										148
									
								
								fs/btrfs/ctree.c
									
										
									
									
									
								
							|  | @ -1508,26 +1508,26 @@ static noinline void unlock_up(struct btrfs_path *path, int level, | |||
|  */ | ||||
| static int | ||||
| read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, | ||||
| 		      struct extent_buffer **eb_ret, int level, int slot, | ||||
| 		      struct extent_buffer **eb_ret, int slot, | ||||
| 		      const struct btrfs_key *key) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = root->fs_info; | ||||
| 	struct btrfs_tree_parent_check check = { 0 }; | ||||
| 	u64 blocknr; | ||||
| 	u64 gen; | ||||
| 	struct extent_buffer *tmp; | ||||
| 	int ret; | ||||
| 	struct extent_buffer *tmp = NULL; | ||||
| 	int ret = 0; | ||||
| 	int parent_level; | ||||
| 	bool unlock_up; | ||||
| 	int err; | ||||
| 	bool read_tmp = false; | ||||
| 	bool tmp_locked = false; | ||||
| 	bool path_released = false; | ||||
| 
 | ||||
| 	unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); | ||||
| 	blocknr = btrfs_node_blockptr(*eb_ret, slot); | ||||
| 	gen = btrfs_node_ptr_generation(*eb_ret, slot); | ||||
| 	parent_level = btrfs_header_level(*eb_ret); | ||||
| 	btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); | ||||
| 	check.has_first_key = true; | ||||
| 	check.level = parent_level - 1; | ||||
| 	check.transid = gen; | ||||
| 	check.transid = btrfs_node_ptr_generation(*eb_ret, slot); | ||||
| 	check.owner_root = btrfs_root_id(root); | ||||
| 
 | ||||
| 	/*
 | ||||
|  | @ -1540,80 +1540,116 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, | |||
| 	tmp = find_extent_buffer(fs_info, blocknr); | ||||
| 	if (tmp) { | ||||
| 		if (p->reada == READA_FORWARD_ALWAYS) | ||||
| 			reada_for_search(fs_info, p, level, slot, key->objectid); | ||||
| 			reada_for_search(fs_info, p, parent_level, slot, key->objectid); | ||||
| 
 | ||||
| 		/* first we do an atomic uptodate check */ | ||||
| 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { | ||||
| 		if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) { | ||||
| 			/*
 | ||||
| 			 * Do extra check for first_key, eb can be stale due to | ||||
| 			 * being cached, read from scrub, or have multiple | ||||
| 			 * parents (shared tree blocks). | ||||
| 			 */ | ||||
| 			if (btrfs_verify_level_key(tmp, | ||||
| 					parent_level - 1, &check.first_key, gen)) { | ||||
| 				free_extent_buffer(tmp); | ||||
| 				return -EUCLEAN; | ||||
| 			if (btrfs_verify_level_key(tmp, &check)) { | ||||
| 				ret = -EUCLEAN; | ||||
| 				goto out; | ||||
| 			} | ||||
| 			*eb_ret = tmp; | ||||
| 			return 0; | ||||
| 			tmp = NULL; | ||||
| 			ret = 0; | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		if (p->nowait) { | ||||
| 			free_extent_buffer(tmp); | ||||
| 			return -EAGAIN; | ||||
| 		} | ||||
| 
 | ||||
| 		if (unlock_up) | ||||
| 			btrfs_unlock_up_safe(p, level + 1); | ||||
| 
 | ||||
| 		/* now we're allowed to do a blocking uptodate check */ | ||||
| 		ret = btrfs_read_extent_buffer(tmp, &check); | ||||
| 		if (ret) { | ||||
| 			free_extent_buffer(tmp); | ||||
| 			btrfs_release_path(p); | ||||
| 			return ret; | ||||
| 		} | ||||
| 
 | ||||
| 		if (unlock_up) | ||||
| 			ret = -EAGAIN; | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		if (!p->skip_locking) { | ||||
| 			btrfs_unlock_up_safe(p, parent_level + 1); | ||||
| 			tmp_locked = true; | ||||
| 			btrfs_tree_read_lock(tmp); | ||||
| 			btrfs_release_path(p); | ||||
| 			ret = -EAGAIN; | ||||
| 			path_released = true; | ||||
| 		} | ||||
| 
 | ||||
| 		/* Now we're allowed to do a blocking uptodate check. */ | ||||
| 		err = btrfs_read_extent_buffer(tmp, &check); | ||||
| 		if (err) { | ||||
| 			ret = err; | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		if (ret == 0) { | ||||
| 			ASSERT(!tmp_locked); | ||||
| 			*eb_ret = tmp; | ||||
| 			tmp = NULL; | ||||
| 		} | ||||
| 		goto out; | ||||
| 	} else if (p->nowait) { | ||||
| 		return -EAGAIN; | ||||
| 		ret = -EAGAIN; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (unlock_up) { | ||||
| 		btrfs_unlock_up_safe(p, level + 1); | ||||
| 	if (!p->skip_locking) { | ||||
| 		btrfs_unlock_up_safe(p, parent_level + 1); | ||||
| 		ret = -EAGAIN; | ||||
| 	} else { | ||||
| 		ret = 0; | ||||
| 	} | ||||
| 
 | ||||
| 	if (p->reada != READA_NONE) | ||||
| 		reada_for_search(fs_info, p, level, slot, key->objectid); | ||||
| 		reada_for_search(fs_info, p, parent_level, slot, key->objectid); | ||||
| 
 | ||||
| 	tmp = read_tree_block(fs_info, blocknr, &check); | ||||
| 	tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level); | ||||
| 	if (IS_ERR(tmp)) { | ||||
| 		btrfs_release_path(p); | ||||
| 		return PTR_ERR(tmp); | ||||
| 		ret = PTR_ERR(tmp); | ||||
| 		tmp = NULL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 	read_tmp = true; | ||||
| 
 | ||||
| 	if (!p->skip_locking) { | ||||
| 		ASSERT(ret == -EAGAIN); | ||||
| 		tmp_locked = true; | ||||
| 		btrfs_tree_read_lock(tmp); | ||||
| 		btrfs_release_path(p); | ||||
| 		path_released = true; | ||||
| 	} | ||||
| 
 | ||||
| 	/* Now we're allowed to do a blocking uptodate check. */ | ||||
| 	err = btrfs_read_extent_buffer(tmp, &check); | ||||
| 	if (err) { | ||||
| 		ret = err; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If the read above didn't mark this buffer up to date, | ||||
| 	 * it will never end up being up to date.  Set ret to EIO now | ||||
| 	 * and give up so that our caller doesn't loop forever | ||||
| 	 * on our EAGAINs. | ||||
| 	 */ | ||||
| 	if (!extent_buffer_uptodate(tmp)) | ||||
| 	if (!extent_buffer_uptodate(tmp)) { | ||||
| 		ret = -EIO; | ||||
| 
 | ||||
| out: | ||||
| 	if (ret == 0) { | ||||
| 		*eb_ret = tmp; | ||||
| 	} else { | ||||
| 		free_extent_buffer(tmp); | ||||
| 		btrfs_release_path(p); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (ret == 0) { | ||||
| 		ASSERT(!tmp_locked); | ||||
| 		*eb_ret = tmp; | ||||
| 		tmp = NULL; | ||||
| 	} | ||||
| out: | ||||
| 	if (tmp) { | ||||
| 		if (tmp_locked) | ||||
| 			btrfs_tree_read_unlock(tmp); | ||||
| 		if (read_tmp && ret && ret != -EAGAIN) | ||||
| 			free_extent_buffer_stale(tmp); | ||||
| 		else | ||||
| 			free_extent_buffer(tmp); | ||||
| 	} | ||||
| 	if (ret && !path_released) | ||||
| 		btrfs_release_path(p); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
|  | @ -2197,8 +2233,8 @@ cow_done: | |||
| 			goto done; | ||||
| 		} | ||||
| 
 | ||||
| 		err = read_block_for_search(root, p, &b, level, slot, key); | ||||
| 		if (err == -EAGAIN) | ||||
| 		err = read_block_for_search(root, p, &b, slot, key); | ||||
| 		if (err == -EAGAIN && !p->nowait) | ||||
| 			goto again; | ||||
| 		if (err) { | ||||
| 			ret = err; | ||||
|  | @ -2324,8 +2360,8 @@ again: | |||
| 			goto done; | ||||
| 		} | ||||
| 
 | ||||
| 		err = read_block_for_search(root, p, &b, level, slot, key); | ||||
| 		if (err == -EAGAIN) | ||||
| 		err = read_block_for_search(root, p, &b, slot, key); | ||||
| 		if (err == -EAGAIN && !p->nowait) | ||||
| 			goto again; | ||||
| 		if (err) { | ||||
| 			ret = err; | ||||
|  | @ -2334,7 +2370,7 @@ again: | |||
| 
 | ||||
| 		level = btrfs_header_level(b); | ||||
| 		btrfs_tree_read_lock(b); | ||||
| 		b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq); | ||||
| 		b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq); | ||||
| 		if (!b) { | ||||
| 			ret = -ENOMEM; | ||||
| 			goto done; | ||||
|  | @ -4930,8 +4966,7 @@ again: | |||
| 		} | ||||
| 
 | ||||
| 		next = c; | ||||
| 		ret = read_block_for_search(root, path, &next, level, | ||||
| 					    slot, &key); | ||||
| 		ret = read_block_for_search(root, path, &next, slot, &key); | ||||
| 		if (ret == -EAGAIN && !path->nowait) | ||||
| 			goto again; | ||||
| 
 | ||||
|  | @ -4974,8 +5009,7 @@ again: | |||
| 		if (!level) | ||||
| 			break; | ||||
| 
 | ||||
| 		ret = read_block_for_search(root, path, &next, level, | ||||
| 					    0, &key); | ||||
| 		ret = read_block_for_search(root, path, &next, 0, &key); | ||||
| 		if (ret == -EAGAIN && !path->nowait) | ||||
| 			goto again; | ||||
| 
 | ||||
|  |  | |||
|  | @ -64,9 +64,9 @@ struct btrfs_delayed_node { | |||
| 	struct mutex mutex; | ||||
| 	struct btrfs_inode_item inode_item; | ||||
| 	refcount_t refs; | ||||
| 	int count; | ||||
| 	u64 index_cnt; | ||||
| 	unsigned long flags; | ||||
| 	int count; | ||||
| 	/*
 | ||||
| 	 * The size of the next batch of dir index items to insert (if this | ||||
| 	 * node is from a directory inode). Protected by @mutex. | ||||
|  |  | |||
|  | @ -9,6 +9,7 @@ | |||
| #include "messages.h" | ||||
| #include "ctree.h" | ||||
| #include "delayed-ref.h" | ||||
| #include "extent-tree.h" | ||||
| #include "transaction.h" | ||||
| #include "qgroup.h" | ||||
| #include "space-info.h" | ||||
|  | @ -313,39 +314,6 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, | |||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /* insert a new ref to head ref rbtree */ | ||||
| static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root, | ||||
| 						   struct rb_node *node) | ||||
| { | ||||
| 	struct rb_node **p = &root->rb_root.rb_node; | ||||
| 	struct rb_node *parent_node = NULL; | ||||
| 	struct btrfs_delayed_ref_head *entry; | ||||
| 	struct btrfs_delayed_ref_head *ins; | ||||
| 	u64 bytenr; | ||||
| 	bool leftmost = true; | ||||
| 
 | ||||
| 	ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); | ||||
| 	bytenr = ins->bytenr; | ||||
| 	while (*p) { | ||||
| 		parent_node = *p; | ||||
| 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, | ||||
| 				 href_node); | ||||
| 
 | ||||
| 		if (bytenr < entry->bytenr) { | ||||
| 			p = &(*p)->rb_left; | ||||
| 		} else if (bytenr > entry->bytenr) { | ||||
| 			p = &(*p)->rb_right; | ||||
| 			leftmost = false; | ||||
| 		} else { | ||||
| 			return entry; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	rb_link_node(node, parent_node, p); | ||||
| 	rb_insert_color_cached(node, root, leftmost); | ||||
| 	return NULL; | ||||
| } | ||||
| 
 | ||||
| static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, | ||||
| 		struct btrfs_delayed_ref_node *ins) | ||||
| { | ||||
|  | @ -380,75 +348,32 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, | |||
| static struct btrfs_delayed_ref_head *find_first_ref_head( | ||||
| 		struct btrfs_delayed_ref_root *dr) | ||||
| { | ||||
| 	struct rb_node *n; | ||||
| 	struct btrfs_delayed_ref_head *entry; | ||||
| 	unsigned long from = 0; | ||||
| 
 | ||||
| 	n = rb_first_cached(&dr->href_root); | ||||
| 	if (!n) | ||||
| 		return NULL; | ||||
| 	lockdep_assert_held(&dr->lock); | ||||
| 
 | ||||
| 	entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); | ||||
| 
 | ||||
| 	return entry; | ||||
| 	return xa_find(&dr->head_refs, &from, ULONG_MAX, XA_PRESENT); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Find a head entry based on bytenr. This returns the delayed ref head if it | ||||
|  * was able to find one, or NULL if nothing was in that spot.  If return_bigger | ||||
|  * is given, the next bigger entry is returned if no exact match is found. | ||||
|  */ | ||||
| static struct btrfs_delayed_ref_head *find_ref_head( | ||||
| 		struct btrfs_delayed_ref_root *dr, u64 bytenr, | ||||
| 		bool return_bigger) | ||||
| { | ||||
| 	struct rb_root *root = &dr->href_root.rb_root; | ||||
| 	struct rb_node *n; | ||||
| 	struct btrfs_delayed_ref_head *entry; | ||||
| 
 | ||||
| 	n = root->rb_node; | ||||
| 	entry = NULL; | ||||
| 	while (n) { | ||||
| 		entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); | ||||
| 
 | ||||
| 		if (bytenr < entry->bytenr) | ||||
| 			n = n->rb_left; | ||||
| 		else if (bytenr > entry->bytenr) | ||||
| 			n = n->rb_right; | ||||
| 		else | ||||
| 			return entry; | ||||
| 	} | ||||
| 	if (entry && return_bigger) { | ||||
| 		if (bytenr > entry->bytenr) { | ||||
| 			n = rb_next(&entry->href_node); | ||||
| 			if (!n) | ||||
| 				return NULL; | ||||
| 			entry = rb_entry(n, struct btrfs_delayed_ref_head, | ||||
| 					 href_node); | ||||
| 		} | ||||
| 		return entry; | ||||
| 	} | ||||
| 	return NULL; | ||||
| } | ||||
| 
 | ||||
| int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, | ||||
| 			   struct btrfs_delayed_ref_head *head) | ||||
| static bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, | ||||
| 				   struct btrfs_delayed_ref_head *head) | ||||
| { | ||||
| 	lockdep_assert_held(&delayed_refs->lock); | ||||
| 	if (mutex_trylock(&head->mutex)) | ||||
| 		return 0; | ||||
| 		return true; | ||||
| 
 | ||||
| 	refcount_inc(&head->refs); | ||||
| 	spin_unlock(&delayed_refs->lock); | ||||
| 
 | ||||
| 	mutex_lock(&head->mutex); | ||||
| 	spin_lock(&delayed_refs->lock); | ||||
| 	if (RB_EMPTY_NODE(&head->href_node)) { | ||||
| 	if (!head->tracked) { | ||||
| 		mutex_unlock(&head->mutex); | ||||
| 		btrfs_put_delayed_ref_head(head); | ||||
| 		return -EAGAIN; | ||||
| 		return false; | ||||
| 	} | ||||
| 	btrfs_put_delayed_ref_head(head); | ||||
| 	return 0; | ||||
| 	return true; | ||||
| } | ||||
| 
 | ||||
| static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, | ||||
|  | @ -462,7 +387,6 @@ static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, | |||
| 	if (!list_empty(&ref->add_list)) | ||||
| 		list_del(&ref->add_list); | ||||
| 	btrfs_put_delayed_ref(ref); | ||||
| 	atomic_dec(&delayed_refs->num_entries); | ||||
| 	btrfs_delayed_refs_rsv_release(fs_info, 1, 0); | ||||
| } | ||||
| 
 | ||||
|  | @ -558,33 +482,31 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) | |||
| } | ||||
| 
 | ||||
| struct btrfs_delayed_ref_head *btrfs_select_ref_head( | ||||
| 		const struct btrfs_fs_info *fs_info, | ||||
| 		struct btrfs_delayed_ref_root *delayed_refs) | ||||
| { | ||||
| 	struct btrfs_delayed_ref_head *head; | ||||
| 	unsigned long start_index; | ||||
| 	unsigned long found_index; | ||||
| 	bool found_head = false; | ||||
| 	bool locked; | ||||
| 
 | ||||
| 	lockdep_assert_held(&delayed_refs->lock); | ||||
| 	spin_lock(&delayed_refs->lock); | ||||
| again: | ||||
| 	head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, | ||||
| 			     true); | ||||
| 	if (!head && delayed_refs->run_delayed_start != 0) { | ||||
| 		delayed_refs->run_delayed_start = 0; | ||||
| 		head = find_first_ref_head(delayed_refs); | ||||
| 	} | ||||
| 	if (!head) | ||||
| 		return NULL; | ||||
| 
 | ||||
| 	while (head->processing) { | ||||
| 		struct rb_node *node; | ||||
| 
 | ||||
| 		node = rb_next(&head->href_node); | ||||
| 		if (!node) { | ||||
| 			if (delayed_refs->run_delayed_start == 0) | ||||
| 				return NULL; | ||||
| 			delayed_refs->run_delayed_start = 0; | ||||
| 			goto again; | ||||
| 	start_index = (delayed_refs->run_delayed_start >> fs_info->sectorsize_bits); | ||||
| 	xa_for_each_start(&delayed_refs->head_refs, found_index, head, start_index) { | ||||
| 		if (!head->processing) { | ||||
| 			found_head = true; | ||||
| 			break; | ||||
| 		} | ||||
| 		head = rb_entry(node, struct btrfs_delayed_ref_head, | ||||
| 				href_node); | ||||
| 	} | ||||
| 	if (!found_head) { | ||||
| 		if (delayed_refs->run_delayed_start == 0) { | ||||
| 			spin_unlock(&delayed_refs->lock); | ||||
| 			return NULL; | ||||
| 		} | ||||
| 		delayed_refs->run_delayed_start = 0; | ||||
| 		goto again; | ||||
| 	} | ||||
| 
 | ||||
| 	head->processing = true; | ||||
|  | @ -592,18 +514,42 @@ again: | |||
| 	delayed_refs->num_heads_ready--; | ||||
| 	delayed_refs->run_delayed_start = head->bytenr + | ||||
| 		head->num_bytes; | ||||
| 
 | ||||
| 	locked = btrfs_delayed_ref_lock(delayed_refs, head); | ||||
| 	spin_unlock(&delayed_refs->lock); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * We may have dropped the spin lock to get the head mutex lock, and | ||||
| 	 * that might have given someone else time to free the head.  If that's | ||||
| 	 * true, it has been removed from our list and we can move on. | ||||
| 	 */ | ||||
| 	if (!locked) | ||||
| 		return ERR_PTR(-EAGAIN); | ||||
| 
 | ||||
| 	return head; | ||||
| } | ||||
| 
 | ||||
| void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, | ||||
| void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, | ||||
| 			     struct btrfs_delayed_ref_head *head) | ||||
| { | ||||
| 	spin_lock(&delayed_refs->lock); | ||||
| 	head->processing = false; | ||||
| 	delayed_refs->num_heads_ready++; | ||||
| 	spin_unlock(&delayed_refs->lock); | ||||
| 	btrfs_delayed_ref_unlock(head); | ||||
| } | ||||
| 
 | ||||
| void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, | ||||
| 			   struct btrfs_delayed_ref_root *delayed_refs, | ||||
| 			   struct btrfs_delayed_ref_head *head) | ||||
| { | ||||
| 	const unsigned long index = (head->bytenr >> fs_info->sectorsize_bits); | ||||
| 
 | ||||
| 	lockdep_assert_held(&delayed_refs->lock); | ||||
| 	lockdep_assert_held(&head->lock); | ||||
| 
 | ||||
| 	rb_erase_cached(&head->href_node, &delayed_refs->href_root); | ||||
| 	RB_CLEAR_NODE(&head->href_node); | ||||
| 	atomic_dec(&delayed_refs->num_entries); | ||||
| 	xa_erase(&delayed_refs->head_refs, index); | ||||
| 	head->tracked = false; | ||||
| 	delayed_refs->num_heads--; | ||||
| 	if (!head->processing) | ||||
| 		delayed_refs->num_heads_ready--; | ||||
|  | @ -629,7 +575,6 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans, | |||
| 	if (!exist) { | ||||
| 		if (ref->action == BTRFS_ADD_DELAYED_REF) | ||||
| 			list_add_tail(&ref->add_list, &href->ref_add_list); | ||||
| 		atomic_inc(&root->num_entries); | ||||
| 		spin_unlock(&href->lock); | ||||
| 		trans->delayed_ref_updates++; | ||||
| 		return false; | ||||
|  | @ -813,7 +758,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, | |||
| 	head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID); | ||||
| 	head_ref->ref_tree = RB_ROOT_CACHED; | ||||
| 	INIT_LIST_HEAD(&head_ref->ref_add_list); | ||||
| 	RB_CLEAR_NODE(&head_ref->href_node); | ||||
| 	head_ref->tracked = false; | ||||
| 	head_ref->processing = false; | ||||
| 	head_ref->total_ref_mod = count_mod; | ||||
| 	spin_lock_init(&head_ref->lock); | ||||
|  | @ -830,7 +775,6 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, | |||
| 			qrecord->data_rsv = reserved; | ||||
| 			qrecord->data_rsv_refroot = generic_ref->ref_root; | ||||
| 		} | ||||
| 		qrecord->bytenr = generic_ref->bytenr; | ||||
| 		qrecord->num_bytes = generic_ref->num_bytes; | ||||
| 		qrecord->old_roots = NULL; | ||||
| 	} | ||||
|  | @ -852,19 +796,33 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, | |||
| 	struct btrfs_fs_info *fs_info = trans->fs_info; | ||||
| 	struct btrfs_delayed_ref_head *existing; | ||||
| 	struct btrfs_delayed_ref_root *delayed_refs; | ||||
| 	const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits); | ||||
| 	bool qrecord_inserted = false; | ||||
| 
 | ||||
| 	delayed_refs = &trans->transaction->delayed_refs; | ||||
| 	lockdep_assert_held(&delayed_refs->lock); | ||||
| 
 | ||||
| #if BITS_PER_LONG == 32 | ||||
| 	if (head_ref->bytenr >= MAX_LFS_FILESIZE) { | ||||
| 		if (qrecord) | ||||
| 			xa_release(&delayed_refs->dirty_extents, index); | ||||
| 		btrfs_err_rl(fs_info, | ||||
| "delayed ref head %llu is beyond 32bit page cache and xarray index limit", | ||||
| 			     head_ref->bytenr); | ||||
| 		btrfs_err_32bit_limit(fs_info); | ||||
| 		return ERR_PTR(-EOVERFLOW); | ||||
| 	} | ||||
| #endif | ||||
| 
 | ||||
| 	/* Record qgroup extent info if provided */ | ||||
| 	if (qrecord) { | ||||
| 		int ret; | ||||
| 
 | ||||
| 		ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord); | ||||
| 		ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord, | ||||
| 						       head_ref->bytenr); | ||||
| 		if (ret) { | ||||
| 			/* Clean up if insertion fails or item exists. */ | ||||
| 			xa_release(&delayed_refs->dirty_extents, | ||||
| 				   qrecord->bytenr >> fs_info->sectorsize_bits); | ||||
| 			xa_release(&delayed_refs->dirty_extents, index); | ||||
| 			/* Caller responsible for freeing qrecord on error. */ | ||||
| 			if (ret < 0) | ||||
| 				return ERR_PTR(ret); | ||||
|  | @ -876,8 +834,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, | |||
| 
 | ||||
| 	trace_add_delayed_ref_head(fs_info, head_ref, action); | ||||
| 
 | ||||
| 	existing = htree_insert(&delayed_refs->href_root, | ||||
| 				&head_ref->href_node); | ||||
| 	existing = xa_load(&delayed_refs->head_refs, index); | ||||
| 	if (existing) { | ||||
| 		update_existing_head_ref(trans, existing, head_ref); | ||||
| 		/*
 | ||||
|  | @ -887,6 +844,19 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, | |||
| 		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); | ||||
| 		head_ref = existing; | ||||
| 	} else { | ||||
| 		existing = xa_store(&delayed_refs->head_refs, index, head_ref, GFP_ATOMIC); | ||||
| 		if (xa_is_err(existing)) { | ||||
| 			/* Memory was preallocated by the caller. */ | ||||
| 			ASSERT(xa_err(existing) != -ENOMEM); | ||||
| 			return ERR_PTR(xa_err(existing)); | ||||
| 		} else if (WARN_ON(existing)) { | ||||
| 			/*
 | ||||
| 			 * Shouldn't happen we just did a lookup before under | ||||
| 			 * delayed_refs->lock. | ||||
| 			 */ | ||||
| 			return ERR_PTR(-EEXIST); | ||||
| 		} | ||||
| 		head_ref->tracked = true; | ||||
| 		/*
 | ||||
| 		 * We reserve the amount of bytes needed to delete csums when | ||||
| 		 * adding the ref head and not when adding individual drop refs | ||||
|  | @ -900,7 +870,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, | |||
| 		} | ||||
| 		delayed_refs->num_heads++; | ||||
| 		delayed_refs->num_heads_ready++; | ||||
| 		atomic_inc(&delayed_refs->num_entries); | ||||
| 	} | ||||
| 	if (qrecord_inserted_ret) | ||||
| 		*qrecord_inserted_ret = qrecord_inserted; | ||||
|  | @ -1008,6 +977,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, | |||
| 	struct btrfs_delayed_ref_head *new_head_ref; | ||||
| 	struct btrfs_delayed_ref_root *delayed_refs; | ||||
| 	struct btrfs_qgroup_extent_record *record = NULL; | ||||
| 	const unsigned long index = (generic_ref->bytenr >> fs_info->sectorsize_bits); | ||||
| 	bool qrecord_reserved = false; | ||||
| 	bool qrecord_inserted; | ||||
| 	int action = generic_ref->action; | ||||
| 	bool merged; | ||||
|  | @ -1023,25 +994,32 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, | |||
| 		goto free_node; | ||||
| 	} | ||||
| 
 | ||||
| 	delayed_refs = &trans->transaction->delayed_refs; | ||||
| 
 | ||||
| 	if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { | ||||
| 		record = kzalloc(sizeof(*record), GFP_NOFS); | ||||
| 		if (!record) { | ||||
| 			ret = -ENOMEM; | ||||
| 			goto free_head_ref; | ||||
| 		} | ||||
| 		if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, | ||||
| 			       generic_ref->bytenr >> fs_info->sectorsize_bits, | ||||
| 			       GFP_NOFS)) { | ||||
| 		if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) { | ||||
| 			ret = -ENOMEM; | ||||
| 			goto free_record; | ||||
| 		} | ||||
| 		qrecord_reserved = true; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS); | ||||
| 	if (ret) { | ||||
| 		if (qrecord_reserved) | ||||
| 			xa_release(&delayed_refs->dirty_extents, index); | ||||
| 		goto free_record; | ||||
| 	} | ||||
| 
 | ||||
| 	init_delayed_ref_common(fs_info, node, generic_ref); | ||||
| 	init_delayed_ref_head(head_ref, generic_ref, record, reserved); | ||||
| 	head_ref->extent_op = extent_op; | ||||
| 
 | ||||
| 	delayed_refs = &trans->transaction->delayed_refs; | ||||
| 	spin_lock(&delayed_refs->lock); | ||||
| 
 | ||||
| 	/*
 | ||||
|  | @ -1051,6 +1029,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, | |||
| 	new_head_ref = add_delayed_ref_head(trans, head_ref, record, | ||||
| 					    action, &qrecord_inserted); | ||||
| 	if (IS_ERR(new_head_ref)) { | ||||
| 		xa_release(&delayed_refs->head_refs, index); | ||||
| 		spin_unlock(&delayed_refs->lock); | ||||
| 		ret = PTR_ERR(new_head_ref); | ||||
| 		goto free_record; | ||||
|  | @ -1074,7 +1053,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, | |||
| 		kmem_cache_free(btrfs_delayed_ref_node_cachep, node); | ||||
| 
 | ||||
| 	if (qrecord_inserted) | ||||
| 		return btrfs_qgroup_trace_extent_post(trans, record); | ||||
| 		return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr); | ||||
| 	return 0; | ||||
| 
 | ||||
| free_record: | ||||
|  | @ -1113,6 +1092,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, | |||
| 				u64 bytenr, u64 num_bytes, u8 level, | ||||
| 				struct btrfs_delayed_extent_op *extent_op) | ||||
| { | ||||
| 	const unsigned long index = (bytenr >> trans->fs_info->sectorsize_bits); | ||||
| 	struct btrfs_delayed_ref_head *head_ref; | ||||
| 	struct btrfs_delayed_ref_head *head_ref_ret; | ||||
| 	struct btrfs_delayed_ref_root *delayed_refs; | ||||
|  | @ -1123,6 +1103,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, | |||
| 		.num_bytes = num_bytes, | ||||
| 		.tree_ref.level = level, | ||||
| 	}; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); | ||||
| 	if (!head_ref) | ||||
|  | @ -1132,16 +1113,23 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, | |||
| 	head_ref->extent_op = extent_op; | ||||
| 
 | ||||
| 	delayed_refs = &trans->transaction->delayed_refs; | ||||
| 	spin_lock(&delayed_refs->lock); | ||||
| 
 | ||||
| 	ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS); | ||||
| 	if (ret) { | ||||
| 		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); | ||||
| 		return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	spin_lock(&delayed_refs->lock); | ||||
| 	head_ref_ret = add_delayed_ref_head(trans, head_ref, NULL, | ||||
| 					    BTRFS_UPDATE_DELAYED_HEAD, NULL); | ||||
| 	spin_unlock(&delayed_refs->lock); | ||||
| 
 | ||||
| 	if (IS_ERR(head_ref_ret)) { | ||||
| 		xa_release(&delayed_refs->head_refs, index); | ||||
| 		spin_unlock(&delayed_refs->lock); | ||||
| 		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); | ||||
| 		return PTR_ERR(head_ref_ret); | ||||
| 	} | ||||
| 	spin_unlock(&delayed_refs->lock); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Need to update the delayed_refs_rsv with any changes we may have | ||||
|  | @ -1164,11 +1152,15 @@ void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) | |||
|  * head node if found, or NULL if not. | ||||
|  */ | ||||
| struct btrfs_delayed_ref_head * | ||||
| btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) | ||||
| btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info, | ||||
| 			    struct btrfs_delayed_ref_root *delayed_refs, | ||||
| 			    u64 bytenr) | ||||
| { | ||||
| 	const unsigned long index = (bytenr >> fs_info->sectorsize_bits); | ||||
| 
 | ||||
| 	lockdep_assert_held(&delayed_refs->lock); | ||||
| 
 | ||||
| 	return find_ref_head(delayed_refs, bytenr, false); | ||||
| 	return xa_load(&delayed_refs->head_refs, index); | ||||
| } | ||||
| 
 | ||||
| static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent) | ||||
|  | @ -1238,6 +1230,81 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, | |||
| 	return found; | ||||
| } | ||||
| 
 | ||||
| void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) | ||||
| { | ||||
| 	struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; | ||||
| 	struct btrfs_fs_info *fs_info = trans->fs_info; | ||||
| 
 | ||||
| 	spin_lock(&delayed_refs->lock); | ||||
| 	while (true) { | ||||
| 		struct btrfs_delayed_ref_head *head; | ||||
| 		struct rb_node *n; | ||||
| 		bool pin_bytes = false; | ||||
| 
 | ||||
| 		head = find_first_ref_head(delayed_refs); | ||||
| 		if (!head) | ||||
| 			break; | ||||
| 
 | ||||
| 		if (!btrfs_delayed_ref_lock(delayed_refs, head)) | ||||
| 			continue; | ||||
| 
 | ||||
| 		spin_lock(&head->lock); | ||||
| 		while ((n = rb_first_cached(&head->ref_tree)) != NULL) { | ||||
| 			struct btrfs_delayed_ref_node *ref; | ||||
| 
 | ||||
| 			ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); | ||||
| 			drop_delayed_ref(fs_info, delayed_refs, head, ref); | ||||
| 		} | ||||
| 		if (head->must_insert_reserved) | ||||
| 			pin_bytes = true; | ||||
| 		btrfs_free_delayed_extent_op(head->extent_op); | ||||
| 		btrfs_delete_ref_head(fs_info, delayed_refs, head); | ||||
| 		spin_unlock(&head->lock); | ||||
| 		spin_unlock(&delayed_refs->lock); | ||||
| 		mutex_unlock(&head->mutex); | ||||
| 
 | ||||
| 		if (pin_bytes) { | ||||
| 			struct btrfs_block_group *bg; | ||||
| 
 | ||||
| 			bg = btrfs_lookup_block_group(fs_info, head->bytenr); | ||||
| 			if (WARN_ON_ONCE(bg == NULL)) { | ||||
| 				/*
 | ||||
| 				 * Unexpected and there's nothing we can do here | ||||
| 				 * because we are in a transaction abort path, | ||||
| 				 * so any errors can only be ignored or reported | ||||
| 				 * while attempting to cleanup all resources. | ||||
| 				 */ | ||||
| 				btrfs_err(fs_info, | ||||
| "block group for delayed ref at %llu was not found while destroying ref head", | ||||
| 					  head->bytenr); | ||||
| 			} else { | ||||
| 				spin_lock(&bg->space_info->lock); | ||||
| 				spin_lock(&bg->lock); | ||||
| 				bg->pinned += head->num_bytes; | ||||
| 				btrfs_space_info_update_bytes_pinned(fs_info, | ||||
| 								     bg->space_info, | ||||
| 								     head->num_bytes); | ||||
| 				bg->reserved -= head->num_bytes; | ||||
| 				bg->space_info->bytes_reserved -= head->num_bytes; | ||||
| 				spin_unlock(&bg->lock); | ||||
| 				spin_unlock(&bg->space_info->lock); | ||||
| 
 | ||||
| 				btrfs_put_block_group(bg); | ||||
| 			} | ||||
| 
 | ||||
| 			btrfs_error_unpin_extent_range(fs_info, head->bytenr, | ||||
| 				head->bytenr + head->num_bytes - 1); | ||||
| 		} | ||||
| 		btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); | ||||
| 		btrfs_put_delayed_ref_head(head); | ||||
| 		cond_resched(); | ||||
| 		spin_lock(&delayed_refs->lock); | ||||
| 	} | ||||
| 	btrfs_qgroup_destroy_extent_records(trans); | ||||
| 
 | ||||
| 	spin_unlock(&delayed_refs->lock); | ||||
| } | ||||
| 
 | ||||
| void __cold btrfs_delayed_ref_exit(void) | ||||
| { | ||||
| 	kmem_cache_destroy(btrfs_delayed_ref_head_cachep); | ||||
|  |  | |||
|  | @ -61,7 +61,8 @@ struct btrfs_delayed_ref_node { | |||
| 	/*
 | ||||
| 	 * If action is BTRFS_ADD_DELAYED_REF, also link this node to | ||||
| 	 * ref_head->ref_add_list, then we do not need to iterate the | ||||
| 	 * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes. | ||||
| 	 * refs rbtree in the corresponding delayed ref head | ||||
| 	 * (struct btrfs_delayed_ref_head::ref_tree). | ||||
| 	 */ | ||||
| 	struct list_head add_list; | ||||
| 
 | ||||
|  | @ -122,12 +123,6 @@ struct btrfs_delayed_extent_op { | |||
| struct btrfs_delayed_ref_head { | ||||
| 	u64 bytenr; | ||||
| 	u64 num_bytes; | ||||
| 	/*
 | ||||
| 	 * For insertion into struct btrfs_delayed_ref_root::href_root. | ||||
| 	 * Keep it in the same cache line as 'bytenr' for more efficient | ||||
| 	 * searches in the rbtree. | ||||
| 	 */ | ||||
| 	struct rb_node href_node; | ||||
| 	/*
 | ||||
| 	 * the mutex is held while running the refs, and it is also | ||||
| 	 * held when checking the sum of reference modifications. | ||||
|  | @ -191,6 +186,11 @@ struct btrfs_delayed_ref_head { | |||
| 	bool is_data; | ||||
| 	bool is_system; | ||||
| 	bool processing; | ||||
| 	/*
 | ||||
| 	 * Indicate if it's currently in the data structure that tracks head | ||||
| 	 * refs (struct btrfs_delayed_ref_root::head_refs). | ||||
| 	 */ | ||||
| 	bool tracked; | ||||
| }; | ||||
| 
 | ||||
| enum btrfs_delayed_ref_flags { | ||||
|  | @ -199,38 +199,52 @@ enum btrfs_delayed_ref_flags { | |||
| }; | ||||
| 
 | ||||
| struct btrfs_delayed_ref_root { | ||||
| 	/* head ref rbtree */ | ||||
| 	struct rb_root_cached href_root; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Track dirty extent records. | ||||
| 	 * Track head references. | ||||
| 	 * The keys correspond to the logical address of the extent ("bytenr") | ||||
| 	 * right shifted by fs_info->sectorsize_bits. This is both to get a more | ||||
| 	 * dense index space (optimizes xarray structure) and because indexes in | ||||
| 	 * xarrays are of "unsigned long" type, meaning they are 32 bits wide on | ||||
| 	 * 32 bits platforms, limiting the extent range to 4G which is too low | ||||
| 	 * and makes it unusable (truncated index values) on 32 bits platforms. | ||||
| 	 * Protected by the spinlock 'lock' defined below. | ||||
| 	 */ | ||||
| 	struct xarray head_refs; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Track dirty extent records. | ||||
| 	 * The keys correspond to the logical address of the extent ("bytenr") | ||||
| 	 * right shifted by fs_info->sectorsize_bits, for same reasons as above. | ||||
| 	 */ | ||||
| 	struct xarray dirty_extents; | ||||
| 
 | ||||
| 	/* this spin lock protects the rbtree and the entries inside */ | ||||
| 	/*
 | ||||
| 	 * Protects the xarray head_refs, its entries and the following fields: | ||||
| 	 * num_heads, num_heads_ready, pending_csums and run_delayed_start. | ||||
| 	 */ | ||||
| 	spinlock_t lock; | ||||
| 
 | ||||
| 	/* how many delayed ref updates we've queued, used by the
 | ||||
| 	 * throttling code | ||||
| 	 */ | ||||
| 	atomic_t num_entries; | ||||
| 
 | ||||
| 	/* total number of head nodes in tree */ | ||||
| 	/* Total number of head refs, protected by the spinlock 'lock'. */ | ||||
| 	unsigned long num_heads; | ||||
| 
 | ||||
| 	/* total number of head nodes ready for processing */ | ||||
| 	/*
 | ||||
| 	 * Total number of head refs ready for processing, protected by the | ||||
| 	 * spinlock 'lock'. | ||||
| 	 */ | ||||
| 	unsigned long num_heads_ready; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Track space reserved for deleting csums of data extents. | ||||
| 	 * Protected by the spinlock 'lock'. | ||||
| 	 */ | ||||
| 	u64 pending_csums; | ||||
| 
 | ||||
| 	unsigned long flags; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Track from which bytenr to start searching ref heads. | ||||
| 	 * Protected by the spinlock 'lock'. | ||||
| 	 */ | ||||
| 	u64 run_delayed_start; | ||||
| 
 | ||||
| 	/*
 | ||||
|  | @ -372,19 +386,22 @@ void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, | |||
| 			      struct btrfs_delayed_ref_head *head); | ||||
| 
 | ||||
| struct btrfs_delayed_ref_head * | ||||
| btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, | ||||
| btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info, | ||||
| 			    struct btrfs_delayed_ref_root *delayed_refs, | ||||
| 			    u64 bytenr); | ||||
| int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, | ||||
| 			   struct btrfs_delayed_ref_head *head); | ||||
| static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) | ||||
| { | ||||
| 	mutex_unlock(&head->mutex); | ||||
| } | ||||
| void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, | ||||
| void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, | ||||
| 			   struct btrfs_delayed_ref_root *delayed_refs, | ||||
| 			   struct btrfs_delayed_ref_head *head); | ||||
| 
 | ||||
| struct btrfs_delayed_ref_head *btrfs_select_ref_head( | ||||
| 		const struct btrfs_fs_info *fs_info, | ||||
| 		struct btrfs_delayed_ref_root *delayed_refs); | ||||
| void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, | ||||
| 			     struct btrfs_delayed_ref_head *head); | ||||
| 
 | ||||
| int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); | ||||
| 
 | ||||
|  | @ -399,6 +416,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, | |||
| bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); | ||||
| bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, | ||||
| 				 u64 root, u64 parent); | ||||
| void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans); | ||||
| 
 | ||||
| static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) | ||||
| { | ||||
|  |  | |||
|  | @ -45,7 +45,7 @@ | |||
|  * | ||||
|  * - Copy existing extents | ||||
|  * | ||||
|  *   This happens by re-using scrub facility, as scrub also iterates through | ||||
|  *   This happens by reusing scrub facility, as scrub also iterates through | ||||
|  *   existing extents from commit root. | ||||
|  * | ||||
|  *   Location:		scrub_write_block_to_dev_replace() from | ||||
|  | @ -641,6 +641,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, | |||
| 		return ret; | ||||
| 
 | ||||
| 	down_write(&dev_replace->rwsem); | ||||
| 	dev_replace->replace_task = current; | ||||
| 	switch (dev_replace->replace_state) { | ||||
| 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: | ||||
| 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: | ||||
|  | @ -994,6 +995,7 @@ error: | |||
| 	list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); | ||||
| 	fs_devices->rw_devices++; | ||||
| 
 | ||||
| 	dev_replace->replace_task = NULL; | ||||
| 	up_write(&dev_replace->rwsem); | ||||
| 	btrfs_rm_dev_replace_blocked(fs_info); | ||||
| 
 | ||||
|  |  | |||
|  | @ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle | |||
| 						   const char *name, | ||||
| 						   int name_len) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = root->fs_info; | ||||
| 	int ret; | ||||
| 	char *ptr; | ||||
| 	struct extent_buffer *leaf; | ||||
|  | @ -35,7 +34,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle | |||
| 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); | ||||
| 	if (ret == -EEXIST) { | ||||
| 		struct btrfs_dir_item *di; | ||||
| 		di = btrfs_match_dir_item_name(fs_info, path, name, name_len); | ||||
| 		di = btrfs_match_dir_item_name(path, name, name_len); | ||||
| 		if (di) | ||||
| 			return ERR_PTR(-EEXIST); | ||||
| 		btrfs_extend_item(trans, path, data_size); | ||||
|  | @ -190,7 +189,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( | |||
| 	if (ret > 0) | ||||
| 		return ERR_PTR(-ENOENT); | ||||
| 
 | ||||
| 	return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); | ||||
| 	return btrfs_match_dir_item_name(path, name, name_len); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  | @ -341,8 +340,7 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, | |||
| 		if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) | ||||
| 			break; | ||||
| 
 | ||||
| 		di = btrfs_match_dir_item_name(root->fs_info, path, | ||||
| 					       name->name, name->len); | ||||
| 		di = btrfs_match_dir_item_name(path, name->name, name->len); | ||||
| 		if (di) | ||||
| 			return di; | ||||
| 	} | ||||
|  | @ -378,8 +376,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, | |||
|  * this walks through all the entries in a dir item and finds one | ||||
|  * for a specific name. | ||||
|  */ | ||||
| struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, | ||||
| 						 const struct btrfs_path *path, | ||||
| struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path, | ||||
| 						 const char *name, int name_len) | ||||
| { | ||||
| 	struct btrfs_dir_item *dir_item; | ||||
|  |  | |||
|  | @ -44,8 +44,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, | |||
| 					  struct btrfs_path *path, u64 dir, | ||||
| 					  const char *name, u16 name_len, | ||||
| 					  int mod); | ||||
| struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, | ||||
| 						 const struct btrfs_path *path, | ||||
| struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path, | ||||
| 						 const char *name, | ||||
| 						 int name_len); | ||||
| 
 | ||||
|  |  | |||
|  | @ -834,7 +834,7 @@ relock: | |||
| 		return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_write_check(iocb, from, ret); | ||||
| 	ret = btrfs_write_check(iocb, ret); | ||||
| 	if (ret < 0) { | ||||
| 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); | ||||
| 		goto out; | ||||
|  |  | |||
|  | @ -917,8 +917,7 @@ fail: | |||
| 	return ERR_PTR(ret); | ||||
| } | ||||
| 
 | ||||
| static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, | ||||
| 					 struct btrfs_fs_info *fs_info) | ||||
| static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info) | ||||
| { | ||||
| 	struct btrfs_root *root; | ||||
| 
 | ||||
|  | @ -966,7 +965,7 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, | |||
| { | ||||
| 	struct btrfs_root *log_root; | ||||
| 
 | ||||
| 	log_root = alloc_log_tree(trans, fs_info); | ||||
| 	log_root = alloc_log_tree(fs_info); | ||||
| 	if (IS_ERR(log_root)) | ||||
| 		return PTR_ERR(log_root); | ||||
| 
 | ||||
|  | @ -992,7 +991,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, | |||
| 	struct btrfs_inode_item *inode_item; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	log_root = alloc_log_tree(trans, fs_info); | ||||
| 	log_root = alloc_log_tree(fs_info); | ||||
| 	if (IS_ERR(log_root)) | ||||
| 		return PTR_ERR(log_root); | ||||
| 
 | ||||
|  | @ -2786,6 +2785,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) | |||
| 	btrfs_init_scrub(fs_info); | ||||
| 	btrfs_init_balance(fs_info); | ||||
| 	btrfs_init_async_reclaim_work(fs_info); | ||||
| 	btrfs_init_extent_map_shrinker_work(fs_info); | ||||
| 
 | ||||
| 	rwlock_init(&fs_info->block_group_cache_lock); | ||||
| 	fs_info->block_group_cache_tree = RB_ROOT_CACHED; | ||||
|  | @ -2852,8 +2852,6 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block | |||
| 	if (ret) | ||||
| 		return ret; | ||||
| 
 | ||||
| 	spin_lock_init(&fs_info->extent_map_shrinker_lock); | ||||
| 
 | ||||
| 	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); | ||||
| 	if (ret) | ||||
| 		return ret; | ||||
|  | @ -3202,8 +3200,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) | |||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, | ||||
| 		      const char *options) | ||||
| int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices) | ||||
| { | ||||
| 	u32 sectorsize; | ||||
| 	u32 nodesize; | ||||
|  | @ -4186,7 +4183,7 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) | |||
| 		btrfs_warn(fs_info, | ||||
| 	"transaction %llu (with %llu dirty metadata bytes) is not committed", | ||||
| 			   trans->transid, dirty_bytes); | ||||
| 		btrfs_cleanup_one_transaction(trans, fs_info); | ||||
| 		btrfs_cleanup_one_transaction(trans); | ||||
| 
 | ||||
| 		if (trans == fs_info->running_transaction) | ||||
| 			fs_info->running_transaction = NULL; | ||||
|  | @ -4294,6 +4291,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) | |||
| 	cancel_work_sync(&fs_info->async_reclaim_work); | ||||
| 	cancel_work_sync(&fs_info->async_data_reclaim_work); | ||||
| 	cancel_work_sync(&fs_info->preempt_reclaim_work); | ||||
| 	cancel_work_sync(&fs_info->em_shrinker_work); | ||||
| 
 | ||||
| 	/* Cancel or finish ongoing discard work */ | ||||
| 	btrfs_discard_cleanup(fs_info); | ||||
|  | @ -4531,75 +4529,6 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) | |||
| 	btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); | ||||
| } | ||||
| 
 | ||||
| static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, | ||||
| 				       struct btrfs_fs_info *fs_info) | ||||
| { | ||||
| 	struct rb_node *node; | ||||
| 	struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; | ||||
| 	struct btrfs_delayed_ref_node *ref; | ||||
| 
 | ||||
| 	spin_lock(&delayed_refs->lock); | ||||
| 	while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { | ||||
| 		struct btrfs_delayed_ref_head *head; | ||||
| 		struct rb_node *n; | ||||
| 		bool pin_bytes = false; | ||||
| 
 | ||||
| 		head = rb_entry(node, struct btrfs_delayed_ref_head, | ||||
| 				href_node); | ||||
| 		if (btrfs_delayed_ref_lock(delayed_refs, head)) | ||||
| 			continue; | ||||
| 
 | ||||
| 		spin_lock(&head->lock); | ||||
| 		while ((n = rb_first_cached(&head->ref_tree)) != NULL) { | ||||
| 			ref = rb_entry(n, struct btrfs_delayed_ref_node, | ||||
| 				       ref_node); | ||||
| 			rb_erase_cached(&ref->ref_node, &head->ref_tree); | ||||
| 			RB_CLEAR_NODE(&ref->ref_node); | ||||
| 			if (!list_empty(&ref->add_list)) | ||||
| 				list_del(&ref->add_list); | ||||
| 			atomic_dec(&delayed_refs->num_entries); | ||||
| 			btrfs_put_delayed_ref(ref); | ||||
| 			btrfs_delayed_refs_rsv_release(fs_info, 1, 0); | ||||
| 		} | ||||
| 		if (head->must_insert_reserved) | ||||
| 			pin_bytes = true; | ||||
| 		btrfs_free_delayed_extent_op(head->extent_op); | ||||
| 		btrfs_delete_ref_head(delayed_refs, head); | ||||
| 		spin_unlock(&head->lock); | ||||
| 		spin_unlock(&delayed_refs->lock); | ||||
| 		mutex_unlock(&head->mutex); | ||||
| 
 | ||||
| 		if (pin_bytes) { | ||||
| 			struct btrfs_block_group *cache; | ||||
| 
 | ||||
| 			cache = btrfs_lookup_block_group(fs_info, head->bytenr); | ||||
| 			BUG_ON(!cache); | ||||
| 
 | ||||
| 			spin_lock(&cache->space_info->lock); | ||||
| 			spin_lock(&cache->lock); | ||||
| 			cache->pinned += head->num_bytes; | ||||
| 			btrfs_space_info_update_bytes_pinned(fs_info, | ||||
| 				cache->space_info, head->num_bytes); | ||||
| 			cache->reserved -= head->num_bytes; | ||||
| 			cache->space_info->bytes_reserved -= head->num_bytes; | ||||
| 			spin_unlock(&cache->lock); | ||||
| 			spin_unlock(&cache->space_info->lock); | ||||
| 
 | ||||
| 			btrfs_put_block_group(cache); | ||||
| 
 | ||||
| 			btrfs_error_unpin_extent_range(fs_info, head->bytenr, | ||||
| 				head->bytenr + head->num_bytes - 1); | ||||
| 		} | ||||
| 		btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); | ||||
| 		btrfs_put_delayed_ref_head(head); | ||||
| 		cond_resched(); | ||||
| 		spin_lock(&delayed_refs->lock); | ||||
| 	} | ||||
| 	btrfs_qgroup_destroy_extent_records(trans); | ||||
| 
 | ||||
| 	spin_unlock(&delayed_refs->lock); | ||||
| } | ||||
| 
 | ||||
| static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) | ||||
| { | ||||
| 	struct btrfs_inode *btrfs_inode; | ||||
|  | @ -4805,9 +4734,9 @@ static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info) | |||
| 	spin_unlock(&fs_info->fs_roots_radix_lock); | ||||
| } | ||||
| 
 | ||||
| void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, | ||||
| 				   struct btrfs_fs_info *fs_info) | ||||
| void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = cur_trans->fs_info; | ||||
| 	struct btrfs_device *dev, *tmp; | ||||
| 
 | ||||
| 	btrfs_cleanup_dirty_bgs(cur_trans, fs_info); | ||||
|  | @ -4819,7 +4748,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, | |||
| 		list_del_init(&dev->post_commit_list); | ||||
| 	} | ||||
| 
 | ||||
| 	btrfs_destroy_delayed_refs(cur_trans, fs_info); | ||||
| 	btrfs_destroy_delayed_refs(cur_trans); | ||||
| 
 | ||||
| 	cur_trans->state = TRANS_STATE_COMMIT_START; | ||||
| 	wake_up(&fs_info->transaction_blocked_wait); | ||||
|  | @ -4865,7 +4794,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) | |||
| 		} else { | ||||
| 			spin_unlock(&fs_info->trans_lock); | ||||
| 		} | ||||
| 		btrfs_cleanup_one_transaction(t, fs_info); | ||||
| 		btrfs_cleanup_one_transaction(t); | ||||
| 
 | ||||
| 		spin_lock(&fs_info->trans_lock); | ||||
| 		if (t == fs_info->running_transaction) | ||||
|  |  | |||
|  | @ -52,8 +52,7 @@ struct extent_buffer *btrfs_find_create_tree_block( | |||
| int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); | ||||
| int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, | ||||
| 			   const struct btrfs_super_block *disk_sb); | ||||
| int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, | ||||
| 		      const char *options); | ||||
| int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices); | ||||
| void __cold close_ctree(struct btrfs_fs_info *fs_info); | ||||
| int btrfs_validate_super(const struct btrfs_fs_info *fs_info, | ||||
| 			 const struct btrfs_super_block *sb, int mirror_num); | ||||
|  | @ -127,8 +126,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, | |||
| 		       struct btrfs_root *root); | ||||
| void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans, | ||||
| 			     struct btrfs_fs_info *fs_info); | ||||
| void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, | ||||
| 				  struct btrfs_fs_info *fs_info); | ||||
| void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans); | ||||
| struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, | ||||
| 				     u64 objectid); | ||||
| int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); | ||||
|  |  | |||
|  | @ -182,7 +182,7 @@ search_again: | |||
| 
 | ||||
| 	delayed_refs = &trans->transaction->delayed_refs; | ||||
| 	spin_lock(&delayed_refs->lock); | ||||
| 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); | ||||
| 	head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); | ||||
| 	if (head) { | ||||
| 		if (!mutex_trylock(&head->mutex)) { | ||||
| 			refcount_inc(&head->refs); | ||||
|  | @ -795,7 +795,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, | |||
| 	if (insert) { | ||||
| 		extra_size = btrfs_extent_inline_ref_size(want); | ||||
| 		path->search_for_extension = 1; | ||||
| 		path->keep_locks = 1; | ||||
| 	} else | ||||
| 		extra_size = -1; | ||||
| 
 | ||||
|  | @ -946,6 +945,25 @@ again: | |||
| 			ret = -EAGAIN; | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		if (path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) { | ||||
| 			struct btrfs_key tmp_key; | ||||
| 
 | ||||
| 			btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1); | ||||
| 			if (tmp_key.objectid == bytenr && | ||||
| 			    tmp_key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { | ||||
| 				ret = -EAGAIN; | ||||
| 				goto out; | ||||
| 			} | ||||
| 			goto out_no_entry; | ||||
| 		} | ||||
| 
 | ||||
| 		if (!path->keep_locks) { | ||||
| 			btrfs_release_path(path); | ||||
| 			path->keep_locks = 1; | ||||
| 			goto again; | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * To add new inline back ref, we have to make sure | ||||
| 		 * there is no corresponding back ref item. | ||||
|  | @ -959,13 +977,15 @@ again: | |||
| 			goto out; | ||||
| 		} | ||||
| 	} | ||||
| out_no_entry: | ||||
| 	*ref_ret = (struct btrfs_extent_inline_ref *)ptr; | ||||
| out: | ||||
| 	if (insert) { | ||||
| 	if (path->keep_locks) { | ||||
| 		path->keep_locks = 0; | ||||
| 		path->search_for_extension = 0; | ||||
| 		btrfs_unlock_up_safe(path, 1); | ||||
| 	} | ||||
| 	if (insert) | ||||
| 		path->search_for_extension = 0; | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
|  | @ -1807,16 +1827,6 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) | |||
| 	return ref; | ||||
| } | ||||
| 
 | ||||
| static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, | ||||
| 				      struct btrfs_delayed_ref_head *head) | ||||
| { | ||||
| 	spin_lock(&delayed_refs->lock); | ||||
| 	head->processing = false; | ||||
| 	delayed_refs->num_heads_ready++; | ||||
| 	spin_unlock(&delayed_refs->lock); | ||||
| 	btrfs_delayed_ref_unlock(head); | ||||
| } | ||||
| 
 | ||||
| static struct btrfs_delayed_extent_op *cleanup_extent_op( | ||||
| 				struct btrfs_delayed_ref_head *head) | ||||
| { | ||||
|  | @ -1891,7 +1901,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, | |||
| 
 | ||||
| 	ret = run_and_cleanup_extent_op(trans, head); | ||||
| 	if (ret < 0) { | ||||
| 		unselect_delayed_ref_head(delayed_refs, head); | ||||
| 		btrfs_unselect_ref_head(delayed_refs, head); | ||||
| 		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); | ||||
| 		return ret; | ||||
| 	} else if (ret) { | ||||
|  | @ -1910,7 +1920,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, | |||
| 		spin_unlock(&delayed_refs->lock); | ||||
| 		return 1; | ||||
| 	} | ||||
| 	btrfs_delete_ref_head(delayed_refs, head); | ||||
| 	btrfs_delete_ref_head(fs_info, delayed_refs, head); | ||||
| 	spin_unlock(&head->lock); | ||||
| 	spin_unlock(&delayed_refs->lock); | ||||
| 
 | ||||
|  | @ -1933,39 +1943,6 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, | |||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( | ||||
| 					struct btrfs_trans_handle *trans) | ||||
| { | ||||
| 	struct btrfs_delayed_ref_root *delayed_refs = | ||||
| 		&trans->transaction->delayed_refs; | ||||
| 	struct btrfs_delayed_ref_head *head = NULL; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	spin_lock(&delayed_refs->lock); | ||||
| 	head = btrfs_select_ref_head(delayed_refs); | ||||
| 	if (!head) { | ||||
| 		spin_unlock(&delayed_refs->lock); | ||||
| 		return head; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Grab the lock that says we are going to process all the refs for | ||||
| 	 * this head | ||||
| 	 */ | ||||
| 	ret = btrfs_delayed_ref_lock(delayed_refs, head); | ||||
| 	spin_unlock(&delayed_refs->lock); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * We may have dropped the spin lock to get the head mutex lock, and | ||||
| 	 * that might have given someone else time to free the head.  If that's | ||||
| 	 * true, it has been removed from our list and we can move on. | ||||
| 	 */ | ||||
| 	if (ret == -EAGAIN) | ||||
| 		head = ERR_PTR(-EAGAIN); | ||||
| 
 | ||||
| 	return head; | ||||
| } | ||||
| 
 | ||||
| static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, | ||||
| 					   struct btrfs_delayed_ref_head *locked_ref, | ||||
| 					   u64 *bytes_released) | ||||
|  | @ -1986,7 +1963,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, | |||
| 		if (ref->seq && | ||||
| 		    btrfs_check_delayed_seq(fs_info, ref->seq)) { | ||||
| 			spin_unlock(&locked_ref->lock); | ||||
| 			unselect_delayed_ref_head(delayed_refs, locked_ref); | ||||
| 			btrfs_unselect_ref_head(delayed_refs, locked_ref); | ||||
| 			return -EAGAIN; | ||||
| 		} | ||||
| 
 | ||||
|  | @ -2009,7 +1986,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, | |||
| 		default: | ||||
| 			WARN_ON(1); | ||||
| 		} | ||||
| 		atomic_dec(&delayed_refs->num_entries); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Record the must_insert_reserved flag before we drop the | ||||
|  | @ -2035,7 +2011,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, | |||
| 
 | ||||
| 		btrfs_free_delayed_extent_op(extent_op); | ||||
| 		if (ret) { | ||||
| 			unselect_delayed_ref_head(delayed_refs, locked_ref); | ||||
| 			btrfs_unselect_ref_head(delayed_refs, locked_ref); | ||||
| 			btrfs_put_delayed_ref(ref); | ||||
| 			return ret; | ||||
| 		} | ||||
|  | @ -2073,7 +2049,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, | |||
| 
 | ||||
| 	do { | ||||
| 		if (!locked_ref) { | ||||
| 			locked_ref = btrfs_obtain_ref_head(trans); | ||||
| 			locked_ref = btrfs_select_ref_head(fs_info, delayed_refs); | ||||
| 			if (IS_ERR_OR_NULL(locked_ref)) { | ||||
| 				if (PTR_ERR(locked_ref) == -EAGAIN) { | ||||
| 					continue; | ||||
|  | @ -2220,7 +2196,7 @@ again: | |||
| 		btrfs_create_pending_block_groups(trans); | ||||
| 
 | ||||
| 		spin_lock(&delayed_refs->lock); | ||||
| 		if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) { | ||||
| 		if (xa_empty(&delayed_refs->head_refs)) { | ||||
| 			spin_unlock(&delayed_refs->lock); | ||||
| 			return 0; | ||||
| 		} | ||||
|  | @ -2275,7 +2251,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, | |||
| 
 | ||||
| 	delayed_refs = &cur_trans->delayed_refs; | ||||
| 	spin_lock(&delayed_refs->lock); | ||||
| 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); | ||||
| 	head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); | ||||
| 	if (!head) { | ||||
| 		spin_unlock(&delayed_refs->lock); | ||||
| 		btrfs_put_transaction(cur_trans); | ||||
|  | @ -3144,7 +3120,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, | |||
| 				break; | ||||
| 			} | ||||
| 
 | ||||
| 			/* Quick path didn't find the EXTEMT/METADATA_ITEM */ | ||||
| 			/* Quick path didn't find the EXTENT/METADATA_ITEM */ | ||||
| 			if (path->slots[0] - extent_slot > 5) | ||||
| 				break; | ||||
| 			extent_slot--; | ||||
|  | @ -3377,13 +3353,14 @@ out: | |||
| static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, | ||||
| 				      u64 bytenr) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = trans->fs_info; | ||||
| 	struct btrfs_delayed_ref_head *head; | ||||
| 	struct btrfs_delayed_ref_root *delayed_refs; | ||||
| 	int ret = 0; | ||||
| 
 | ||||
| 	delayed_refs = &trans->transaction->delayed_refs; | ||||
| 	spin_lock(&delayed_refs->lock); | ||||
| 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); | ||||
| 	head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); | ||||
| 	if (!head) | ||||
| 		goto out_delayed_unlock; | ||||
| 
 | ||||
|  | @ -3401,7 +3378,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, | |||
| 	if (!mutex_trylock(&head->mutex)) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	btrfs_delete_ref_head(delayed_refs, head); | ||||
| 	btrfs_delete_ref_head(fs_info, delayed_refs, head); | ||||
| 	head->processing = false; | ||||
| 
 | ||||
| 	spin_unlock(&head->lock); | ||||
|  | @ -3411,7 +3388,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, | |||
| 	if (head->must_insert_reserved) | ||||
| 		ret = 1; | ||||
| 
 | ||||
| 	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); | ||||
| 	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); | ||||
| 	mutex_unlock(&head->mutex); | ||||
| 	btrfs_put_delayed_ref_head(head); | ||||
| 	return ret; | ||||
|  | @ -5270,7 +5247,7 @@ struct walk_control { | |||
|  * corrupted file systems must have been caught before calling this function. | ||||
|  */ | ||||
| static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc, | ||||
| 				  struct extent_buffer *eb, u64 refs, u64 flags, int slot) | ||||
| 				  struct extent_buffer *eb, u64 flags, int slot) | ||||
| { | ||||
| 	struct btrfs_key key; | ||||
| 	u64 generation; | ||||
|  | @ -5384,7 +5361,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, | |||
| 			continue; | ||||
| 
 | ||||
| 		/* If we don't need to visit this node don't reada. */ | ||||
| 		if (!visit_node_for_delete(root, wc, eb, refs, flags, slot)) | ||||
| 		if (!visit_node_for_delete(root, wc, eb, flags, slot)) | ||||
| 			continue; | ||||
| reada: | ||||
| 		btrfs_readahead_node_child(eb, slot); | ||||
|  | @ -5518,7 +5495,7 @@ again: | |||
| 	 */ | ||||
| 	delayed_refs = &trans->transaction->delayed_refs; | ||||
| 	spin_lock(&delayed_refs->lock); | ||||
| 	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); | ||||
| 	head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); | ||||
| 	if (!head) | ||||
| 		goto out; | ||||
| 	if (!mutex_trylock(&head->mutex)) { | ||||
|  | @ -5737,8 +5714,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, | |||
| 
 | ||||
| 	/* If we don't have to walk into this node skip it. */ | ||||
| 	if (!visit_node_for_delete(root, wc, path->nodes[level], | ||||
| 				   wc->refs[level - 1], wc->flags[level - 1], | ||||
| 				   path->slots[level])) | ||||
| 				   wc->flags[level - 1], path->slots[level])) | ||||
| 		goto skip; | ||||
| 
 | ||||
| 	/*
 | ||||
|  |  | |||
|  | @ -190,7 +190,7 @@ static void process_one_folio(struct btrfs_fs_info *fs_info, | |||
| 		btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); | ||||
| 
 | ||||
| 	if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) | ||||
| 		btrfs_folio_end_writer_lock(fs_info, folio, start, len); | ||||
| 		btrfs_folio_end_lock(fs_info, folio, start, len); | ||||
| } | ||||
| 
 | ||||
| static void __process_folios_contig(struct address_space *mapping, | ||||
|  | @ -276,7 +276,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, | |||
| 			range_start = max_t(u64, folio_pos(folio), start); | ||||
| 			range_len = min_t(u64, folio_pos(folio) + folio_size(folio), | ||||
| 					  end + 1) - range_start; | ||||
| 			btrfs_folio_set_writer_lock(fs_info, folio, range_start, range_len); | ||||
| 			btrfs_folio_set_lock(fs_info, folio, range_start, range_len); | ||||
| 
 | ||||
| 			processed_end = range_start + range_len - 1; | ||||
| 		} | ||||
|  | @ -438,7 +438,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le | |||
| 	if (!btrfs_is_subpage(fs_info, folio->mapping)) | ||||
| 		folio_unlock(folio); | ||||
| 	else | ||||
| 		btrfs_subpage_end_reader(fs_info, folio, start, len); | ||||
| 		btrfs_folio_end_lock(fs_info, folio, start, len); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  | @ -495,7 +495,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) | |||
| 		return; | ||||
| 
 | ||||
| 	ASSERT(folio_test_private(folio)); | ||||
| 	btrfs_subpage_start_reader(fs_info, folio, folio_pos(folio), PAGE_SIZE); | ||||
| 	btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  | @ -1102,6 +1102,45 @@ int btrfs_read_folio(struct file *file, struct folio *folio) | |||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap, | ||||
| 				u64 start, u32 len) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); | ||||
| 	const u64 folio_start = folio_pos(folio); | ||||
| 	unsigned int start_bit; | ||||
| 	unsigned int nbits; | ||||
| 
 | ||||
| 	ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE); | ||||
| 	start_bit = (start - folio_start) >> fs_info->sectorsize_bits; | ||||
| 	nbits = len >> fs_info->sectorsize_bits; | ||||
| 	ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); | ||||
| 	bitmap_set(delalloc_bitmap, start_bit, nbits); | ||||
| } | ||||
| 
 | ||||
| static bool find_next_delalloc_bitmap(struct folio *folio, | ||||
| 				      unsigned long *delalloc_bitmap, u64 start, | ||||
| 				      u64 *found_start, u32 *found_len) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); | ||||
| 	const u64 folio_start = folio_pos(folio); | ||||
| 	const unsigned int bitmap_size = fs_info->sectors_per_page; | ||||
| 	unsigned int start_bit; | ||||
| 	unsigned int first_zero; | ||||
| 	unsigned int first_set; | ||||
| 
 | ||||
| 	ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE); | ||||
| 
 | ||||
| 	start_bit = (start - folio_start) >> fs_info->sectorsize_bits; | ||||
| 	first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); | ||||
| 	if (first_set >= bitmap_size) | ||||
| 		return false; | ||||
| 
 | ||||
| 	*found_start = folio_start + (first_set << fs_info->sectorsize_bits); | ||||
| 	first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set); | ||||
| 	*found_len = (first_zero - first_set) << fs_info->sectorsize_bits; | ||||
| 	return true; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * helper for extent_writepage(), doing all of the delayed allocation setup. | ||||
|  * | ||||
|  | @ -1121,6 +1160,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, | |||
| 	const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); | ||||
| 	const u64 page_start = folio_pos(folio); | ||||
| 	const u64 page_end = page_start + folio_size(folio) - 1; | ||||
| 	unsigned long delalloc_bitmap = 0; | ||||
| 	/*
 | ||||
| 	 * Save the last found delalloc end. As the delalloc end can go beyond | ||||
| 	 * page boundary, thus we cannot rely on subpage bitmap to locate the | ||||
|  | @ -1131,6 +1171,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, | |||
| 	u64 delalloc_end = page_end; | ||||
| 	u64 delalloc_to_write = 0; | ||||
| 	int ret = 0; | ||||
| 	int bit; | ||||
| 
 | ||||
| 	/* Save the dirty bitmap as our submission bitmap will be a subset of it. */ | ||||
| 	if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { | ||||
|  | @ -1140,6 +1181,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, | |||
| 		bio_ctrl->submit_bitmap = 1; | ||||
| 	} | ||||
| 
 | ||||
| 	for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { | ||||
| 		u64 start = page_start + (bit << fs_info->sectorsize_bits); | ||||
| 
 | ||||
| 		btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); | ||||
| 	} | ||||
| 
 | ||||
| 	/* Lock all (subpage) delalloc ranges inside the folio first. */ | ||||
| 	while (delalloc_start < page_end) { | ||||
| 		delalloc_end = page_end; | ||||
|  | @ -1148,9 +1195,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, | |||
| 			delalloc_start = delalloc_end + 1; | ||||
| 			continue; | ||||
| 		} | ||||
| 		btrfs_folio_set_writer_lock(fs_info, folio, delalloc_start, | ||||
| 					    min(delalloc_end, page_end) + 1 - | ||||
| 					    delalloc_start); | ||||
| 		set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start, | ||||
| 				    min(delalloc_end, page_end) + 1 - delalloc_start); | ||||
| 		last_delalloc_end = delalloc_end; | ||||
| 		delalloc_start = delalloc_end + 1; | ||||
| 	} | ||||
|  | @ -1175,7 +1221,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, | |||
| 			found_len = last_delalloc_end + 1 - found_start; | ||||
| 			found = true; | ||||
| 		} else { | ||||
| 			found = btrfs_subpage_find_writer_locked(fs_info, folio, | ||||
| 			found = find_next_delalloc_bitmap(folio, &delalloc_bitmap, | ||||
| 					delalloc_start, &found_start, &found_len); | ||||
| 		} | ||||
| 		if (!found) | ||||
|  | @ -1314,7 +1360,7 @@ static int submit_one_sector(struct btrfs_inode *inode, | |||
| 	 * a folio for a range already written to disk. | ||||
| 	 */ | ||||
| 	btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); | ||||
| 	btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1); | ||||
| 	btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); | ||||
| 	/*
 | ||||
| 	 * Above call should set the whole folio with writeback flag, even | ||||
| 	 * just for a single subpage sector. | ||||
|  | @ -1391,8 +1437,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, | |||
| 			goto out; | ||||
| 		submitted_io = true; | ||||
| 	} | ||||
| 
 | ||||
| 	btrfs_folio_assert_not_dirty(fs_info, folio, start, len); | ||||
| out: | ||||
| 	/*
 | ||||
| 	 * If we didn't submit any sector (>= i_size), folio dirty get | ||||
|  | @ -1476,7 +1520,7 @@ done: | |||
| 	 * Only unlock ranges that are submitted. As there can be some async | ||||
| 	 * submitted ranges inside the folio. | ||||
| 	 */ | ||||
| 	btrfs_folio_end_writer_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); | ||||
| 	btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); | ||||
| 	ASSERT(ret <= 0); | ||||
| 	return ret; | ||||
| } | ||||
|  | @ -2115,7 +2159,27 @@ retry: | |||
| 				continue; | ||||
| 			} | ||||
| 
 | ||||
| 			if (wbc->sync_mode != WB_SYNC_NONE) { | ||||
| 			/*
 | ||||
| 			 * For subpage case, compression can lead to mixed | ||||
| 			 * writeback and dirty flags, e.g: | ||||
| 			 * 0     32K    64K    96K    128K | ||||
| 			 * |     |//////||/////|   |//|
 | ||||
| 			 * | ||||
| 			 * In above case, [32K, 96K) is asynchronously submitted | ||||
| 			 * for compression, and [124K, 128K) needs to be written back. | ||||
| 			 * | ||||
| 			 * If we didn't wait for writeback on page 64K, [124K, 128K) | ||||
| 			 * won't be submitted as the page still has writeback flag | ||||
| 			 * and will be skipped in the next check. | ||||
| 			 * | ||||
| 			 * This mixed writeback and dirty case is only possible for | ||||
| 			 * subpage case. | ||||
| 			 * | ||||
| 			 * TODO: Remove this check after migrating compression to | ||||
| 			 * regular submission. | ||||
| 			 */ | ||||
| 			if (wbc->sync_mode != WB_SYNC_NONE || | ||||
| 			    btrfs_is_subpage(inode_to_fs_info(inode), mapping)) { | ||||
| 				if (folio_test_writeback(folio)) | ||||
| 					submit_write_bio(bio_ctrl, 0); | ||||
| 				folio_wait_writeback(folio); | ||||
|  | @ -2200,7 +2264,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f | |||
| 		u32 cur_len = cur_end + 1 - cur; | ||||
| 		struct folio *folio; | ||||
| 
 | ||||
| 		folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0); | ||||
| 		folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * This shouldn't happen, the pages are pinned and locked, this | ||||
|  | @ -2233,7 +2297,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f | |||
| 						       cur, cur_len, !ret); | ||||
| 			mapping_set_error(mapping, ret); | ||||
| 		} | ||||
| 		btrfs_folio_end_writer_lock(fs_info, folio, cur, cur_len); | ||||
| 		btrfs_folio_end_lock(fs_info, folio, cur, cur_len); | ||||
| 		if (ret < 0) | ||||
| 			found_error = true; | ||||
| next_page: | ||||
|  | @ -2317,7 +2381,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, | |||
|  * to drop the page. | ||||
|  */ | ||||
| static bool try_release_extent_state(struct extent_io_tree *tree, | ||||
| 				    struct folio *folio, gfp_t mask) | ||||
| 				     struct folio *folio) | ||||
| { | ||||
| 	u64 start = folio_pos(folio); | ||||
| 	u64 end = start + PAGE_SIZE - 1; | ||||
|  | @ -2428,7 +2492,7 @@ next: | |||
| 			cond_resched(); | ||||
| 		} | ||||
| 	} | ||||
| 	return try_release_extent_state(io_tree, folio, mask); | ||||
| 	return try_release_extent_state(io_tree, folio); | ||||
| } | ||||
| 
 | ||||
| static void __free_extent_buffer(struct extent_buffer *eb) | ||||
|  | @ -2442,7 +2506,7 @@ static int extent_buffer_under_io(const struct extent_buffer *eb) | |||
| 		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); | ||||
| } | ||||
| 
 | ||||
| static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio) | ||||
| static bool folio_range_has_eb(struct folio *folio) | ||||
| { | ||||
| 	struct btrfs_subpage *subpage; | ||||
| 
 | ||||
|  | @ -2452,12 +2516,6 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli | |||
| 		subpage = folio_get_private(folio); | ||||
| 		if (atomic_read(&subpage->eb_refs)) | ||||
| 			return true; | ||||
| 		/*
 | ||||
| 		 * Even there is no eb refs here, we may still have | ||||
| 		 * end_folio_read() call relying on page::private. | ||||
| 		 */ | ||||
| 		if (atomic_read(&subpage->readers)) | ||||
| 			return true; | ||||
| 	} | ||||
| 	return false; | ||||
| } | ||||
|  | @ -2516,7 +2574,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo | |||
| 	 * We can only detach the folio private if there are no other ebs in the | ||||
| 	 * page range and no unfinished IO. | ||||
| 	 */ | ||||
| 	if (!folio_range_has_eb(fs_info, folio)) | ||||
| 	if (!folio_range_has_eb(folio)) | ||||
| 		btrfs_detach_subpage(fs_info, folio); | ||||
| 
 | ||||
| 	spin_unlock(&folio->mapping->i_private_lock); | ||||
|  | @ -3121,7 +3179,7 @@ out: | |||
| 	} | ||||
| 	/*
 | ||||
| 	 * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, | ||||
| 	 * so it can be cleaned up without utlizing page->mapping. | ||||
| 	 * so it can be cleaned up without utilizing page->mapping. | ||||
| 	 */ | ||||
| 	set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); | ||||
| 
 | ||||
|  | @ -4221,7 +4279,6 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, | |||
| 				u64 bytenr, u64 owner_root, u64 gen, int level) | ||||
| { | ||||
| 	struct btrfs_tree_parent_check check = { | ||||
| 		.has_first_key = 0, | ||||
| 		.level = level, | ||||
| 		.transid = gen | ||||
| 	}; | ||||
|  |  | |||
|  | @ -77,10 +77,13 @@ static u64 range_end(u64 start, u64 len) | |||
| 	return start + len; | ||||
| } | ||||
| 
 | ||||
| static void dec_evictable_extent_maps(struct btrfs_inode *inode) | ||||
| static void remove_em(struct btrfs_inode *inode, struct extent_map *em) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||||
| 
 | ||||
| 	rb_erase(&em->rb_node, &inode->extent_tree.root); | ||||
| 	RB_CLEAR_NODE(&em->rb_node); | ||||
| 
 | ||||
| 	if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) | ||||
| 		percpu_counter_dec(&fs_info->evictable_extent_maps); | ||||
| } | ||||
|  | @ -339,7 +342,6 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map | |||
| static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||||
| 	struct extent_map_tree *tree = &inode->extent_tree; | ||||
| 	struct extent_map *merge = NULL; | ||||
| 	struct rb_node *rb; | ||||
| 
 | ||||
|  | @ -371,10 +373,8 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) | |||
| 			em->flags |= EXTENT_FLAG_MERGED; | ||||
| 
 | ||||
| 			validate_extent_map(fs_info, em); | ||||
| 			rb_erase(&merge->rb_node, &tree->root); | ||||
| 			RB_CLEAR_NODE(&merge->rb_node); | ||||
| 			remove_em(inode, merge); | ||||
| 			free_extent_map(merge); | ||||
| 			dec_evictable_extent_maps(inode); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
|  | @ -386,12 +386,10 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) | |||
| 		if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) | ||||
| 			merge_ondisk_extents(em, merge, em); | ||||
| 		validate_extent_map(fs_info, em); | ||||
| 		rb_erase(&merge->rb_node, &tree->root); | ||||
| 		RB_CLEAR_NODE(&merge->rb_node); | ||||
| 		em->generation = max(em->generation, merge->generation); | ||||
| 		em->flags |= EXTENT_FLAG_MERGED; | ||||
| 		remove_em(inode, merge); | ||||
| 		free_extent_map(merge); | ||||
| 		dec_evictable_extent_maps(inode); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
|  | @ -588,12 +586,10 @@ void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) | |||
| 	lockdep_assert_held_write(&tree->lock); | ||||
| 
 | ||||
| 	WARN_ON(em->flags & EXTENT_FLAG_PINNED); | ||||
| 	rb_erase(&em->rb_node, &tree->root); | ||||
| 	if (!(em->flags & EXTENT_FLAG_LOGGING)) | ||||
| 		list_del_init(&em->list); | ||||
| 	RB_CLEAR_NODE(&em->rb_node); | ||||
| 
 | ||||
| 	dec_evictable_extent_maps(inode); | ||||
| 	remove_em(inode, em); | ||||
| } | ||||
| 
 | ||||
| static void replace_extent_mapping(struct btrfs_inode *inode, | ||||
|  | @ -1122,13 +1118,12 @@ out_free_pre: | |||
| struct btrfs_em_shrink_ctx { | ||||
| 	long nr_to_scan; | ||||
| 	long scanned; | ||||
| 	u64 last_ino; | ||||
| 	u64 last_root; | ||||
| }; | ||||
| 
 | ||||
| static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) | ||||
| { | ||||
| 	const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info); | ||||
| 	struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||||
| 	const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info); | ||||
| 	struct extent_map_tree *tree = &inode->extent_tree; | ||||
| 	long nr_dropped = 0; | ||||
| 	struct rb_node *node; | ||||
|  | @ -1201,7 +1196,8 @@ next: | |||
| 		 * lock. This is to avoid slowing other tasks trying to take the | ||||
| 		 * lock. | ||||
| 		 */ | ||||
| 		if (need_resched() || rwlock_needbreak(&tree->lock)) | ||||
| 		if (need_resched() || rwlock_needbreak(&tree->lock) || | ||||
| 		    btrfs_fs_closing(fs_info)) | ||||
| 			break; | ||||
| 		node = next; | ||||
| 	} | ||||
|  | @ -1213,19 +1209,21 @@ next: | |||
| 
 | ||||
| static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = root->fs_info; | ||||
| 	struct btrfs_inode *inode; | ||||
| 	long nr_dropped = 0; | ||||
| 	u64 min_ino = ctx->last_ino + 1; | ||||
| 	u64 min_ino = fs_info->em_shrinker_last_ino + 1; | ||||
| 
 | ||||
| 	inode = btrfs_find_first_inode(root, min_ino); | ||||
| 	while (inode) { | ||||
| 		nr_dropped += btrfs_scan_inode(inode, ctx); | ||||
| 
 | ||||
| 		min_ino = btrfs_ino(inode) + 1; | ||||
| 		ctx->last_ino = btrfs_ino(inode); | ||||
| 		fs_info->em_shrinker_last_ino = btrfs_ino(inode); | ||||
| 		btrfs_add_delayed_iput(inode); | ||||
| 
 | ||||
| 		if (ctx->scanned >= ctx->nr_to_scan) | ||||
| 		if (ctx->scanned >= ctx->nr_to_scan || | ||||
| 		    btrfs_fs_closing(inode->root->fs_info)) | ||||
| 			break; | ||||
| 
 | ||||
| 		cond_resched(); | ||||
|  | @ -1241,52 +1239,43 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx | |||
| 		 * inode if there is one or we will find out this was the last | ||||
| 		 * one and move to the next root. | ||||
| 		 */ | ||||
| 		ctx->last_root = btrfs_root_id(root); | ||||
| 		fs_info->em_shrinker_last_root = btrfs_root_id(root); | ||||
| 	} else { | ||||
| 		/*
 | ||||
| 		 * No more inodes in this root, set em_shrinker_last_ino to 0 so | ||||
| 		 * that when processing the next root we start from its first inode. | ||||
| 		 */ | ||||
| 		ctx->last_ino = 0; | ||||
| 		ctx->last_root = btrfs_root_id(root) + 1; | ||||
| 		fs_info->em_shrinker_last_ino = 0; | ||||
| 		fs_info->em_shrinker_last_root = btrfs_root_id(root) + 1; | ||||
| 	} | ||||
| 
 | ||||
| 	return nr_dropped; | ||||
| } | ||||
| 
 | ||||
| long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) | ||||
| static void btrfs_extent_map_shrinker_worker(struct work_struct *work) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info; | ||||
| 	struct btrfs_em_shrink_ctx ctx; | ||||
| 	u64 start_root_id; | ||||
| 	u64 next_root_id; | ||||
| 	bool cycled = false; | ||||
| 	long nr_dropped = 0; | ||||
| 
 | ||||
| 	fs_info = container_of(work, struct btrfs_fs_info, em_shrinker_work); | ||||
| 
 | ||||
| 	ctx.scanned = 0; | ||||
| 	ctx.nr_to_scan = nr_to_scan; | ||||
| 	ctx.nr_to_scan = atomic64_read(&fs_info->em_shrinker_nr_to_scan); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * In case we have multiple tasks running this shrinker, make the next | ||||
| 	 * one start from the next inode in case it starts before we finish. | ||||
| 	 */ | ||||
| 	spin_lock(&fs_info->extent_map_shrinker_lock); | ||||
| 	ctx.last_ino = fs_info->extent_map_shrinker_last_ino; | ||||
| 	fs_info->extent_map_shrinker_last_ino++; | ||||
| 	ctx.last_root = fs_info->extent_map_shrinker_last_root; | ||||
| 	spin_unlock(&fs_info->extent_map_shrinker_lock); | ||||
| 
 | ||||
| 	start_root_id = ctx.last_root; | ||||
| 	next_root_id = ctx.last_root; | ||||
| 	start_root_id = fs_info->em_shrinker_last_root; | ||||
| 	next_root_id = fs_info->em_shrinker_last_root; | ||||
| 
 | ||||
| 	if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { | ||||
| 		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); | ||||
| 
 | ||||
| 		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, | ||||
| 							   nr, ctx.last_root, | ||||
| 							   ctx.last_ino); | ||||
| 		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr); | ||||
| 	} | ||||
| 
 | ||||
| 	while (ctx.scanned < ctx.nr_to_scan) { | ||||
| 	while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) { | ||||
| 		struct btrfs_root *root; | ||||
| 		unsigned long count; | ||||
| 
 | ||||
|  | @ -1300,8 +1289,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) | |||
| 			spin_unlock(&fs_info->fs_roots_radix_lock); | ||||
| 			if (start_root_id > 0 && !cycled) { | ||||
| 				next_root_id = 0; | ||||
| 				ctx.last_root = 0; | ||||
| 				ctx.last_ino = 0; | ||||
| 				fs_info->em_shrinker_last_root = 0; | ||||
| 				fs_info->em_shrinker_last_ino = 0; | ||||
| 				cycled = true; | ||||
| 				continue; | ||||
| 			} | ||||
|  | @ -1320,29 +1309,40 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) | |||
| 		btrfs_put_root(root); | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * In case of multiple tasks running this extent map shrinking code this | ||||
| 	 * isn't perfect but it's simple and silences things like KCSAN. It's | ||||
| 	 * not possible to know which task made more progress because we can | ||||
| 	 * cycle back to the first root and first inode if it's not the first | ||||
| 	 * time the shrinker ran, see the above logic. Also a task that started | ||||
| 	 * later may finish ealier than another task and made less progress. So | ||||
| 	 * make this simple and update to the progress of the last task that | ||||
| 	 * finished, with the occasional possiblity of having two consecutive | ||||
| 	 * runs of the shrinker process the same inodes. | ||||
| 	 */ | ||||
| 	spin_lock(&fs_info->extent_map_shrinker_lock); | ||||
| 	fs_info->extent_map_shrinker_last_ino = ctx.last_ino; | ||||
| 	fs_info->extent_map_shrinker_last_root = ctx.last_root; | ||||
| 	spin_unlock(&fs_info->extent_map_shrinker_lock); | ||||
| 
 | ||||
| 	if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) { | ||||
| 		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); | ||||
| 
 | ||||
| 		trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, | ||||
| 							  nr, ctx.last_root, | ||||
| 							  ctx.last_ino); | ||||
| 		trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr); | ||||
| 	} | ||||
| 
 | ||||
| 	return nr_dropped; | ||||
| 	atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); | ||||
| } | ||||
| 
 | ||||
| void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) | ||||
| { | ||||
| 	/*
 | ||||
| 	 * Do nothing if the shrinker is already running. In case of high memory | ||||
| 	 * pressure we can have a lot of tasks calling us and all passing the | ||||
| 	 * same nr_to_scan value, but in reality we may need only to free | ||||
| 	 * nr_to_scan extent maps (or less). In case we need to free more than | ||||
| 	 * that, we will be called again by the fs shrinker, so no worries about | ||||
| 	 * not doing enough work to reclaim memory from extent maps. | ||||
| 	 * We can also be repeatedly called with the same nr_to_scan value | ||||
| 	 * simply because the shrinker runs asynchronously and multiple calls | ||||
| 	 * to this function are made before the shrinker makes enough progress. | ||||
| 	 * | ||||
| 	 * That's why we set the atomic counter to nr_to_scan only if its | ||||
| 	 * current value is zero, instead of incrementing the counter by | ||||
| 	 * nr_to_scan. | ||||
| 	 */ | ||||
| 	if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0) | ||||
| 		return; | ||||
| 
 | ||||
| 	queue_work(system_unbound_wq, &fs_info->em_shrinker_work); | ||||
| } | ||||
| 
 | ||||
| void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info) | ||||
| { | ||||
| 	atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0); | ||||
| 	INIT_WORK(&fs_info->em_shrinker_work, btrfs_extent_map_shrinker_worker); | ||||
| } | ||||
|  |  | |||
|  | @ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, | |||
| int btrfs_replace_extent_map_range(struct btrfs_inode *inode, | ||||
| 				   struct extent_map *new_em, | ||||
| 				   bool modified); | ||||
| long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); | ||||
| void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); | ||||
| void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info); | ||||
| 
 | ||||
| #endif | ||||
|  |  | |||
|  | @ -186,7 +186,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, | |||
| 			 * we have in the cache is the last delalloc range we | ||||
| 			 * found while the file extent item we found can be | ||||
| 			 * either for a whole delalloc range we previously | ||||
| 			 * emmitted or only a part of that range. | ||||
| 			 * emitted or only a part of that range. | ||||
| 			 * | ||||
| 			 * We have two cases here: | ||||
| 			 * | ||||
|  | @ -194,13 +194,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, | |||
| 			 *    cached extent's end. In this case just ignore the | ||||
| 			 *    current file extent item because we don't want to | ||||
| 			 *    overlap with previous ranges that may have been | ||||
| 			 *    emmitted already; | ||||
| 			 *    emitted already; | ||||
| 			 * | ||||
| 			 * 2) The file extent item starts behind the currently | ||||
| 			 *    cached extent but its end offset goes beyond the | ||||
| 			 *    end offset of the cached extent. We don't want to | ||||
| 			 *    overlap with a previous range that may have been | ||||
| 			 *    emmitted already, so we emit the currently cached | ||||
| 			 *    emitted already, so we emit the currently cached | ||||
| 			 *    extent and then partially store the current file | ||||
| 			 *    extent item's range in the cache, for the subrange | ||||
| 			 *    going the cached extent's end to the end of the | ||||
|  |  | |||
							
								
								
									
										351
									
								
								fs/btrfs/file.c
									
										
									
									
									
								
							
							
						
						
									
										351
									
								
								fs/btrfs/file.c
									
										
									
									
									
								
							|  | @ -37,33 +37,30 @@ | |||
| #include "file.h" | ||||
| #include "super.h" | ||||
| 
 | ||||
| /* simple helper to fault in pages and copy.  This should go away
 | ||||
|  * and be replaced with calls into generic code. | ||||
| /*
 | ||||
|  * Helper to fault in page and copy.  This should go away and be replaced with | ||||
|  * calls into generic code. | ||||
|  */ | ||||
| static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, | ||||
| 					 struct page **prepared_pages, | ||||
| 					 struct iov_iter *i) | ||||
| 					 struct folio *folio, struct iov_iter *i) | ||||
| { | ||||
| 	size_t copied = 0; | ||||
| 	size_t total_copied = 0; | ||||
| 	int pg = 0; | ||||
| 	int offset = offset_in_page(pos); | ||||
| 
 | ||||
| 	while (write_bytes > 0) { | ||||
| 		size_t count = min_t(size_t, | ||||
| 				     PAGE_SIZE - offset, write_bytes); | ||||
| 		struct page *page = prepared_pages[pg]; | ||||
| 		size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes); | ||||
| 		/*
 | ||||
| 		 * Copy data from userspace to the current page | ||||
| 		 */ | ||||
| 		copied = copy_page_from_iter_atomic(page, offset, count, i); | ||||
| 		copied = copy_folio_from_iter_atomic(folio, offset, count, i); | ||||
| 
 | ||||
| 		/* Flush processor's dcache for this page */ | ||||
| 		flush_dcache_page(page); | ||||
| 		flush_dcache_folio(folio); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * if we get a partial write, we can end up with | ||||
| 		 * partially up to date pages.  These add | ||||
| 		 * partially up to date page.  These add | ||||
| 		 * a lot of complexity, so make sure they don't | ||||
| 		 * happen by forcing this copy to be retried. | ||||
| 		 * | ||||
|  | @ -71,7 +68,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, | |||
| 		 * back to page at a time copies after we return 0. | ||||
| 		 */ | ||||
| 		if (unlikely(copied < count)) { | ||||
| 			if (!PageUptodate(page)) { | ||||
| 			if (!folio_test_uptodate(folio)) { | ||||
| 				iov_iter_revert(i, copied); | ||||
| 				copied = 0; | ||||
| 			} | ||||
|  | @ -82,54 +79,44 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, | |||
| 		write_bytes -= copied; | ||||
| 		total_copied += copied; | ||||
| 		offset += copied; | ||||
| 		if (offset == PAGE_SIZE) { | ||||
| 			pg++; | ||||
| 			offset = 0; | ||||
| 		} | ||||
| 	} | ||||
| 	return total_copied; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * unlocks pages after btrfs_file_write is done with them | ||||
|  * Unlock folio after btrfs_file_write() is done with it. | ||||
|  */ | ||||
| static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, | ||||
| 			     struct page **pages, size_t num_pages, | ||||
| static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio, | ||||
| 			     u64 pos, u64 copied) | ||||
| { | ||||
| 	size_t i; | ||||
| 	u64 block_start = round_down(pos, fs_info->sectorsize); | ||||
| 	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start; | ||||
| 
 | ||||
| 	ASSERT(block_len <= U32_MAX); | ||||
| 	for (i = 0; i < num_pages; i++) { | ||||
| 		/* page checked is some magic around finding pages that
 | ||||
| 		 * have been modified without going through btrfs_set_page_dirty | ||||
| 		 * clear it here. There should be no need to mark the pages | ||||
| 		 * accessed as prepare_pages should have marked them accessed | ||||
| 		 * in prepare_pages via find_or_create_page() | ||||
| 		 */ | ||||
| 		btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]), | ||||
| 						block_start, block_len); | ||||
| 		unlock_page(pages[i]); | ||||
| 		put_page(pages[i]); | ||||
| 	} | ||||
| 	/*
 | ||||
| 	 * Folio checked is some magic around finding folios that have been | ||||
| 	 * modified without going through btrfs_dirty_folio().  Clear it here. | ||||
| 	 * There should be no need to mark the pages accessed as | ||||
| 	 * prepare_one_folio() should have marked them accessed in | ||||
| 	 * prepare_one_folio() via find_or_create_page() | ||||
| 	 */ | ||||
| 	btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len); | ||||
| 	folio_unlock(folio); | ||||
| 	folio_put(folio); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * After btrfs_copy_from_user(), update the following things for delalloc: | ||||
|  * - Mark newly dirtied pages as DELALLOC in the io tree. | ||||
|  * - Mark newly dirtied folio as DELALLOC in the io tree. | ||||
|  *   Used to advise which range is to be written back. | ||||
|  * - Mark modified pages as Uptodate/Dirty and not needing COW fixup | ||||
|  * - Mark modified folio as Uptodate/Dirty and not needing COW fixup | ||||
|  * - Update inode size for past EOF write | ||||
|  */ | ||||
| int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, | ||||
| 		      size_t num_pages, loff_t pos, size_t write_bytes, | ||||
| 		      struct extent_state **cached, bool noreserve) | ||||
| int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos, | ||||
| 		      size_t write_bytes, struct extent_state **cached, bool noreserve) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||||
| 	int ret = 0; | ||||
| 	int i; | ||||
| 	u64 num_bytes; | ||||
| 	u64 start_pos; | ||||
| 	u64 end_of_last_block; | ||||
|  | @ -147,6 +134,8 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, | |||
| 	num_bytes = round_up(write_bytes + pos - start_pos, | ||||
| 			     fs_info->sectorsize); | ||||
| 	ASSERT(num_bytes <= U32_MAX); | ||||
| 	ASSERT(folio_pos(folio) <= pos && | ||||
| 	       folio_pos(folio) + folio_size(folio) >= pos + write_bytes); | ||||
| 
 | ||||
| 	end_of_last_block = start_pos + num_bytes - 1; | ||||
| 
 | ||||
|  | @ -163,16 +152,9 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, | |||
| 	if (ret) | ||||
| 		return ret; | ||||
| 
 | ||||
| 	for (i = 0; i < num_pages; i++) { | ||||
| 		struct page *p = pages[i]; | ||||
| 
 | ||||
| 		btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p), | ||||
| 					       start_pos, num_bytes); | ||||
| 		btrfs_folio_clamp_clear_checked(fs_info, page_folio(p), | ||||
| 						start_pos, num_bytes); | ||||
| 		btrfs_folio_clamp_set_dirty(fs_info, page_folio(p), | ||||
| 					    start_pos, num_bytes); | ||||
| 	} | ||||
| 	btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes); | ||||
| 	btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes); | ||||
| 	btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * we've only changed i_size in ram, and we haven't updated | ||||
|  | @ -851,55 +833,49 @@ out: | |||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * on error we return an unlocked page and the error value | ||||
|  * on success we return a locked page and 0 | ||||
|  * On error return an unlocked folio and the error value | ||||
|  * On success return a locked folio and 0 | ||||
|  */ | ||||
| static int prepare_uptodate_page(struct inode *inode, | ||||
| 				 struct page *page, u64 pos, | ||||
| 				 bool force_uptodate) | ||||
| static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos, | ||||
| 				  u64 len, bool force_uptodate) | ||||
| { | ||||
| 	struct folio *folio = page_folio(page); | ||||
| 	u64 clamp_start = max_t(u64, pos, folio_pos(folio)); | ||||
| 	u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); | ||||
| 	int ret = 0; | ||||
| 
 | ||||
| 	if (((pos & (PAGE_SIZE - 1)) || force_uptodate) && | ||||
| 	    !PageUptodate(page)) { | ||||
| 		ret = btrfs_read_folio(NULL, folio); | ||||
| 		if (ret) | ||||
| 			return ret; | ||||
| 		lock_page(page); | ||||
| 		if (!PageUptodate(page)) { | ||||
| 			unlock_page(page); | ||||
| 			return -EIO; | ||||
| 		} | ||||
| 	if (folio_test_uptodate(folio)) | ||||
| 		return 0; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Since btrfs_read_folio() will unlock the folio before it | ||||
| 		 * returns, there is a window where btrfs_release_folio() can be | ||||
| 		 * called to release the page.  Here we check both inode | ||||
| 		 * mapping and PagePrivate() to make sure the page was not | ||||
| 		 * released. | ||||
| 		 * | ||||
| 		 * The private flag check is essential for subpage as we need | ||||
| 		 * to store extra bitmap using folio private. | ||||
| 		 */ | ||||
| 		if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { | ||||
| 			unlock_page(page); | ||||
| 			return -EAGAIN; | ||||
| 		} | ||||
| 	if (!force_uptodate && | ||||
| 	    IS_ALIGNED(clamp_start, PAGE_SIZE) && | ||||
| 	    IS_ALIGNED(clamp_end, PAGE_SIZE)) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	ret = btrfs_read_folio(NULL, folio); | ||||
| 	if (ret) | ||||
| 		return ret; | ||||
| 	folio_lock(folio); | ||||
| 	if (!folio_test_uptodate(folio)) { | ||||
| 		folio_unlock(folio); | ||||
| 		return -EIO; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Since btrfs_read_folio() will unlock the folio before it returns, | ||||
| 	 * there is a window where btrfs_release_folio() can be called to | ||||
| 	 * release the page.  Here we check both inode mapping and page | ||||
| 	 * private to make sure the page was not released. | ||||
| 	 * | ||||
| 	 * The private flag check is essential for subpage as we need to store | ||||
| 	 * extra bitmap using folio private. | ||||
| 	 */ | ||||
| 	if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) { | ||||
| 		folio_unlock(folio); | ||||
| 		return -EAGAIN; | ||||
| 	} | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static fgf_t get_prepare_fgp_flags(bool nowait) | ||||
| { | ||||
| 	fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; | ||||
| 
 | ||||
| 	if (nowait) | ||||
| 		fgp_flags |= FGP_NOWAIT; | ||||
| 
 | ||||
| 	return fgp_flags; | ||||
| } | ||||
| 
 | ||||
| static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) | ||||
| { | ||||
| 	gfp_t gfp; | ||||
|  | @ -914,89 +890,67 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) | |||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * this just gets pages into the page cache and locks them down. | ||||
|  * Get folio into the page cache and lock it. | ||||
|  */ | ||||
| static noinline int prepare_pages(struct inode *inode, struct page **pages, | ||||
| 				  size_t num_pages, loff_t pos, | ||||
| 				  size_t write_bytes, bool force_uptodate, | ||||
| 				  bool nowait) | ||||
| static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret, | ||||
| 				      loff_t pos, size_t write_bytes, | ||||
| 				      bool force_uptodate, bool nowait) | ||||
| { | ||||
| 	int i; | ||||
| 	unsigned long index = pos >> PAGE_SHIFT; | ||||
| 	gfp_t mask = get_prepare_gfp_flags(inode, nowait); | ||||
| 	fgf_t fgp_flags = get_prepare_fgp_flags(nowait); | ||||
| 	fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN); | ||||
| 	struct folio *folio; | ||||
| 	int ret = 0; | ||||
| 	int faili; | ||||
| 
 | ||||
| 	for (i = 0; i < num_pages; i++) { | ||||
| again: | ||||
| 		pages[i] = pagecache_get_page(inode->i_mapping, index + i, | ||||
| 					      fgp_flags, mask | __GFP_WRITE); | ||||
| 		if (!pages[i]) { | ||||
| 			faili = i - 1; | ||||
| 			if (nowait) | ||||
| 				ret = -EAGAIN; | ||||
| 			else | ||||
| 				ret = -ENOMEM; | ||||
| 			goto fail; | ||||
| 		} | ||||
| 
 | ||||
| 		ret = set_page_extent_mapped(pages[i]); | ||||
| 		if (ret < 0) { | ||||
| 			faili = i; | ||||
| 			goto fail; | ||||
| 		} | ||||
| 
 | ||||
| 		if (i == 0) | ||||
| 			ret = prepare_uptodate_page(inode, pages[i], pos, | ||||
| 						    force_uptodate); | ||||
| 		if (!ret && i == num_pages - 1) | ||||
| 			ret = prepare_uptodate_page(inode, pages[i], | ||||
| 						    pos + write_bytes, false); | ||||
| 		if (ret) { | ||||
| 			put_page(pages[i]); | ||||
| 			if (!nowait && ret == -EAGAIN) { | ||||
| 				ret = 0; | ||||
| 				goto again; | ||||
| 			} | ||||
| 			faili = i - 1; | ||||
| 			goto fail; | ||||
| 		} | ||||
| 		wait_on_page_writeback(pages[i]); | ||||
| 	folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask); | ||||
| 	if (IS_ERR(folio)) { | ||||
| 		if (nowait) | ||||
| 			ret = -EAGAIN; | ||||
| 		else | ||||
| 			ret = PTR_ERR(folio); | ||||
| 		return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	/* Only page sized folios are supported for now. */ | ||||
| 	ASSERT(folio_order(folio) == 0); | ||||
| 	ret = set_folio_extent_mapped(folio); | ||||
| 	if (ret < 0) { | ||||
| 		folio_unlock(folio); | ||||
| 		folio_put(folio); | ||||
| 		return ret; | ||||
| 	} | ||||
| 	ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate); | ||||
| 	if (ret) { | ||||
| 		/* The folio is already unlocked. */ | ||||
| 		folio_put(folio); | ||||
| 		if (!nowait && ret == -EAGAIN) { | ||||
| 			ret = 0; | ||||
| 			goto again; | ||||
| 		} | ||||
| 		return ret; | ||||
| 	} | ||||
| 	*folio_ret = folio; | ||||
| 	return 0; | ||||
| fail: | ||||
| 	while (faili >= 0) { | ||||
| 		unlock_page(pages[faili]); | ||||
| 		put_page(pages[faili]); | ||||
| 		faili--; | ||||
| 	} | ||||
| 	return ret; | ||||
| 
 | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * This function locks the extent and properly waits for data=ordered extents | ||||
|  * to finish before allowing the pages to be modified if need. | ||||
|  * Locks the extent and properly waits for data=ordered extents to finish | ||||
|  * before allowing the folios to be modified if need. | ||||
|  * | ||||
|  * The return value: | ||||
|  * Return: | ||||
|  * 1 - the extent is locked | ||||
|  * 0 - the extent is not locked, and everything is OK | ||||
|  * -EAGAIN - need re-prepare the pages | ||||
|  * the other < 0 number - Something wrong happens | ||||
|  * -EAGAIN - need to prepare the folios again | ||||
|  */ | ||||
| static noinline int | ||||
| lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, | ||||
| 				size_t num_pages, loff_t pos, | ||||
| 				size_t write_bytes, | ||||
| lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, | ||||
| 				loff_t pos, size_t write_bytes, | ||||
| 				u64 *lockstart, u64 *lockend, bool nowait, | ||||
| 				struct extent_state **cached_state) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||||
| 	u64 start_pos; | ||||
| 	u64 last_pos; | ||||
| 	int i; | ||||
| 	int ret = 0; | ||||
| 
 | ||||
| 	start_pos = round_down(pos, fs_info->sectorsize); | ||||
|  | @ -1008,12 +962,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, | |||
| 		if (nowait) { | ||||
| 			if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, | ||||
| 					     cached_state)) { | ||||
| 				for (i = 0; i < num_pages; i++) { | ||||
| 					unlock_page(pages[i]); | ||||
| 					put_page(pages[i]); | ||||
| 					pages[i] = NULL; | ||||
| 				} | ||||
| 
 | ||||
| 				folio_unlock(folio); | ||||
| 				folio_put(folio); | ||||
| 				return -EAGAIN; | ||||
| 			} | ||||
| 		} else { | ||||
|  | @ -1027,10 +977,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, | |||
| 		    ordered->file_offset <= last_pos) { | ||||
| 			unlock_extent(&inode->io_tree, start_pos, last_pos, | ||||
| 				      cached_state); | ||||
| 			for (i = 0; i < num_pages; i++) { | ||||
| 				unlock_page(pages[i]); | ||||
| 				put_page(pages[i]); | ||||
| 			} | ||||
| 			folio_unlock(folio); | ||||
| 			folio_put(folio); | ||||
| 			btrfs_start_ordered_extent(ordered); | ||||
| 			btrfs_put_ordered_extent(ordered); | ||||
| 			return -EAGAIN; | ||||
|  | @ -1044,11 +992,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, | |||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * We should be called after prepare_pages() which should have locked | ||||
| 	 * We should be called after prepare_one_folio() which should have locked | ||||
| 	 * all pages in the range. | ||||
| 	 */ | ||||
| 	for (i = 0; i < num_pages; i++) | ||||
| 		WARN_ON(!PageLocked(pages[i])); | ||||
| 	WARN_ON(!folio_test_locked(folio)); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
|  | @ -1120,7 +1067,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) | |||
| 	btrfs_drew_write_unlock(&inode->root->snapshot_lock); | ||||
| } | ||||
| 
 | ||||
| int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) | ||||
| int btrfs_write_check(struct kiocb *iocb, size_t count) | ||||
| { | ||||
| 	struct file *file = iocb->ki_filp; | ||||
| 	struct inode *inode = file_inode(file); | ||||
|  | @ -1175,20 +1122,17 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) | |||
| 	loff_t pos; | ||||
| 	struct inode *inode = file_inode(file); | ||||
| 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); | ||||
| 	struct page **pages = NULL; | ||||
| 	struct extent_changeset *data_reserved = NULL; | ||||
| 	u64 release_bytes = 0; | ||||
| 	u64 lockstart; | ||||
| 	u64 lockend; | ||||
| 	size_t num_written = 0; | ||||
| 	int nrptrs; | ||||
| 	ssize_t ret; | ||||
| 	bool only_release_metadata = false; | ||||
| 	bool force_page_uptodate = false; | ||||
| 	loff_t old_isize = i_size_read(inode); | ||||
| 	unsigned int ilock_flags = 0; | ||||
| 	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); | ||||
| 	unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); | ||||
| 	bool only_release_metadata = false; | ||||
| 
 | ||||
| 	if (nowait) | ||||
| 		ilock_flags |= BTRFS_ILOCK_TRY; | ||||
|  | @ -1201,38 +1145,26 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) | |||
| 	if (ret <= 0) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	ret = btrfs_write_check(iocb, i, ret); | ||||
| 	ret = btrfs_write_check(iocb, ret); | ||||
| 	if (ret < 0) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	pos = iocb->ki_pos; | ||||
| 	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), | ||||
| 			PAGE_SIZE / (sizeof(struct page *))); | ||||
| 	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); | ||||
| 	nrptrs = max(nrptrs, 8); | ||||
| 	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); | ||||
| 	if (!pages) { | ||||
| 		ret = -ENOMEM; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	while (iov_iter_count(i) > 0) { | ||||
| 		struct extent_state *cached_state = NULL; | ||||
| 		size_t offset = offset_in_page(pos); | ||||
| 		size_t sector_offset; | ||||
| 		size_t write_bytes = min(iov_iter_count(i), | ||||
| 					 nrptrs * (size_t)PAGE_SIZE - | ||||
| 					 offset); | ||||
| 		size_t num_pages; | ||||
| 		size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset); | ||||
| 		size_t reserve_bytes; | ||||
| 		size_t dirty_pages; | ||||
| 		size_t copied; | ||||
| 		size_t dirty_sectors; | ||||
| 		size_t num_sectors; | ||||
| 		struct folio *folio = NULL; | ||||
| 		int extents_locked; | ||||
| 		bool force_page_uptodate = false; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Fault pages before locking them in prepare_pages | ||||
| 		 * Fault pages before locking them in prepare_one_folio() | ||||
| 		 * to avoid recursive lock | ||||
| 		 */ | ||||
| 		if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { | ||||
|  | @ -1271,8 +1203,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) | |||
| 			only_release_metadata = true; | ||||
| 		} | ||||
| 
 | ||||
| 		num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE); | ||||
| 		WARN_ON(num_pages > nrptrs); | ||||
| 		reserve_bytes = round_up(write_bytes + sector_offset, | ||||
| 					 fs_info->sectorsize); | ||||
| 		WARN_ON(reserve_bytes == 0); | ||||
|  | @ -1300,23 +1230,17 @@ again: | |||
| 			break; | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * This is going to setup the pages array with the number of | ||||
| 		 * pages we want, so we don't really need to worry about the | ||||
| 		 * contents of pages from loop to loop | ||||
| 		 */ | ||||
| 		ret = prepare_pages(inode, pages, num_pages, | ||||
| 				    pos, write_bytes, force_page_uptodate, false); | ||||
| 		ret = prepare_one_folio(inode, &folio, pos, write_bytes, | ||||
| 					force_page_uptodate, false); | ||||
| 		if (ret) { | ||||
| 			btrfs_delalloc_release_extents(BTRFS_I(inode), | ||||
| 						       reserve_bytes); | ||||
| 			break; | ||||
| 		} | ||||
| 
 | ||||
| 		extents_locked = lock_and_cleanup_extent_if_need( | ||||
| 				BTRFS_I(inode), pages, | ||||
| 				num_pages, pos, write_bytes, &lockstart, | ||||
| 				&lockend, nowait, &cached_state); | ||||
| 		extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode), | ||||
| 						folio, pos, write_bytes, &lockstart, | ||||
| 						&lockend, nowait, &cached_state); | ||||
| 		if (extents_locked < 0) { | ||||
| 			if (!nowait && extents_locked == -EAGAIN) | ||||
| 				goto again; | ||||
|  | @ -1327,28 +1251,18 @@ again: | |||
| 			break; | ||||
| 		} | ||||
| 
 | ||||
| 		copied = btrfs_copy_from_user(pos, write_bytes, pages, i); | ||||
| 		copied = btrfs_copy_from_user(pos, write_bytes, folio, i); | ||||
| 
 | ||||
| 		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); | ||||
| 		dirty_sectors = round_up(copied + sector_offset, | ||||
| 					fs_info->sectorsize); | ||||
| 		dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * if we have trouble faulting in the pages, fall | ||||
| 		 * back to one page at a time | ||||
| 		 */ | ||||
| 		if (copied < write_bytes) | ||||
| 			nrptrs = 1; | ||||
| 
 | ||||
| 		if (copied == 0) { | ||||
| 			force_page_uptodate = true; | ||||
| 			dirty_sectors = 0; | ||||
| 			dirty_pages = 0; | ||||
| 		} else { | ||||
| 			force_page_uptodate = false; | ||||
| 			dirty_pages = DIV_ROUND_UP(copied + offset, | ||||
| 						   PAGE_SIZE); | ||||
| 		} | ||||
| 
 | ||||
| 		if (num_sectors > dirty_sectors) { | ||||
|  | @ -1358,13 +1272,10 @@ again: | |||
| 				btrfs_delalloc_release_metadata(BTRFS_I(inode), | ||||
| 							release_bytes, true); | ||||
| 			} else { | ||||
| 				u64 __pos; | ||||
| 
 | ||||
| 				__pos = round_down(pos, | ||||
| 						   fs_info->sectorsize) + | ||||
| 					(dirty_pages << PAGE_SHIFT); | ||||
| 				u64 release_start = round_up(pos + copied, | ||||
| 							     fs_info->sectorsize); | ||||
| 				btrfs_delalloc_release_space(BTRFS_I(inode), | ||||
| 						data_reserved, __pos, | ||||
| 						data_reserved, release_start, | ||||
| 						release_bytes, true); | ||||
| 			} | ||||
| 		} | ||||
|  | @ -1372,15 +1283,14 @@ again: | |||
| 		release_bytes = round_up(copied + sector_offset, | ||||
| 					fs_info->sectorsize); | ||||
| 
 | ||||
| 		ret = btrfs_dirty_pages(BTRFS_I(inode), pages, | ||||
| 					dirty_pages, pos, copied, | ||||
| 		ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied, | ||||
| 					&cached_state, only_release_metadata); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * If we have not locked the extent range, because the range's | ||||
| 		 * start offset is >= i_size, we might still have a non-NULL | ||||
| 		 * cached extent state, acquired while marking the extent range | ||||
| 		 * as delalloc through btrfs_dirty_pages(). Therefore free any | ||||
| 		 * as delalloc through btrfs_dirty_folio(). Therefore free any | ||||
| 		 * possible cached extent state to avoid a memory leak. | ||||
| 		 */ | ||||
| 		if (extents_locked) | ||||
|  | @ -1391,7 +1301,7 @@ again: | |||
| 
 | ||||
| 		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); | ||||
| 		if (ret) { | ||||
| 			btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); | ||||
| 			btrfs_drop_folio(fs_info, folio, pos, copied); | ||||
| 			break; | ||||
| 		} | ||||
| 
 | ||||
|  | @ -1399,7 +1309,7 @@ again: | |||
| 		if (only_release_metadata) | ||||
| 			btrfs_check_nocow_unlock(BTRFS_I(inode)); | ||||
| 
 | ||||
| 		btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); | ||||
| 		btrfs_drop_folio(fs_info, folio, pos, copied); | ||||
| 
 | ||||
| 		cond_resched(); | ||||
| 
 | ||||
|  | @ -1407,8 +1317,6 @@ again: | |||
| 		num_written += copied; | ||||
| 	} | ||||
| 
 | ||||
| 	kfree(pages); | ||||
| 
 | ||||
| 	if (release_bytes) { | ||||
| 		if (only_release_metadata) { | ||||
| 			btrfs_check_nocow_unlock(BTRFS_I(inode)); | ||||
|  | @ -1453,7 +1361,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, | |||
| 	if (ret || encoded->len == 0) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	ret = btrfs_write_check(iocb, from, encoded->len); | ||||
| 	ret = btrfs_write_check(iocb, encoded->len); | ||||
| 	if (ret < 0) | ||||
| 		goto out; | ||||
| 
 | ||||
|  | @ -3785,6 +3693,7 @@ const struct file_operations btrfs_file_operations = { | |||
| 	.compat_ioctl	= btrfs_compat_ioctl, | ||||
| #endif | ||||
| 	.remap_file_range = btrfs_remap_file_range, | ||||
| 	.uring_cmd	= btrfs_uring_cmd, | ||||
| 	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, | ||||
| }; | ||||
| 
 | ||||
|  |  | |||
|  | @ -34,9 +34,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, | |||
| ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, | ||||
| 			    const struct btrfs_ioctl_encoded_io_args *encoded); | ||||
| int btrfs_release_file(struct inode *inode, struct file *file); | ||||
| int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, | ||||
| 		      size_t num_pages, loff_t pos, size_t write_bytes, | ||||
| 		      struct extent_state **cached, bool noreserve); | ||||
| int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos, | ||||
| 		      size_t write_bytes, struct extent_state **cached, bool noreserve); | ||||
| int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end); | ||||
| int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, | ||||
| 			   size_t *write_bytes, bool nowait); | ||||
|  | @ -44,7 +43,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode); | |||
| bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, | ||||
| 				  struct extent_state **cached_state, | ||||
| 				  u64 *delalloc_start_ret, u64 *delalloc_end_ret); | ||||
| int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count); | ||||
| int btrfs_write_check(struct kiocb *iocb, size_t count); | ||||
| ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i); | ||||
| 
 | ||||
| #endif | ||||
|  |  | |||
|  | @ -11,6 +11,7 @@ | |||
| #include <linux/ratelimit.h> | ||||
| #include <linux/error-injection.h> | ||||
| #include <linux/sched/mm.h> | ||||
| #include <linux/string_choices.h> | ||||
| #include "ctree.h" | ||||
| #include "fs.h" | ||||
| #include "messages.h" | ||||
|  | @ -1387,6 +1388,7 @@ static int __btrfs_write_out_cache(struct inode *inode, | |||
| 	int bitmaps = 0; | ||||
| 	int ret; | ||||
| 	int must_iput = 0; | ||||
| 	int i_size; | ||||
| 
 | ||||
| 	if (!i_size_read(inode)) | ||||
| 		return -EIO; | ||||
|  | @ -1457,11 +1459,16 @@ static int __btrfs_write_out_cache(struct inode *inode, | |||
| 	io_ctl_zero_remaining_pages(io_ctl); | ||||
| 
 | ||||
| 	/* Everything is written out, now we dirty the pages in the file. */ | ||||
| 	ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, | ||||
| 				io_ctl->num_pages, 0, i_size_read(inode), | ||||
| 				&cached_state, false); | ||||
| 	if (ret) | ||||
| 		goto out_nospc; | ||||
| 	i_size = i_size_read(inode); | ||||
| 	for (int i = 0; i < round_up(i_size, PAGE_SIZE) / PAGE_SIZE; i++) { | ||||
| 		u64 dirty_start = i * PAGE_SIZE; | ||||
| 		u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start; | ||||
| 
 | ||||
| 		ret = btrfs_dirty_folio(BTRFS_I(inode), page_folio(io_ctl->pages[i]), | ||||
| 					dirty_start, dirty_len, &cached_state, false); | ||||
| 		if (ret < 0) | ||||
| 			goto out_nospc; | ||||
| 	} | ||||
| 
 | ||||
| 	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) | ||||
| 		up_write(&block_group->data_rwsem); | ||||
|  | @ -2936,12 +2943,11 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, | |||
| 		if (info->bytes >= bytes && !block_group->ro) | ||||
| 			count++; | ||||
| 		btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", | ||||
| 			   info->offset, info->bytes, | ||||
| 		       (info->bitmap) ? "yes" : "no"); | ||||
| 			   info->offset, info->bytes, str_yes_no(info->bitmap)); | ||||
| 	} | ||||
| 	spin_unlock(&ctl->tree_lock); | ||||
| 	btrfs_info(fs_info, "block group has cluster?: %s", | ||||
| 	       list_empty(&block_group->cluster_list) ? "no" : "yes"); | ||||
| 	       str_no_yes(list_empty(&block_group->cluster_list))); | ||||
| 	btrfs_info(fs_info, | ||||
| 		   "%d free space entries at or bigger than %llu bytes", | ||||
| 		   count, bytes); | ||||
|  |  | |||
|  | @ -263,10 +263,10 @@ enum { | |||
| 	 BTRFS_FEATURE_INCOMPAT_ZONED		|	\ | ||||
| 	 BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) | ||||
| 
 | ||||
| #ifdef CONFIG_BTRFS_DEBUG | ||||
| #ifdef CONFIG_BTRFS_EXPERIMENTAL | ||||
| 	/*
 | ||||
| 	 * Features under development like Extent tree v2 support are enabled | ||||
| 	 * only under CONFIG_BTRFS_DEBUG. | ||||
| 	 * only under CONFIG_BTRFS_EXPERIMENTAL. | ||||
| 	 */ | ||||
| #define BTRFS_FEATURE_INCOMPAT_SUPP		\ | ||||
| 	(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE |	\ | ||||
|  | @ -317,6 +317,8 @@ struct btrfs_dev_replace { | |||
| 
 | ||||
| 	struct percpu_counter bio_counter; | ||||
| 	wait_queue_head_t replace_wait; | ||||
| 
 | ||||
| 	struct task_struct *replace_task; | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  | @ -633,9 +635,10 @@ struct btrfs_fs_info { | |||
| 	s32 delalloc_batch; | ||||
| 
 | ||||
| 	struct percpu_counter evictable_extent_maps; | ||||
| 	spinlock_t extent_map_shrinker_lock; | ||||
| 	u64 extent_map_shrinker_last_root; | ||||
| 	u64 extent_map_shrinker_last_ino; | ||||
| 	u64 em_shrinker_last_root; | ||||
| 	u64 em_shrinker_last_ino; | ||||
| 	atomic64_t em_shrinker_nr_to_scan; | ||||
| 	struct work_struct em_shrinker_work; | ||||
| 
 | ||||
| 	/* Protected by 'trans_lock'. */ | ||||
| 	struct list_head dirty_cowonly_roots; | ||||
|  | @ -876,12 +879,9 @@ struct btrfs_fs_info { | |||
| #endif | ||||
| }; | ||||
| 
 | ||||
| #define page_to_inode(_page)	(BTRFS_I(_Generic((_page),			\ | ||||
| 					  struct page *: (_page))->mapping->host)) | ||||
| #define folio_to_inode(_folio)	(BTRFS_I(_Generic((_folio),			\ | ||||
| 					  struct folio *: (_folio))->mapping->host)) | ||||
| 
 | ||||
| #define page_to_fs_info(_page)	 (page_to_inode(_page)->root->fs_info) | ||||
| #define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info) | ||||
| 
 | ||||
| #define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode),			\ | ||||
|  |  | |||
							
								
								
									
										495
									
								
								fs/btrfs/inode.c
									
										
									
									
									
								
							
							
						
						
									
										495
									
								
								fs/btrfs/inode.c
									
										
									
									
									
								
							|  | @ -421,7 +421,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, | |||
| 			index++; | ||||
| 			continue; | ||||
| 		} | ||||
| 		folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); | ||||
| 		folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); | ||||
| 		index++; | ||||
| 		if (IS_ERR(folio)) | ||||
| 			continue; | ||||
|  | @ -556,8 +556,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, | |||
| 	} else { | ||||
| 		struct folio *folio; | ||||
| 
 | ||||
| 		folio = __filemap_get_folio(inode->vfs_inode.i_mapping, | ||||
| 					    0, 0, 0); | ||||
| 		folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0); | ||||
| 		ASSERT(!IS_ERR(folio)); | ||||
| 		btrfs_set_file_extent_compression(leaf, ei, 0); | ||||
| 		kaddr = kmap_local_folio(folio, 0); | ||||
|  | @ -646,7 +645,7 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode, | |||
|  * If being used directly, you must have already checked we're allowed to cow | ||||
|  * the range by getting true from can_cow_file_range_inline(). | ||||
|  */ | ||||
| static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset, | ||||
| static noinline int __cow_file_range_inline(struct btrfs_inode *inode, | ||||
| 					    u64 size, size_t compressed_size, | ||||
| 					    int compress_type, | ||||
| 					    struct folio *compressed_folio, | ||||
|  | @ -736,7 +735,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, | |||
| 		return 1; | ||||
| 
 | ||||
| 	lock_extent(&inode->io_tree, offset, end, &cached); | ||||
| 	ret = __cow_file_range_inline(inode, offset, size, compressed_size, | ||||
| 	ret = __cow_file_range_inline(inode, size, compressed_size, | ||||
| 				      compress_type, compressed_folio, | ||||
| 				      update_i_size); | ||||
| 	if (ret > 0) { | ||||
|  | @ -832,32 +831,16 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, | |||
| 		return 0; | ||||
| 	} | ||||
| 	/*
 | ||||
| 	 * Special check for subpage. | ||||
| 	 * Only enable sector perfect compression for experimental builds. | ||||
| 	 * | ||||
| 	 * We lock the full page then run each delalloc range in the page, thus | ||||
| 	 * for the following case, we will hit some subpage specific corner case: | ||||
| 	 * This is a big feature change for subpage cases, and can hit | ||||
| 	 * different corner cases, so limit this feature to | ||||
| 	 * experimental builds for now. | ||||
| 	 * | ||||
| 	 * 0		32K		64K | ||||
| 	 * |	|///////|	|///////|
 | ||||
| 	 *		\- A		\- B | ||||
| 	 * | ||||
| 	 * In above case, both range A and range B will try to unlock the full | ||||
| 	 * page [0, 64K), causing the one finished later will have page | ||||
| 	 * unlocked already, triggering various page lock requirement BUG_ON()s. | ||||
| 	 * | ||||
| 	 * So here we add an artificial limit that subpage compression can only | ||||
| 	 * if the range is fully page aligned. | ||||
| 	 * | ||||
| 	 * In theory we only need to ensure the first page is fully covered, but | ||||
| 	 * the tailing partial page will be locked until the full compression | ||||
| 	 * finishes, delaying the write of other range. | ||||
| 	 * | ||||
| 	 * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range | ||||
| 	 * first to prevent any submitted async extent to unlock the full page. | ||||
| 	 * By this, we can ensure for subpage case that only the last async_cow | ||||
| 	 * will unlock the full page. | ||||
| 	 * ETA for moving this out of experimental builds is 6.15. | ||||
| 	 */ | ||||
| 	if (fs_info->sectorsize < PAGE_SIZE) { | ||||
| 	if (fs_info->sectorsize < PAGE_SIZE && | ||||
| 	    !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { | ||||
| 		if (!PAGE_ALIGNED(start) || | ||||
| 		    !PAGE_ALIGNED(end + 1)) | ||||
| 			return 0; | ||||
|  | @ -896,13 +879,14 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e | |||
| 
 | ||||
| 	for (unsigned long index = start >> PAGE_SHIFT; | ||||
| 	     index <= end_index; index++) { | ||||
| 		folio = __filemap_get_folio(inode->i_mapping, index, 0, 0); | ||||
| 		folio = filemap_get_folio(inode->i_mapping, index); | ||||
| 		if (IS_ERR(folio)) { | ||||
| 			if (!ret) | ||||
| 				ret = PTR_ERR(folio); | ||||
| 			continue; | ||||
| 		} | ||||
| 		folio_clear_dirty_for_io(folio); | ||||
| 		btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start, | ||||
| 					      end + 1 - start); | ||||
| 		folio_put(folio); | ||||
| 	} | ||||
| 	return ret; | ||||
|  | @ -1001,17 +985,6 @@ again: | |||
| 	   (start > 0 || end + 1 < inode->disk_i_size)) | ||||
| 		goto cleanup_and_bail_uncompressed; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * For subpage case, we require full page alignment for the sector | ||||
| 	 * aligned range. | ||||
| 	 * Thus we must also check against @actual_end, not just @end. | ||||
| 	 */ | ||||
| 	if (blocksize < PAGE_SIZE) { | ||||
| 		if (!PAGE_ALIGNED(start) || | ||||
| 		    !PAGE_ALIGNED(round_up(actual_end, blocksize))) | ||||
| 			goto cleanup_and_bail_uncompressed; | ||||
| 	} | ||||
| 
 | ||||
| 	total_compressed = min_t(unsigned long, total_compressed, | ||||
| 			BTRFS_MAX_UNCOMPRESSED); | ||||
| 	total_in = 0; | ||||
|  | @ -1359,7 +1332,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, | |||
| 	u64 alloc_hint = 0; | ||||
| 	u64 orig_start = start; | ||||
| 	u64 num_bytes; | ||||
| 	unsigned long ram_size; | ||||
| 	u64 cur_alloc_size = 0; | ||||
| 	u64 min_alloc_size; | ||||
| 	u64 blocksize = fs_info->sectorsize; | ||||
|  | @ -1367,7 +1339,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, | |||
| 	struct extent_map *em; | ||||
| 	unsigned clear_bits; | ||||
| 	unsigned long page_ops; | ||||
| 	bool extent_reserved = false; | ||||
| 	int ret = 0; | ||||
| 
 | ||||
| 	if (btrfs_is_free_space_inode(inode)) { | ||||
|  | @ -1421,8 +1392,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, | |||
| 		struct btrfs_ordered_extent *ordered; | ||||
| 		struct btrfs_file_extent file_extent; | ||||
| 
 | ||||
| 		cur_alloc_size = num_bytes; | ||||
| 		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, | ||||
| 		ret = btrfs_reserve_extent(root, num_bytes, num_bytes, | ||||
| 					   min_alloc_size, 0, alloc_hint, | ||||
| 					   &ins, 1, 1); | ||||
| 		if (ret == -EAGAIN) { | ||||
|  | @ -1453,9 +1423,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, | |||
| 		if (ret < 0) | ||||
| 			goto out_unlock; | ||||
| 		cur_alloc_size = ins.offset; | ||||
| 		extent_reserved = true; | ||||
| 
 | ||||
| 		ram_size = ins.offset; | ||||
| 		file_extent.disk_bytenr = ins.objectid; | ||||
| 		file_extent.disk_num_bytes = ins.offset; | ||||
| 		file_extent.num_bytes = ins.offset; | ||||
|  | @ -1463,14 +1431,14 @@ static noinline int cow_file_range(struct btrfs_inode *inode, | |||
| 		file_extent.offset = 0; | ||||
| 		file_extent.compression = BTRFS_COMPRESS_NONE; | ||||
| 
 | ||||
| 		lock_extent(&inode->io_tree, start, start + ram_size - 1, | ||||
| 		lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, | ||||
| 			    &cached); | ||||
| 
 | ||||
| 		em = btrfs_create_io_em(inode, start, &file_extent, | ||||
| 					BTRFS_ORDERED_REGULAR); | ||||
| 		if (IS_ERR(em)) { | ||||
| 			unlock_extent(&inode->io_tree, start, | ||||
| 				      start + ram_size - 1, &cached); | ||||
| 				      start + cur_alloc_size - 1, &cached); | ||||
| 			ret = PTR_ERR(em); | ||||
| 			goto out_reserve; | ||||
| 		} | ||||
|  | @ -1480,7 +1448,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, | |||
| 						     1 << BTRFS_ORDERED_REGULAR); | ||||
| 		if (IS_ERR(ordered)) { | ||||
| 			unlock_extent(&inode->io_tree, start, | ||||
| 				      start + ram_size - 1, &cached); | ||||
| 				      start + cur_alloc_size - 1, &cached); | ||||
| 			ret = PTR_ERR(ordered); | ||||
| 			goto out_drop_extent_cache; | ||||
| 		} | ||||
|  | @ -1501,7 +1469,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, | |||
| 			 */ | ||||
| 			if (ret) | ||||
| 				btrfs_drop_extent_map_range(inode, start, | ||||
| 							    start + ram_size - 1, | ||||
| 							    start + cur_alloc_size - 1, | ||||
| 							    false); | ||||
| 		} | ||||
| 		btrfs_put_ordered_extent(ordered); | ||||
|  | @ -1519,7 +1487,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, | |||
| 		page_ops = (keep_locked ? 0 : PAGE_UNLOCK); | ||||
| 		page_ops |= PAGE_SET_ORDERED; | ||||
| 
 | ||||
| 		extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, | ||||
| 		extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, | ||||
| 					     locked_folio, &cached, | ||||
| 					     EXTENT_LOCKED | EXTENT_DELALLOC, | ||||
| 					     page_ops); | ||||
|  | @ -1529,7 +1497,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, | |||
| 			num_bytes -= cur_alloc_size; | ||||
| 		alloc_hint = ins.objectid + ins.offset; | ||||
| 		start += cur_alloc_size; | ||||
| 		extent_reserved = false; | ||||
| 		cur_alloc_size = 0; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * btrfs_reloc_clone_csums() error, since start is increased | ||||
|  | @ -1545,7 +1513,7 @@ done: | |||
| 	return ret; | ||||
| 
 | ||||
| out_drop_extent_cache: | ||||
| 	btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); | ||||
| 	btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); | ||||
| out_reserve: | ||||
| 	btrfs_dec_block_group_reservations(fs_info, ins.objectid); | ||||
| 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); | ||||
|  | @ -1599,13 +1567,12 @@ out_unlock: | |||
| 	 * to decrement again the data space_info's bytes_may_use counter, | ||||
| 	 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. | ||||
| 	 */ | ||||
| 	if (extent_reserved) { | ||||
| 	if (cur_alloc_size) { | ||||
| 		extent_clear_unlock_delalloc(inode, start, | ||||
| 					     start + cur_alloc_size - 1, | ||||
| 					     locked_folio, &cached, clear_bits, | ||||
| 					     page_ops); | ||||
| 		btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); | ||||
| 		start += cur_alloc_size; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
|  | @ -1614,11 +1581,13 @@ out_unlock: | |||
| 	 * space_info's bytes_may_use counter, reserved in | ||||
| 	 * btrfs_check_data_free_space(). | ||||
| 	 */ | ||||
| 	if (start < end) { | ||||
| 	if (start + cur_alloc_size < end) { | ||||
| 		clear_bits |= EXTENT_CLEAR_DATA_RESV; | ||||
| 		extent_clear_unlock_delalloc(inode, start, end, locked_folio, | ||||
| 		extent_clear_unlock_delalloc(inode, start + cur_alloc_size, | ||||
| 					     end, locked_folio, | ||||
| 					     &cached, clear_bits, page_ops); | ||||
| 		btrfs_qgroup_free_data(inode, NULL, start, end - start + 1, NULL); | ||||
| 		btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, | ||||
| 				       end - start - cur_alloc_size + 1, NULL); | ||||
| 	} | ||||
| 	return ret; | ||||
| } | ||||
|  | @ -3094,34 +3063,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) | |||
| 			goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { | ||||
| 		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ | ||||
| 
 | ||||
| 		btrfs_inode_safe_disk_i_size_write(inode, 0); | ||||
| 		if (freespace_inode) | ||||
| 			trans = btrfs_join_transaction_spacecache(root); | ||||
| 		else | ||||
| 			trans = btrfs_join_transaction(root); | ||||
| 		if (IS_ERR(trans)) { | ||||
| 			ret = PTR_ERR(trans); | ||||
| 			trans = NULL; | ||||
| 			goto out; | ||||
| 		} | ||||
| 		trans->block_rsv = &inode->block_rsv; | ||||
| 		ret = btrfs_update_inode_fallback(trans, inode); | ||||
| 		if (ret) /* -ENOMEM or corruption */ | ||||
| 			btrfs_abort_transaction(trans, ret); | ||||
| 
 | ||||
| 		ret = btrfs_insert_raid_extent(trans, ordered_extent); | ||||
| 		if (ret) | ||||
| 			btrfs_abort_transaction(trans, ret); | ||||
| 
 | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	clear_bits |= EXTENT_LOCKED; | ||||
| 	lock_extent(io_tree, start, end, &cached_state); | ||||
| 
 | ||||
| 	if (freespace_inode) | ||||
| 		trans = btrfs_join_transaction_spacecache(root); | ||||
| 	else | ||||
|  | @ -3135,8 +3076,31 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) | |||
| 	trans->block_rsv = &inode->block_rsv; | ||||
| 
 | ||||
| 	ret = btrfs_insert_raid_extent(trans, ordered_extent); | ||||
| 	if (ret) | ||||
| 	if (ret) { | ||||
| 		btrfs_abort_transaction(trans, ret); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { | ||||
| 		/* Logic error */ | ||||
| 		ASSERT(list_empty(&ordered_extent->list)); | ||||
| 		if (!list_empty(&ordered_extent->list)) { | ||||
| 			ret = -EINVAL; | ||||
| 			btrfs_abort_transaction(trans, ret); | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		btrfs_inode_safe_disk_i_size_write(inode, 0); | ||||
| 		ret = btrfs_update_inode_fallback(trans, inode); | ||||
| 		if (ret) { | ||||
| 			/* -ENOMEM or corruption */ | ||||
| 			btrfs_abort_transaction(trans, ret); | ||||
| 		} | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	clear_bits |= EXTENT_LOCKED; | ||||
| 	lock_extent(io_tree, start, end, &cached_state); | ||||
| 
 | ||||
| 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) | ||||
| 		compress_type = ordered_extent->compress_type; | ||||
|  | @ -3791,14 +3755,45 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) | |||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) | ||||
| { | ||||
| 	struct btrfs_root *root = inode->root; | ||||
| 	struct btrfs_inode *existing; | ||||
| 	const u64 ino = btrfs_ino(inode); | ||||
| 	int ret; | ||||
| 
 | ||||
| 	if (inode_unhashed(&inode->vfs_inode)) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	if (prealloc) { | ||||
| 		ret = xa_reserve(&root->inodes, ino, GFP_NOFS); | ||||
| 		if (ret) | ||||
| 			return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); | ||||
| 
 | ||||
| 	if (xa_is_err(existing)) { | ||||
| 		ret = xa_err(existing); | ||||
| 		ASSERT(ret != -EINVAL); | ||||
| 		ASSERT(ret != -ENOMEM); | ||||
| 		return ret; | ||||
| 	} else if (existing) { | ||||
| 		WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * read an inode from the btree into the in-memory inode | ||||
|  * Read a locked inode from the btree into the in-memory inode and add it to | ||||
|  * its root list/tree. | ||||
|  * | ||||
|  * On failure clean up the inode. | ||||
|  */ | ||||
| static int btrfs_read_locked_inode(struct inode *inode, | ||||
| 				   struct btrfs_path *in_path) | ||||
| static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); | ||||
| 	struct btrfs_path *path = in_path; | ||||
| 	struct extent_buffer *leaf; | ||||
| 	struct btrfs_inode_item *inode_item; | ||||
| 	struct btrfs_root *root = BTRFS_I(inode)->root; | ||||
|  | @ -3812,25 +3807,25 @@ static int btrfs_read_locked_inode(struct inode *inode, | |||
| 
 | ||||
| 	ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); | ||||
| 	if (ret) | ||||
| 		return ret; | ||||
| 		goto out; | ||||
| 
 | ||||
| 	ret = btrfs_fill_inode(inode, &rdev); | ||||
| 	if (!ret) | ||||
| 		filled = true; | ||||
| 
 | ||||
| 	if (!path) { | ||||
| 		path = btrfs_alloc_path(); | ||||
| 		if (!path) | ||||
| 			return -ENOMEM; | ||||
| 	} | ||||
| 	ASSERT(path); | ||||
| 
 | ||||
| 	btrfs_get_inode_key(BTRFS_I(inode), &location); | ||||
| 
 | ||||
| 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0); | ||||
| 	if (ret) { | ||||
| 		if (path != in_path) | ||||
| 			btrfs_free_path(path); | ||||
| 		return ret; | ||||
| 		/*
 | ||||
| 		 * ret > 0 can come from btrfs_search_slot called by | ||||
| 		 * btrfs_lookup_inode(), this means the inode was not found. | ||||
| 		 */ | ||||
| 		if (ret > 0) | ||||
| 			ret = -ENOENT; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	leaf = path->nodes[0]; | ||||
|  | @ -3965,8 +3960,6 @@ cache_acl: | |||
| 				  btrfs_ino(BTRFS_I(inode)), | ||||
| 				  btrfs_root_id(root), ret); | ||||
| 	} | ||||
| 	if (path != in_path) | ||||
| 		btrfs_free_path(path); | ||||
| 
 | ||||
| 	if (!maybe_acls) | ||||
| 		cache_no_acl(inode); | ||||
|  | @ -3993,7 +3986,15 @@ cache_acl: | |||
| 	} | ||||
| 
 | ||||
| 	btrfs_sync_inode_flags_to_i_flags(inode); | ||||
| 
 | ||||
| 	ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); | ||||
| 	if (ret) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	return 0; | ||||
| out: | ||||
| 	iget_failed(inode); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  | @ -5502,35 +5503,7 @@ out: | |||
| 	return err; | ||||
| } | ||||
| 
 | ||||
| static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) | ||||
| { | ||||
| 	struct btrfs_root *root = inode->root; | ||||
| 	struct btrfs_inode *existing; | ||||
| 	const u64 ino = btrfs_ino(inode); | ||||
| 	int ret; | ||||
| 
 | ||||
| 	if (inode_unhashed(&inode->vfs_inode)) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	if (prealloc) { | ||||
| 		ret = xa_reserve(&root->inodes, ino, GFP_NOFS); | ||||
| 		if (ret) | ||||
| 			return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); | ||||
| 
 | ||||
| 	if (xa_is_err(existing)) { | ||||
| 		ret = xa_err(existing); | ||||
| 		ASSERT(ret != -EINVAL); | ||||
| 		ASSERT(ret != -ENOMEM); | ||||
| 		return ret; | ||||
| 	} else if (existing) { | ||||
| 		WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static void btrfs_del_inode_from_root(struct btrfs_inode *inode) | ||||
| { | ||||
|  | @ -5592,10 +5565,8 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) | |||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Get an inode object given its inode number and corresponding root. | ||||
|  * Path can be preallocated to prevent recursing back to iget through | ||||
|  * allocator. NULL is also valid but may require an additional allocation | ||||
|  * later. | ||||
|  * Get an inode object given its inode number and corresponding root.  Path is | ||||
|  * preallocated to prevent recursing back to iget through allocator. | ||||
|  */ | ||||
| struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, | ||||
| 			      struct btrfs_path *path) | ||||
|  | @ -5611,30 +5582,40 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, | |||
| 		return inode; | ||||
| 
 | ||||
| 	ret = btrfs_read_locked_inode(inode, path); | ||||
| 	/*
 | ||||
| 	 * ret > 0 can come from btrfs_search_slot called by | ||||
| 	 * btrfs_read_locked_inode(), this means the inode item was not found. | ||||
| 	 */ | ||||
| 	if (ret > 0) | ||||
| 		ret = -ENOENT; | ||||
| 	if (ret < 0) | ||||
| 		goto error; | ||||
| 
 | ||||
| 	ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); | ||||
| 	if (ret < 0) | ||||
| 		goto error; | ||||
| 	if (ret) | ||||
| 		return ERR_PTR(ret); | ||||
| 
 | ||||
| 	unlock_new_inode(inode); | ||||
| 
 | ||||
| 	return inode; | ||||
| error: | ||||
| 	iget_failed(inode); | ||||
| 	return ERR_PTR(ret); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Get an inode object given its inode number and corresponding root. | ||||
|  */ | ||||
| struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) | ||||
| { | ||||
| 	return btrfs_iget_path(ino, root, NULL); | ||||
| 	struct inode *inode; | ||||
| 	struct btrfs_path *path; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	inode = btrfs_iget_locked(ino, root); | ||||
| 	if (!inode) | ||||
| 		return ERR_PTR(-ENOMEM); | ||||
| 
 | ||||
| 	if (!(inode->i_state & I_NEW)) | ||||
| 		return inode; | ||||
| 
 | ||||
| 	path = btrfs_alloc_path(); | ||||
| 	if (!path) | ||||
| 		return ERR_PTR(-ENOMEM); | ||||
| 
 | ||||
| 	ret = btrfs_read_locked_inode(inode, path); | ||||
| 	btrfs_free_path(path); | ||||
| 	if (ret) | ||||
| 		return ERR_PTR(ret); | ||||
| 
 | ||||
| 	unlock_new_inode(inode); | ||||
| 	return inode; | ||||
| } | ||||
| 
 | ||||
| static struct inode *new_simple_dir(struct inode *dir, | ||||
|  | @ -6023,7 +6004,7 @@ again: | |||
| 	 * offset.  This means that new entries created during readdir | ||||
| 	 * are *guaranteed* to be seen in the future by that readdir. | ||||
| 	 * This has broken buggy programs which operate on names as | ||||
| 	 * they're returned by readdir.  Until we re-use freed offsets | ||||
| 	 * they're returned by readdir.  Until we reuse freed offsets | ||||
| 	 * we have this hack to stop new entries from being returned | ||||
| 	 * under the assumption that they'll never reach this huge | ||||
| 	 * offset. | ||||
|  | @ -6765,8 +6746,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, | |||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, | ||||
| 			      struct folio *folio) | ||||
| static int read_inline_extent(struct btrfs_path *path, struct folio *folio) | ||||
| { | ||||
| 	struct btrfs_file_extent_item *fi; | ||||
| 	void *kaddr; | ||||
|  | @ -6964,7 +6944,7 @@ next: | |||
| 		ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); | ||||
| 		ASSERT(em->len == fs_info->sectorsize); | ||||
| 
 | ||||
| 		ret = read_inline_extent(inode, path, folio); | ||||
| 		ret = read_inline_extent(path, folio); | ||||
| 		if (ret < 0) | ||||
| 			goto out; | ||||
| 		goto insert; | ||||
|  | @ -8972,28 +8952,6 @@ out_inode: | |||
| 	return finish_open_simple(file, ret); | ||||
| } | ||||
| 
 | ||||
| void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||||
| 	unsigned long index = start >> PAGE_SHIFT; | ||||
| 	unsigned long end_index = end >> PAGE_SHIFT; | ||||
| 	struct folio *folio; | ||||
| 	u32 len; | ||||
| 
 | ||||
| 	ASSERT(end + 1 - start <= U32_MAX); | ||||
| 	len = end + 1 - start; | ||||
| 	while (index <= end_index) { | ||||
| 		folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); | ||||
| 		ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */ | ||||
| 
 | ||||
| 		/* This is for data, which doesn't yet support larger folio. */ | ||||
| 		ASSERT(folio_order(folio) == 0); | ||||
| 		btrfs_folio_set_writeback(fs_info, folio, start, len); | ||||
| 		folio_put(folio); | ||||
| 		index++; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, | ||||
| 					     int compress_type) | ||||
| { | ||||
|  | @ -9038,12 +8996,16 @@ static ssize_t btrfs_encoded_read_inline( | |||
| 	unsigned long ptr; | ||||
| 	void *tmp; | ||||
| 	ssize_t ret; | ||||
| 	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); | ||||
| 
 | ||||
| 	path = btrfs_alloc_path(); | ||||
| 	if (!path) { | ||||
| 		ret = -ENOMEM; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	path->nowait = nowait; | ||||
| 
 | ||||
| 	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), | ||||
| 				       extent_start, 0); | ||||
| 	if (ret) { | ||||
|  | @ -9107,6 +9069,7 @@ out: | |||
| 
 | ||||
| struct btrfs_encoded_read_private { | ||||
| 	wait_queue_head_t wait; | ||||
| 	void *uring_ctx; | ||||
| 	atomic_t pending; | ||||
| 	blk_status_t status; | ||||
| }; | ||||
|  | @ -9126,26 +9089,40 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) | |||
| 		 */ | ||||
| 		WRITE_ONCE(priv->status, bbio->bio.bi_status); | ||||
| 	} | ||||
| 	if (!atomic_dec_return(&priv->pending)) | ||||
| 		wake_up(&priv->wait); | ||||
| 	if (atomic_dec_return(&priv->pending) == 0) { | ||||
| 		int err = blk_status_to_errno(READ_ONCE(priv->status)); | ||||
| 
 | ||||
| 		if (priv->uring_ctx) { | ||||
| 			btrfs_uring_read_extent_endio(priv->uring_ctx, err); | ||||
| 			kfree(priv); | ||||
| 		} else { | ||||
| 			wake_up(&priv->wait); | ||||
| 		} | ||||
| 	} | ||||
| 	bio_put(&bbio->bio); | ||||
| } | ||||
| 
 | ||||
| int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, | ||||
| 					  u64 file_offset, u64 disk_bytenr, | ||||
| 					  u64 disk_io_size, struct page **pages) | ||||
| 					  u64 disk_bytenr, u64 disk_io_size, | ||||
| 					  struct page **pages, void *uring_ctx) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||||
| 	struct btrfs_encoded_read_private priv = { | ||||
| 		.pending = ATOMIC_INIT(1), | ||||
| 	}; | ||||
| 	struct btrfs_encoded_read_private *priv; | ||||
| 	unsigned long i = 0; | ||||
| 	struct btrfs_bio *bbio; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	init_waitqueue_head(&priv.wait); | ||||
| 	priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); | ||||
| 	if (!priv) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	init_waitqueue_head(&priv->wait); | ||||
| 	atomic_set(&priv->pending, 1); | ||||
| 	priv->status = 0; | ||||
| 	priv->uring_ctx = uring_ctx; | ||||
| 
 | ||||
| 	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, | ||||
| 			       btrfs_encoded_read_endio, &priv); | ||||
| 			       btrfs_encoded_read_endio, priv); | ||||
| 	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; | ||||
| 	bbio->inode = inode; | ||||
| 
 | ||||
|  | @ -9153,11 +9130,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, | |||
| 		size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); | ||||
| 
 | ||||
| 		if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { | ||||
| 			atomic_inc(&priv.pending); | ||||
| 			atomic_inc(&priv->pending); | ||||
| 			btrfs_submit_bbio(bbio, 0); | ||||
| 
 | ||||
| 			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, | ||||
| 					       btrfs_encoded_read_endio, &priv); | ||||
| 					       btrfs_encoded_read_endio, priv); | ||||
| 			bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; | ||||
| 			bbio->inode = inode; | ||||
| 			continue; | ||||
|  | @ -9168,22 +9145,33 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, | |||
| 		disk_io_size -= bytes; | ||||
| 	} while (disk_io_size); | ||||
| 
 | ||||
| 	atomic_inc(&priv.pending); | ||||
| 	atomic_inc(&priv->pending); | ||||
| 	btrfs_submit_bbio(bbio, 0); | ||||
| 
 | ||||
| 	if (atomic_dec_return(&priv.pending)) | ||||
| 		io_wait_event(priv.wait, !atomic_read(&priv.pending)); | ||||
| 	/* See btrfs_encoded_read_endio() for ordering. */ | ||||
| 	return blk_status_to_errno(READ_ONCE(priv.status)); | ||||
| 	if (uring_ctx) { | ||||
| 		if (atomic_dec_return(&priv->pending) == 0) { | ||||
| 			ret = blk_status_to_errno(READ_ONCE(priv->status)); | ||||
| 			btrfs_uring_read_extent_endio(uring_ctx, ret); | ||||
| 			kfree(priv); | ||||
| 			return ret; | ||||
| 		} | ||||
| 
 | ||||
| 		return -EIOCBQUEUED; | ||||
| 	} else { | ||||
| 		if (atomic_dec_return(&priv->pending) != 0) | ||||
| 			io_wait_event(priv->wait, !atomic_read(&priv->pending)); | ||||
| 		/* See btrfs_encoded_read_endio() for ordering. */ | ||||
| 		ret = blk_status_to_errno(READ_ONCE(priv->status)); | ||||
| 		kfree(priv); | ||||
| 		return ret; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, | ||||
| 					  struct iov_iter *iter, | ||||
| 					  u64 start, u64 lockend, | ||||
| 					  struct extent_state **cached_state, | ||||
| 					  u64 disk_bytenr, u64 disk_io_size, | ||||
| 					  size_t count, bool compressed, | ||||
| 					  bool *unlocked) | ||||
| ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, | ||||
| 				   u64 start, u64 lockend, | ||||
| 				   struct extent_state **cached_state, | ||||
| 				   u64 disk_bytenr, u64 disk_io_size, | ||||
| 				   size_t count, bool compressed, bool *unlocked) | ||||
| { | ||||
| 	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); | ||||
| 	struct extent_io_tree *io_tree = &inode->io_tree; | ||||
|  | @ -9203,8 +9191,8 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, | |||
| 		goto out; | ||||
| 		} | ||||
| 
 | ||||
| 	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, | ||||
| 						    disk_io_size, pages); | ||||
| 	ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, | ||||
| 						    disk_io_size, pages, NULL); | ||||
| 	if (ret) | ||||
| 		goto out; | ||||
| 
 | ||||
|  | @ -9244,21 +9232,26 @@ out: | |||
| } | ||||
| 
 | ||||
| ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, | ||||
| 			   struct btrfs_ioctl_encoded_io_args *encoded) | ||||
| 			   struct btrfs_ioctl_encoded_io_args *encoded, | ||||
| 			   struct extent_state **cached_state, | ||||
| 			   u64 *disk_bytenr, u64 *disk_io_size) | ||||
| { | ||||
| 	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); | ||||
| 	struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||||
| 	struct extent_io_tree *io_tree = &inode->io_tree; | ||||
| 	ssize_t ret; | ||||
| 	size_t count = iov_iter_count(iter); | ||||
| 	u64 start, lockend, disk_bytenr, disk_io_size; | ||||
| 	struct extent_state *cached_state = NULL; | ||||
| 	u64 start, lockend; | ||||
| 	struct extent_map *em; | ||||
| 	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); | ||||
| 	bool unlocked = false; | ||||
| 
 | ||||
| 	file_accessed(iocb->ki_filp); | ||||
| 
 | ||||
| 	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); | ||||
| 	ret = btrfs_inode_lock(inode, | ||||
| 			       BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0)); | ||||
| 	if (ret) | ||||
| 		return ret; | ||||
| 
 | ||||
| 	if (iocb->ki_pos >= inode->vfs_inode.i_size) { | ||||
| 		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); | ||||
|  | @ -9271,21 +9264,46 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, | |||
| 	 */ | ||||
| 	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; | ||||
| 
 | ||||
| 	for (;;) { | ||||
| 	if (nowait) { | ||||
| 		struct btrfs_ordered_extent *ordered; | ||||
| 
 | ||||
| 		ret = btrfs_wait_ordered_range(inode, start, | ||||
| 					       lockend - start + 1); | ||||
| 		if (ret) | ||||
| 		if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping, | ||||
| 						  start, lockend)) { | ||||
| 			ret = -EAGAIN; | ||||
| 			goto out_unlock_inode; | ||||
| 		lock_extent(io_tree, start, lockend, &cached_state); | ||||
| 		} | ||||
| 
 | ||||
| 		if (!try_lock_extent(io_tree, start, lockend, cached_state)) { | ||||
| 			ret = -EAGAIN; | ||||
| 			goto out_unlock_inode; | ||||
| 		} | ||||
| 
 | ||||
| 		ordered = btrfs_lookup_ordered_range(inode, start, | ||||
| 						     lockend - start + 1); | ||||
| 		if (!ordered) | ||||
| 			break; | ||||
| 		btrfs_put_ordered_extent(ordered); | ||||
| 		unlock_extent(io_tree, start, lockend, &cached_state); | ||||
| 		cond_resched(); | ||||
| 		if (ordered) { | ||||
| 			btrfs_put_ordered_extent(ordered); | ||||
| 			unlock_extent(io_tree, start, lockend, cached_state); | ||||
| 			ret = -EAGAIN; | ||||
| 			goto out_unlock_inode; | ||||
| 		} | ||||
| 	} else { | ||||
| 		for (;;) { | ||||
| 			struct btrfs_ordered_extent *ordered; | ||||
| 
 | ||||
| 			ret = btrfs_wait_ordered_range(inode, start, | ||||
| 						       lockend - start + 1); | ||||
| 			if (ret) | ||||
| 				goto out_unlock_inode; | ||||
| 
 | ||||
| 			lock_extent(io_tree, start, lockend, cached_state); | ||||
| 			ordered = btrfs_lookup_ordered_range(inode, start, | ||||
| 							     lockend - start + 1); | ||||
| 			if (!ordered) | ||||
| 				break; | ||||
| 			btrfs_put_ordered_extent(ordered); | ||||
| 			unlock_extent(io_tree, start, lockend, cached_state); | ||||
| 			cond_resched(); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); | ||||
|  | @ -9304,9 +9322,9 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, | |||
| 		free_extent_map(em); | ||||
| 		em = NULL; | ||||
| 		ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, | ||||
| 						&cached_state, extent_start, | ||||
| 						cached_state, extent_start, | ||||
| 						count, encoded, &unlocked); | ||||
| 		goto out; | ||||
| 		goto out_unlock_extent; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
|  | @ -9317,12 +9335,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, | |||
| 			     inode->vfs_inode.i_size) - iocb->ki_pos; | ||||
| 	if (em->disk_bytenr == EXTENT_MAP_HOLE || | ||||
| 	    (em->flags & EXTENT_FLAG_PREALLOC)) { | ||||
| 		disk_bytenr = EXTENT_MAP_HOLE; | ||||
| 		*disk_bytenr = EXTENT_MAP_HOLE; | ||||
| 		count = min_t(u64, count, encoded->len); | ||||
| 		encoded->len = count; | ||||
| 		encoded->unencoded_len = count; | ||||
| 	} else if (extent_map_is_compressed(em)) { | ||||
| 		disk_bytenr = em->disk_bytenr; | ||||
| 		*disk_bytenr = em->disk_bytenr; | ||||
| 		/*
 | ||||
| 		 * Bail if the buffer isn't large enough to return the whole | ||||
| 		 * compressed extent. | ||||
|  | @ -9331,7 +9349,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, | |||
| 			ret = -ENOBUFS; | ||||
| 			goto out_em; | ||||
| 		} | ||||
| 		disk_io_size = em->disk_num_bytes; | ||||
| 		*disk_io_size = em->disk_num_bytes; | ||||
| 		count = em->disk_num_bytes; | ||||
| 		encoded->unencoded_len = em->ram_bytes; | ||||
| 		encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); | ||||
|  | @ -9341,47 +9359,42 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, | |||
| 			goto out_em; | ||||
| 		encoded->compression = ret; | ||||
| 	} else { | ||||
| 		disk_bytenr = extent_map_block_start(em) + (start - em->start); | ||||
| 		*disk_bytenr = extent_map_block_start(em) + (start - em->start); | ||||
| 		if (encoded->len > count) | ||||
| 			encoded->len = count; | ||||
| 		/*
 | ||||
| 		 * Don't read beyond what we locked. This also limits the page | ||||
| 		 * allocations that we'll do. | ||||
| 		 */ | ||||
| 		disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; | ||||
| 		count = start + disk_io_size - iocb->ki_pos; | ||||
| 		*disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; | ||||
| 		count = start + *disk_io_size - iocb->ki_pos; | ||||
| 		encoded->len = count; | ||||
| 		encoded->unencoded_len = count; | ||||
| 		disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); | ||||
| 		*disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize); | ||||
| 	} | ||||
| 	free_extent_map(em); | ||||
| 	em = NULL; | ||||
| 
 | ||||
| 	if (disk_bytenr == EXTENT_MAP_HOLE) { | ||||
| 		unlock_extent(io_tree, start, lockend, &cached_state); | ||||
| 	if (*disk_bytenr == EXTENT_MAP_HOLE) { | ||||
| 		unlock_extent(io_tree, start, lockend, cached_state); | ||||
| 		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); | ||||
| 		unlocked = true; | ||||
| 		ret = iov_iter_zero(count, iter); | ||||
| 		if (ret != count) | ||||
| 			ret = -EFAULT; | ||||
| 	} else { | ||||
| 		ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, | ||||
| 						 &cached_state, disk_bytenr, | ||||
| 						 disk_io_size, count, | ||||
| 						 encoded->compression, | ||||
| 						 &unlocked); | ||||
| 		ret = -EIOCBQUEUED; | ||||
| 		goto out_unlock_extent; | ||||
| 	} | ||||
| 
 | ||||
| out: | ||||
| 	if (ret >= 0) | ||||
| 		iocb->ki_pos += encoded->len; | ||||
| out_em: | ||||
| 	free_extent_map(em); | ||||
| out_unlock_extent: | ||||
| 	if (!unlocked) | ||||
| 		unlock_extent(io_tree, start, lockend, &cached_state); | ||||
| 	/* Leave inode and extent locked if we need to do a read. */ | ||||
| 	if (!unlocked && ret != -EIOCBQUEUED) | ||||
| 		unlock_extent(io_tree, start, lockend, cached_state); | ||||
| out_unlock_inode: | ||||
| 	if (!unlocked) | ||||
| 	if (!unlocked && ret != -EIOCBQUEUED) | ||||
| 		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); | ||||
| 	return ret; | ||||
| } | ||||
|  | @ -9492,7 +9505,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, | |||
| 	 */ | ||||
| 	disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); | ||||
| 	nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); | ||||
| 	folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT); | ||||
| 	folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT); | ||||
| 	if (!folios) | ||||
| 		return -ENOMEM; | ||||
| 	for (i = 0; i < nr_folios; i++) { | ||||
|  | @ -9556,7 +9569,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, | |||
| 	if (encoded->unencoded_len == encoded->len && | ||||
| 	    encoded->unencoded_offset == 0 && | ||||
| 	    can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { | ||||
| 		ret = __cow_file_range_inline(inode, start, encoded->len, | ||||
| 		ret = __cow_file_range_inline(inode, encoded->len, | ||||
| 					      orig_count, compression, folios[0], | ||||
| 					      true); | ||||
| 		if (ret <= 0) { | ||||
|  |  | |||
							
								
								
									
										478
									
								
								fs/btrfs/ioctl.c
									
										
									
									
									
								
							
							
						
						
									
										478
									
								
								fs/btrfs/ioctl.c
									
										
									
									
									
								
							|  | @ -29,6 +29,7 @@ | |||
| #include <linux/fileattr.h> | ||||
| #include <linux/fsverity.h> | ||||
| #include <linux/sched/xacct.h> | ||||
| #include <linux/io_uring/cmd.h> | ||||
| #include "ctree.h" | ||||
| #include "disk-io.h" | ||||
| #include "export.h" | ||||
|  | @ -1048,7 +1049,6 @@ static noinline int btrfs_mksnapshot(const struct path *parent, | |||
| 				   struct btrfs_qgroup_inherit *inherit) | ||||
| { | ||||
| 	int ret; | ||||
| 	bool snapshot_force_cow = false; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Force new buffered writes to reserve space even when NOCOW is | ||||
|  | @ -1067,15 +1067,13 @@ static noinline int btrfs_mksnapshot(const struct path *parent, | |||
| 	 * creation. | ||||
| 	 */ | ||||
| 	atomic_inc(&root->snapshot_force_cow); | ||||
| 	snapshot_force_cow = true; | ||||
| 
 | ||||
| 	btrfs_wait_ordered_extents(root, U64_MAX, NULL); | ||||
| 
 | ||||
| 	ret = btrfs_mksubvol(parent, idmap, name, namelen, | ||||
| 			     root, readonly, inherit); | ||||
| 	atomic_dec(&root->snapshot_force_cow); | ||||
| out: | ||||
| 	if (snapshot_force_cow) | ||||
| 		atomic_dec(&root->snapshot_force_cow); | ||||
| 	btrfs_drew_read_unlock(&root->snapshot_lock); | ||||
| 	return ret; | ||||
| } | ||||
|  | @ -4057,8 +4055,7 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, | |||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, | ||||
| 						void __user *arg) | ||||
| static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info) | ||||
| { | ||||
| 	if (!capable(CAP_SYS_ADMIN)) | ||||
| 		return -EPERM; | ||||
|  | @ -4513,12 +4510,17 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, | |||
| 	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, | ||||
| 					     flags); | ||||
| 	size_t copy_end; | ||||
| 	struct btrfs_inode *inode = BTRFS_I(file_inode(file)); | ||||
| 	struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||||
| 	struct extent_io_tree *io_tree = &inode->io_tree; | ||||
| 	struct iovec iovstack[UIO_FASTIOV]; | ||||
| 	struct iovec *iov = iovstack; | ||||
| 	struct iov_iter iter; | ||||
| 	loff_t pos; | ||||
| 	struct kiocb kiocb; | ||||
| 	ssize_t ret; | ||||
| 	u64 disk_bytenr, disk_io_size; | ||||
| 	struct extent_state *cached_state = NULL; | ||||
| 
 | ||||
| 	if (!capable(CAP_SYS_ADMIN)) { | ||||
| 		ret = -EPERM; | ||||
|  | @ -4571,7 +4573,32 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, | |||
| 	init_sync_kiocb(&kiocb, file); | ||||
| 	kiocb.ki_pos = pos; | ||||
| 
 | ||||
| 	ret = btrfs_encoded_read(&kiocb, &iter, &args); | ||||
| 	ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, | ||||
| 				 &disk_bytenr, &disk_io_size); | ||||
| 
 | ||||
| 	if (ret == -EIOCBQUEUED) { | ||||
| 		bool unlocked = false; | ||||
| 		u64 start, lockend, count; | ||||
| 
 | ||||
| 		start = ALIGN_DOWN(kiocb.ki_pos, fs_info->sectorsize); | ||||
| 		lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; | ||||
| 
 | ||||
| 		if (args.compression) | ||||
| 			count = disk_io_size; | ||||
| 		else | ||||
| 			count = args.len; | ||||
| 
 | ||||
| 		ret = btrfs_encoded_read_regular(&kiocb, &iter, start, lockend, | ||||
| 						 &cached_state, disk_bytenr, | ||||
| 						 disk_io_size, count, | ||||
| 						 args.compression, &unlocked); | ||||
| 
 | ||||
| 		if (!unlocked) { | ||||
| 			unlock_extent(io_tree, start, lockend, &cached_state); | ||||
| 			btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if (ret >= 0) { | ||||
| 		fsnotify_access(file); | ||||
| 		if (copy_to_user(argp + copy_end, | ||||
|  | @ -4689,6 +4716,439 @@ out_acct: | |||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Context that's attached to an encoded read io_uring command, in cmd->pdu. It | ||||
|  * contains the fields in btrfs_uring_read_extent that are necessary to finish | ||||
|  * off and cleanup the I/O in btrfs_uring_read_finished. | ||||
|  */ | ||||
| struct btrfs_uring_priv { | ||||
| 	struct io_uring_cmd *cmd; | ||||
| 	struct page **pages; | ||||
| 	unsigned long nr_pages; | ||||
| 	struct kiocb iocb; | ||||
| 	struct iovec *iov; | ||||
| 	struct iov_iter iter; | ||||
| 	struct extent_state *cached_state; | ||||
| 	u64 count; | ||||
| 	u64 start; | ||||
| 	u64 lockend; | ||||
| 	int err; | ||||
| 	bool compressed; | ||||
| }; | ||||
| 
 | ||||
/* Layout of the io_uring command pdu as used by btrfs encoded reads. */
struct io_btrfs_cmd {
	/* Request context, stored by btrfs_uring_read_extent_endio(). */
	struct btrfs_uring_priv *priv;
};
| 
 | ||||
| static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags) | ||||
| { | ||||
| 	struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); | ||||
| 	struct btrfs_uring_priv *priv = bc->priv; | ||||
| 	struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); | ||||
| 	struct extent_io_tree *io_tree = &inode->io_tree; | ||||
| 	unsigned long index; | ||||
| 	u64 cur; | ||||
| 	size_t page_offset; | ||||
| 	ssize_t ret; | ||||
| 
 | ||||
| 	if (priv->err) { | ||||
| 		ret = priv->err; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (priv->compressed) { | ||||
| 		index = 0; | ||||
| 		page_offset = 0; | ||||
| 	} else { | ||||
| 		index = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT; | ||||
| 		page_offset = offset_in_page(priv->iocb.ki_pos - priv->start); | ||||
| 	} | ||||
| 	cur = 0; | ||||
| 	while (cur < priv->count) { | ||||
| 		size_t bytes = min_t(size_t, priv->count - cur, PAGE_SIZE - page_offset); | ||||
| 
 | ||||
| 		if (copy_page_to_iter(priv->pages[index], page_offset, bytes, | ||||
| 				      &priv->iter) != bytes) { | ||||
| 			ret = -EFAULT; | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		index++; | ||||
| 		cur += bytes; | ||||
| 		page_offset = 0; | ||||
| 	} | ||||
| 	ret = priv->count; | ||||
| 
 | ||||
| out: | ||||
| 	unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); | ||||
| 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); | ||||
| 
 | ||||
| 	io_uring_cmd_done(cmd, ret, 0, issue_flags); | ||||
| 	add_rchar(current, ret); | ||||
| 
 | ||||
| 	for (index = 0; index < priv->nr_pages; index++) | ||||
| 		__free_page(priv->pages[index]); | ||||
| 
 | ||||
| 	kfree(priv->pages); | ||||
| 	kfree(priv->iov); | ||||
| 	kfree(priv); | ||||
| } | ||||
| 
 | ||||
| void btrfs_uring_read_extent_endio(void *ctx, int err) | ||||
| { | ||||
| 	struct btrfs_uring_priv *priv = ctx; | ||||
| 	struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(priv->cmd, struct io_btrfs_cmd); | ||||
| 
 | ||||
| 	priv->err = err; | ||||
| 	bc->priv = priv; | ||||
| 
 | ||||
| 	io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished); | ||||
| } | ||||
| 
 | ||||
| static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, | ||||
| 				   u64 start, u64 lockend, | ||||
| 				   struct extent_state *cached_state, | ||||
| 				   u64 disk_bytenr, u64 disk_io_size, | ||||
| 				   size_t count, bool compressed, | ||||
| 				   struct iovec *iov, struct io_uring_cmd *cmd) | ||||
| { | ||||
| 	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); | ||||
| 	struct extent_io_tree *io_tree = &inode->io_tree; | ||||
| 	struct page **pages; | ||||
| 	struct btrfs_uring_priv *priv = NULL; | ||||
| 	unsigned long nr_pages; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); | ||||
| 	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); | ||||
| 	if (!pages) | ||||
| 		return -ENOMEM; | ||||
| 	ret = btrfs_alloc_page_array(nr_pages, pages, 0); | ||||
| 	if (ret) { | ||||
| 		ret = -ENOMEM; | ||||
| 		goto out_fail; | ||||
| 	} | ||||
| 
 | ||||
| 	priv = kmalloc(sizeof(*priv), GFP_NOFS); | ||||
| 	if (!priv) { | ||||
| 		ret = -ENOMEM; | ||||
| 		goto out_fail; | ||||
| 	} | ||||
| 
 | ||||
| 	priv->iocb = *iocb; | ||||
| 	priv->iov = iov; | ||||
| 	priv->iter = *iter; | ||||
| 	priv->count = count; | ||||
| 	priv->cmd = cmd; | ||||
| 	priv->cached_state = cached_state; | ||||
| 	priv->compressed = compressed; | ||||
| 	priv->nr_pages = nr_pages; | ||||
| 	priv->pages = pages; | ||||
| 	priv->start = start; | ||||
| 	priv->lockend = lockend; | ||||
| 	priv->err = 0; | ||||
| 
 | ||||
| 	ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, | ||||
| 						    disk_io_size, pages, priv); | ||||
| 	if (ret && ret != -EIOCBQUEUED) | ||||
| 		goto out_fail; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If we return -EIOCBQUEUED, we're deferring the cleanup to | ||||
| 	 * btrfs_uring_read_finished(), which will handle unlocking the extent | ||||
| 	 * and inode and freeing the allocations. | ||||
| 	 */ | ||||
| 
 | ||||
| 	return -EIOCBQUEUED; | ||||
| 
 | ||||
| out_fail: | ||||
| 	unlock_extent(io_tree, start, lockend, &cached_state); | ||||
| 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); | ||||
| 	kfree(priv); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) | ||||
| { | ||||
| 	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); | ||||
| 	size_t copy_end; | ||||
| 	struct btrfs_ioctl_encoded_io_args args = { 0 }; | ||||
| 	int ret; | ||||
| 	u64 disk_bytenr, disk_io_size; | ||||
| 	struct file *file; | ||||
| 	struct btrfs_inode *inode; | ||||
| 	struct btrfs_fs_info *fs_info; | ||||
| 	struct extent_io_tree *io_tree; | ||||
| 	struct iovec iovstack[UIO_FASTIOV]; | ||||
| 	struct iovec *iov = iovstack; | ||||
| 	struct iov_iter iter; | ||||
| 	loff_t pos; | ||||
| 	struct kiocb kiocb; | ||||
| 	struct extent_state *cached_state = NULL; | ||||
| 	u64 start, lockend; | ||||
| 	void __user *sqe_addr; | ||||
| 
 | ||||
| 	if (!capable(CAP_SYS_ADMIN)) { | ||||
| 		ret = -EPERM; | ||||
| 		goto out_acct; | ||||
| 	} | ||||
| 	file = cmd->file; | ||||
| 	inode = BTRFS_I(file->f_inode); | ||||
| 	fs_info = inode->root->fs_info; | ||||
| 	io_tree = &inode->io_tree; | ||||
| 	sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); | ||||
| 
 | ||||
| 	if (issue_flags & IO_URING_F_COMPAT) { | ||||
| #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) | ||||
| 		struct btrfs_ioctl_encoded_io_args_32 args32; | ||||
| 
 | ||||
| 		copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); | ||||
| 		if (copy_from_user(&args32, sqe_addr, copy_end)) { | ||||
| 			ret = -EFAULT; | ||||
| 			goto out_acct; | ||||
| 		} | ||||
| 		args.iov = compat_ptr(args32.iov); | ||||
| 		args.iovcnt = args32.iovcnt; | ||||
| 		args.offset = args32.offset; | ||||
| 		args.flags = args32.flags; | ||||
| #else | ||||
| 		return -ENOTTY; | ||||
| #endif | ||||
| 	} else { | ||||
| 		copy_end = copy_end_kernel; | ||||
| 		if (copy_from_user(&args, sqe_addr, copy_end)) { | ||||
| 			ret = -EFAULT; | ||||
| 			goto out_acct; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if (args.flags != 0) | ||||
| 		return -EINVAL; | ||||
| 
 | ||||
| 	ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), | ||||
| 			   &iov, &iter); | ||||
| 	if (ret < 0) | ||||
| 		goto out_acct; | ||||
| 
 | ||||
| 	if (iov_iter_count(&iter) == 0) { | ||||
| 		ret = 0; | ||||
| 		goto out_free; | ||||
| 	} | ||||
| 
 | ||||
| 	pos = args.offset; | ||||
| 	ret = rw_verify_area(READ, file, &pos, args.len); | ||||
| 	if (ret < 0) | ||||
| 		goto out_free; | ||||
| 
 | ||||
| 	init_sync_kiocb(&kiocb, file); | ||||
| 	kiocb.ki_pos = pos; | ||||
| 
 | ||||
| 	if (issue_flags & IO_URING_F_NONBLOCK) | ||||
| 		kiocb.ki_flags |= IOCB_NOWAIT; | ||||
| 
 | ||||
| 	start = ALIGN_DOWN(pos, fs_info->sectorsize); | ||||
| 	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; | ||||
| 
 | ||||
| 	ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, | ||||
| 				 &disk_bytenr, &disk_io_size); | ||||
| 	if (ret < 0 && ret != -EIOCBQUEUED) | ||||
| 		goto out_free; | ||||
| 
 | ||||
| 	file_accessed(file); | ||||
| 
 | ||||
| 	if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel, | ||||
| 			 sizeof(args) - copy_end_kernel)) { | ||||
| 		if (ret == -EIOCBQUEUED) { | ||||
| 			unlock_extent(io_tree, start, lockend, &cached_state); | ||||
| 			btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); | ||||
| 		} | ||||
| 		ret = -EFAULT; | ||||
| 		goto out_free; | ||||
| 	} | ||||
| 
 | ||||
| 	if (ret == -EIOCBQUEUED) { | ||||
| 		u64 count; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * If we've optimized things by storing the iovecs on the stack, | ||||
| 		 * undo this. | ||||
| 		 */ | ||||
| 		if (!iov) { | ||||
| 			iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS); | ||||
| 			if (!iov) { | ||||
| 				unlock_extent(io_tree, start, lockend, &cached_state); | ||||
| 				btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); | ||||
| 				ret = -ENOMEM; | ||||
| 				goto out_acct; | ||||
| 			} | ||||
| 
 | ||||
| 			memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt); | ||||
| 		} | ||||
| 
 | ||||
| 		count = min_t(u64, iov_iter_count(&iter), disk_io_size); | ||||
| 
 | ||||
| 		/* Match ioctl by not returning past EOF if uncompressed. */ | ||||
| 		if (!args.compression) | ||||
| 			count = min_t(u64, count, args.len); | ||||
| 
 | ||||
| 		ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend, | ||||
| 					      cached_state, disk_bytenr, | ||||
| 					      disk_io_size, count, | ||||
| 					      args.compression, iov, cmd); | ||||
| 
 | ||||
| 		goto out_acct; | ||||
| 	} | ||||
| 
 | ||||
| out_free: | ||||
| 	kfree(iov); | ||||
| 
 | ||||
| out_acct: | ||||
| 	if (ret > 0) | ||||
| 		add_rchar(current, ret); | ||||
| 	inc_syscr(current); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) | ||||
| { | ||||
| 	switch (cmd->cmd_op) { | ||||
| 	case BTRFS_IOC_ENCODED_READ: | ||||
| #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) | ||||
| 	case BTRFS_IOC_ENCODED_READ_32: | ||||
| #endif | ||||
| 		return btrfs_uring_encoded_read(cmd, issue_flags); | ||||
| 	} | ||||
| 
 | ||||
| 	return -EINVAL; | ||||
| } | ||||
| 
 | ||||
| static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *argp) | ||||
| { | ||||
| 	struct btrfs_root *root; | ||||
| 	struct btrfs_ioctl_subvol_wait args = { 0 }; | ||||
| 	signed long sched_ret; | ||||
| 	int refs; | ||||
| 	u64 root_flags; | ||||
| 	bool wait_for_deletion = false; | ||||
| 	bool found = false; | ||||
| 
 | ||||
| 	if (copy_from_user(&args, argp, sizeof(args))) | ||||
| 		return -EFAULT; | ||||
| 
 | ||||
| 	switch (args.mode) { | ||||
| 	case BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED: | ||||
| 		/*
 | ||||
| 		 * Wait for the first one deleted that waits until all previous | ||||
| 		 * are cleaned. | ||||
| 		 */ | ||||
| 		spin_lock(&fs_info->trans_lock); | ||||
| 		if (!list_empty(&fs_info->dead_roots)) { | ||||
| 			root = list_last_entry(&fs_info->dead_roots, | ||||
| 					       struct btrfs_root, root_list); | ||||
| 			args.subvolid = btrfs_root_id(root); | ||||
| 			found = true; | ||||
| 		} | ||||
| 		spin_unlock(&fs_info->trans_lock); | ||||
| 		if (!found) | ||||
| 			return -ENOENT; | ||||
| 
 | ||||
| 		fallthrough; | ||||
| 	case BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE: | ||||
| 		if ((0 < args.subvolid && args.subvolid < BTRFS_FIRST_FREE_OBJECTID) || | ||||
| 		    BTRFS_LAST_FREE_OBJECTID < args.subvolid) | ||||
| 			return -EINVAL; | ||||
| 		break; | ||||
| 	case BTRFS_SUBVOL_SYNC_COUNT: | ||||
| 		spin_lock(&fs_info->trans_lock); | ||||
| 		args.count = list_count_nodes(&fs_info->dead_roots); | ||||
| 		spin_unlock(&fs_info->trans_lock); | ||||
| 		if (copy_to_user(argp, &args, sizeof(args))) | ||||
| 			return -EFAULT; | ||||
| 		return 0; | ||||
| 	case BTRFS_SUBVOL_SYNC_PEEK_FIRST: | ||||
| 		spin_lock(&fs_info->trans_lock); | ||||
| 		/* Last in the list was deleted first. */ | ||||
| 		if (!list_empty(&fs_info->dead_roots)) { | ||||
| 			root = list_last_entry(&fs_info->dead_roots, | ||||
| 					       struct btrfs_root, root_list); | ||||
| 			args.subvolid = btrfs_root_id(root); | ||||
| 		} else { | ||||
| 			args.subvolid = 0; | ||||
| 		} | ||||
| 		spin_unlock(&fs_info->trans_lock); | ||||
| 		if (copy_to_user(argp, &args, sizeof(args))) | ||||
| 			return -EFAULT; | ||||
| 		return 0; | ||||
| 	case BTRFS_SUBVOL_SYNC_PEEK_LAST: | ||||
| 		spin_lock(&fs_info->trans_lock); | ||||
| 		/* First in the list was deleted last. */ | ||||
| 		if (!list_empty(&fs_info->dead_roots)) { | ||||
| 			root = list_first_entry(&fs_info->dead_roots, | ||||
| 						struct btrfs_root, root_list); | ||||
| 			args.subvolid = btrfs_root_id(root); | ||||
| 		} else { | ||||
| 			args.subvolid = 0; | ||||
| 		} | ||||
| 		spin_unlock(&fs_info->trans_lock); | ||||
| 		if (copy_to_user(argp, &args, sizeof(args))) | ||||
| 			return -EFAULT; | ||||
| 		return 0; | ||||
| 	default: | ||||
| 		return -EINVAL; | ||||
| 	} | ||||
| 
 | ||||
| 	/* 32bit limitation: fs_roots_radix key is not wide enough. */ | ||||
| 	if (sizeof(unsigned long) != sizeof(u64) && args.subvolid > U32_MAX) | ||||
| 		return -EOVERFLOW; | ||||
| 
 | ||||
| 	while (1) { | ||||
| 		/* Wait for the specific one. */ | ||||
| 		if (down_read_interruptible(&fs_info->subvol_sem) == -EINTR) | ||||
| 			return -EINTR; | ||||
| 		refs = -1; | ||||
| 		spin_lock(&fs_info->fs_roots_radix_lock); | ||||
| 		root = radix_tree_lookup(&fs_info->fs_roots_radix, | ||||
| 					 (unsigned long)args.subvolid); | ||||
| 		if (root) { | ||||
| 			spin_lock(&root->root_item_lock); | ||||
| 			refs = btrfs_root_refs(&root->root_item); | ||||
| 			root_flags = btrfs_root_flags(&root->root_item); | ||||
| 			spin_unlock(&root->root_item_lock); | ||||
| 		} | ||||
| 		spin_unlock(&fs_info->fs_roots_radix_lock); | ||||
| 		up_read(&fs_info->subvol_sem); | ||||
| 
 | ||||
| 		/* Subvolume does not exist. */ | ||||
| 		if (!root) | ||||
| 			return -ENOENT; | ||||
| 
 | ||||
| 		/* Subvolume not deleted at all. */ | ||||
| 		if (refs > 0) | ||||
| 			return -EEXIST; | ||||
| 		/* We've waited and now the subvolume is gone. */ | ||||
| 		if (wait_for_deletion && refs == -1) { | ||||
| 			/* Return the one we waited for as the last one. */ | ||||
| 			if (copy_to_user(argp, &args, sizeof(args))) | ||||
| 				return -EFAULT; | ||||
| 			return 0; | ||||
| 		} | ||||
| 
 | ||||
| 		/* Subvolume not found on the first try (deleted or never existed). */ | ||||
| 		if (refs == -1) | ||||
| 			return -ENOENT; | ||||
| 
 | ||||
| 		wait_for_deletion = true; | ||||
| 		ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD); | ||||
| 		sched_ret = schedule_timeout_interruptible(HZ); | ||||
| 		/* Early wake up or error. */ | ||||
| 		if (sched_ret != 0) | ||||
| 			return -EINTR; | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| long btrfs_ioctl(struct file *file, unsigned int | ||||
| 		cmd, unsigned long arg) | ||||
| { | ||||
|  | @ -4811,7 +5271,7 @@ long btrfs_ioctl(struct file *file, unsigned int | |||
| 	case BTRFS_IOC_QUOTA_RESCAN_STATUS: | ||||
| 		return btrfs_ioctl_quota_rescan_status(fs_info, argp); | ||||
| 	case BTRFS_IOC_QUOTA_RESCAN_WAIT: | ||||
| 		return btrfs_ioctl_quota_rescan_wait(fs_info, argp); | ||||
| 		return btrfs_ioctl_quota_rescan_wait(fs_info); | ||||
| 	case BTRFS_IOC_DEV_REPLACE: | ||||
| 		return btrfs_ioctl_dev_replace(fs_info, argp); | ||||
| 	case BTRFS_IOC_GET_SUPPORTED_FEATURES: | ||||
|  | @ -4840,6 +5300,8 @@ long btrfs_ioctl(struct file *file, unsigned int | |||
| 	case BTRFS_IOC_ENCODED_WRITE_32: | ||||
| 		return btrfs_ioctl_encoded_write(file, argp, true); | ||||
| #endif | ||||
| 	case BTRFS_IOC_SUBVOL_SYNC_WAIT: | ||||
| 		return btrfs_ioctl_subvol_sync(fs_info, argp); | ||||
| 	} | ||||
| 
 | ||||
| 	return -ENOTTY; | ||||
|  |  | |||
|  | @ -22,5 +22,7 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); | |||
| int __pure btrfs_is_empty_uuid(const u8 *uuid); | ||||
| void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, | ||||
| 				     struct btrfs_ioctl_balance_args *bargs); | ||||
| int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); | ||||
| void btrfs_uring_read_extent_endio(void *ctx, int err); | ||||
| 
 | ||||
| #endif | ||||
|  |  | |||
|  | @ -161,21 +161,6 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) | |||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Try-lock for write. | ||||
|  * | ||||
|  * Return 1 if the rwlock has been taken, 0 otherwise | ||||
|  */ | ||||
| int btrfs_try_tree_write_lock(struct extent_buffer *eb) | ||||
| { | ||||
| 	if (down_write_trylock(&eb->lock)) { | ||||
| 		btrfs_set_eb_lock_owner(eb, current->pid); | ||||
| 		trace_btrfs_try_tree_write_lock(eb); | ||||
| 		return 1; | ||||
| 	} | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Release read lock. | ||||
|  */ | ||||
|  |  | |||
|  | @ -180,7 +180,6 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb) | |||
| 
 | ||||
| void btrfs_tree_read_unlock(struct extent_buffer *eb); | ||||
| int btrfs_try_tree_read_lock(struct extent_buffer *eb); | ||||
| int btrfs_try_tree_write_lock(struct extent_buffer *eb); | ||||
| struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); | ||||
| struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); | ||||
| struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); | ||||
|  |  | |||
|  | @ -80,7 +80,7 @@ void lzo_free_workspace(struct list_head *ws) | |||
| 	kfree(workspace); | ||||
| } | ||||
| 
 | ||||
| struct list_head *lzo_alloc_workspace(unsigned int level) | ||||
| struct list_head *lzo_alloc_workspace(void) | ||||
| { | ||||
| 	struct workspace *workspace; | ||||
| 
 | ||||
|  |  | |||
|  | @ -226,8 +226,7 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, | |||
| 	return qgroup; | ||||
| } | ||||
| 
 | ||||
| static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, | ||||
| 			    struct btrfs_qgroup *qgroup) | ||||
| static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) | ||||
| { | ||||
| 	struct btrfs_qgroup_list *list; | ||||
| 
 | ||||
|  | @ -258,7 +257,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) | |||
| 		return -ENOENT; | ||||
| 
 | ||||
| 	rb_erase(&qgroup->node, &fs_info->qgroup_tree); | ||||
| 	__del_qgroup_rb(fs_info, qgroup); | ||||
| 	__del_qgroup_rb(qgroup); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
|  | @ -469,7 +468,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) | |||
| 			/*
 | ||||
| 			 * If a qgroup exists for a subvolume ID, it is possible | ||||
| 			 * that subvolume has been deleted, in which case | ||||
| 			 * re-using that ID would lead to incorrect accounting. | ||||
| 			 * reusing that ID would lead to incorrect accounting. | ||||
| 			 * | ||||
| 			 * Ensure that we skip any such subvol ids. | ||||
| 			 * | ||||
|  | @ -643,7 +642,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) | |||
| 	while ((n = rb_first(&fs_info->qgroup_tree))) { | ||||
| 		qgroup = rb_entry(n, struct btrfs_qgroup, node); | ||||
| 		rb_erase(n, &fs_info->qgroup_tree); | ||||
| 		__del_qgroup_rb(fs_info, qgroup); | ||||
| 		__del_qgroup_rb(qgroup); | ||||
| 		btrfs_sysfs_del_one_qgroup(fs_info, qgroup); | ||||
| 		kfree(qgroup); | ||||
| 	} | ||||
|  | @ -2001,27 +2000,27 @@ out: | |||
|  * Return <0 for insertion failure, caller can free @record safely. | ||||
|  */ | ||||
| int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, | ||||
| 				struct btrfs_delayed_ref_root *delayed_refs, | ||||
| 				struct btrfs_qgroup_extent_record *record) | ||||
| 				     struct btrfs_delayed_ref_root *delayed_refs, | ||||
| 				     struct btrfs_qgroup_extent_record *record, | ||||
| 				     u64 bytenr) | ||||
| { | ||||
| 	struct btrfs_qgroup_extent_record *existing, *ret; | ||||
| 	const unsigned long index = (record->bytenr >> fs_info->sectorsize_bits); | ||||
| 	const unsigned long index = (bytenr >> fs_info->sectorsize_bits); | ||||
| 
 | ||||
| 	if (!btrfs_qgroup_full_accounting(fs_info)) | ||||
| 		return 1; | ||||
| 
 | ||||
| #if BITS_PER_LONG == 32 | ||||
| 	if (record->bytenr >= MAX_LFS_FILESIZE) { | ||||
| 	if (bytenr >= MAX_LFS_FILESIZE) { | ||||
| 		btrfs_err_rl(fs_info, | ||||
| "qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit", | ||||
| 			     record->bytenr); | ||||
| 			     bytenr); | ||||
| 		btrfs_err_32bit_limit(fs_info); | ||||
| 		return -EOVERFLOW; | ||||
| 	} | ||||
| #endif | ||||
| 
 | ||||
| 	lockdep_assert_held(&delayed_refs->lock); | ||||
| 	trace_btrfs_qgroup_trace_extent(fs_info, record); | ||||
| 	trace_btrfs_qgroup_trace_extent(fs_info, record, bytenr); | ||||
| 
 | ||||
| 	xa_lock(&delayed_refs->dirty_extents); | ||||
| 	existing = xa_load(&delayed_refs->dirty_extents, index); | ||||
|  | @ -2066,12 +2065,17 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, | |||
|  * transaction committing, but not now as qgroup accounting will be wrong again. | ||||
|  */ | ||||
| int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, | ||||
| 				   struct btrfs_qgroup_extent_record *qrecord) | ||||
| 				   struct btrfs_qgroup_extent_record *qrecord, | ||||
| 				   u64 bytenr) | ||||
| { | ||||
| 	struct btrfs_backref_walk_ctx ctx = { 0 }; | ||||
| 	struct btrfs_fs_info *fs_info = trans->fs_info; | ||||
| 	struct btrfs_backref_walk_ctx ctx = { | ||||
| 		.bytenr = bytenr, | ||||
| 		.fs_info = fs_info, | ||||
| 	}; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	if (!btrfs_qgroup_full_accounting(trans->fs_info)) | ||||
| 	if (!btrfs_qgroup_full_accounting(fs_info)) | ||||
| 		return 0; | ||||
| 	/*
 | ||||
| 	 * We are always called in a context where we are already holding a | ||||
|  | @ -2094,16 +2098,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, | |||
| 	 */ | ||||
| 	ASSERT(trans != NULL); | ||||
| 
 | ||||
| 	if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) | ||||
| 	if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	ctx.bytenr = qrecord->bytenr; | ||||
| 	ctx.fs_info = trans->fs_info; | ||||
| 
 | ||||
| 	ret = btrfs_find_all_roots(&ctx, true); | ||||
| 	if (ret < 0) { | ||||
| 		qgroup_mark_inconsistent(trans->fs_info); | ||||
| 		btrfs_warn(trans->fs_info, | ||||
| 		qgroup_mark_inconsistent(fs_info); | ||||
| 		btrfs_warn(fs_info, | ||||
| "error accounting new delayed refs extent (err code: %d), quota inconsistent", | ||||
| 			ret); | ||||
| 		return 0; | ||||
|  | @ -2138,7 +2139,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, | |||
| { | ||||
| 	struct btrfs_fs_info *fs_info = trans->fs_info; | ||||
| 	struct btrfs_qgroup_extent_record *record; | ||||
| 	struct btrfs_delayed_ref_root *delayed_refs; | ||||
| 	struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; | ||||
| 	const unsigned long index = (bytenr >> fs_info->sectorsize_bits); | ||||
| 	int ret; | ||||
| 
 | ||||
|  | @ -2148,26 +2149,21 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, | |||
| 	if (!record) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, index, GFP_NOFS)) { | ||||
| 	if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) { | ||||
| 		kfree(record); | ||||
| 		return -ENOMEM; | ||||
| 	} | ||||
| 
 | ||||
| 	delayed_refs = &trans->transaction->delayed_refs; | ||||
| 	record->bytenr = bytenr; | ||||
| 	record->num_bytes = num_bytes; | ||||
| 	record->old_roots = NULL; | ||||
| 
 | ||||
| 	spin_lock(&delayed_refs->lock); | ||||
| 	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); | ||||
| 	spin_unlock(&delayed_refs->lock); | ||||
| 	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr); | ||||
| 	if (ret) { | ||||
| 		/* Clean up if insertion fails or item exists. */ | ||||
| 		xa_release(&delayed_refs->dirty_extents, index); | ||||
| 		kfree(record); | ||||
| 		return 0; | ||||
| 	} | ||||
| 	return btrfs_qgroup_trace_extent_post(trans, record); | ||||
| 	return btrfs_qgroup_trace_extent_post(trans, record, bytenr); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  | @ -2652,7 +2648,6 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, | |||
| 
 | ||||
| 	if (!extent_buffer_uptodate(root_eb)) { | ||||
| 		struct btrfs_tree_parent_check check = { | ||||
| 			.has_first_key = false, | ||||
| 			.transid = root_gen, | ||||
| 			.level = root_level | ||||
| 		}; | ||||
|  | @ -3043,14 +3038,16 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) | |||
| 	delayed_refs = &trans->transaction->delayed_refs; | ||||
| 	qgroup_to_skip = delayed_refs->qgroup_to_skip; | ||||
| 	xa_for_each(&delayed_refs->dirty_extents, index, record) { | ||||
| 		const u64 bytenr = (((u64)index) << fs_info->sectorsize_bits); | ||||
| 
 | ||||
| 		num_dirty_extents++; | ||||
| 		trace_btrfs_qgroup_account_extents(fs_info, record); | ||||
| 		trace_btrfs_qgroup_account_extents(fs_info, record, bytenr); | ||||
| 
 | ||||
| 		if (!ret && !(fs_info->qgroup_flags & | ||||
| 			      BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { | ||||
| 			struct btrfs_backref_walk_ctx ctx = { 0 }; | ||||
| 
 | ||||
| 			ctx.bytenr = record->bytenr; | ||||
| 			ctx.bytenr = bytenr; | ||||
| 			ctx.fs_info = fs_info; | ||||
| 
 | ||||
| 			/*
 | ||||
|  | @ -3092,7 +3089,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) | |||
| 				ulist_del(record->old_roots, qgroup_to_skip, | ||||
| 					  0); | ||||
| 			} | ||||
| 			ret = btrfs_qgroup_account_extent(trans, record->bytenr, | ||||
| 			ret = btrfs_qgroup_account_extent(trans, bytenr, | ||||
| 							  record->num_bytes, | ||||
| 							  record->old_roots, | ||||
| 							  new_roots); | ||||
|  | @ -4196,13 +4193,20 @@ static int try_flush_qgroup(struct btrfs_root *root) | |||
| 		return 0; | ||||
| 	} | ||||
| 
 | ||||
| 	btrfs_run_delayed_iputs(root->fs_info); | ||||
| 	btrfs_wait_on_delayed_iputs(root->fs_info); | ||||
| 	ret = btrfs_start_delalloc_snapshot(root, true); | ||||
| 	if (ret < 0) | ||||
| 		goto out; | ||||
| 	btrfs_wait_ordered_extents(root, U64_MAX, NULL); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * After waiting for ordered extents run delayed iputs in order to free | ||||
| 	 * space from unlinked files before committing the current transaction, | ||||
| 	 * as ordered extents may have been holding the last reference of an | ||||
| 	 * inode and they add a delayed iput when they complete. | ||||
| 	 */ | ||||
| 	btrfs_run_delayed_iputs(root->fs_info); | ||||
| 	btrfs_wait_on_delayed_iputs(root->fs_info); | ||||
| 
 | ||||
| 	ret = btrfs_commit_current_transaction(root); | ||||
| out: | ||||
| 	clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); | ||||
|  | @ -4687,8 +4691,7 @@ out: | |||
|  *			BOTH POINTERS ARE BEFORE TREE SWAP | ||||
|  * @last_snapshot:	last snapshot generation of the subvolume tree | ||||
|  */ | ||||
| int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, | ||||
| 		struct btrfs_root *subvol_root, | ||||
| int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, | ||||
| 		struct btrfs_block_group *bg, | ||||
| 		struct extent_buffer *subvol_parent, int subvol_slot, | ||||
| 		struct extent_buffer *reloc_parent, int reloc_slot, | ||||
|  | @ -4894,17 +4897,6 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) | |||
| 	xa_destroy(&trans->delayed_refs.dirty_extents); | ||||
| } | ||||
| 
 | ||||
| void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) | ||||
| { | ||||
| 	if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) | ||||
| 		return; | ||||
| 
 | ||||
| 	if (!is_fstree(root)) | ||||
| 		return; | ||||
| 
 | ||||
| 	btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA); | ||||
| } | ||||
| 
 | ||||
| int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, | ||||
| 			      const struct btrfs_squota_delta *delta) | ||||
| { | ||||
|  |  | |||
|  | @ -127,7 +127,12 @@ struct btrfs_inode; | |||
|  * Record a dirty extent, and inform qgroup to update quota on it | ||||
|  */ | ||||
| struct btrfs_qgroup_extent_record { | ||||
| 	u64 bytenr; | ||||
| 	/*
 | ||||
| 	 * The bytenr of the extent is given by its index in the dirty_extents | ||||
| 	 * xarray of struct btrfs_delayed_ref_root left shifted by | ||||
| 	 * fs_info->sectorsize_bits. | ||||
| 	 */ | ||||
| 
 | ||||
| 	u64 num_bytes; | ||||
| 
 | ||||
| 	/*
 | ||||
|  | @ -345,9 +350,11 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); | |||
| int btrfs_qgroup_trace_extent_nolock( | ||||
| 		struct btrfs_fs_info *fs_info, | ||||
| 		struct btrfs_delayed_ref_root *delayed_refs, | ||||
| 		struct btrfs_qgroup_extent_record *record); | ||||
| 		struct btrfs_qgroup_extent_record *record, | ||||
| 		u64 bytenr); | ||||
| int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, | ||||
| 				   struct btrfs_qgroup_extent_record *qrecord); | ||||
| 				   struct btrfs_qgroup_extent_record *qrecord, | ||||
| 				   u64 bytenr); | ||||
| int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, | ||||
| 			      u64 num_bytes); | ||||
| int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, | ||||
|  | @ -432,8 +439,7 @@ void btrfs_qgroup_init_swapped_blocks( | |||
| 	struct btrfs_qgroup_swapped_blocks *swapped_blocks); | ||||
| 
 | ||||
| void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root); | ||||
| int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, | ||||
| 		struct btrfs_root *subvol_root, | ||||
| int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, | ||||
| 		struct btrfs_block_group *bg, | ||||
| 		struct extent_buffer *subvol_parent, int subvol_slot, | ||||
| 		struct extent_buffer *reloc_parent, int reloc_slot, | ||||
|  | @ -442,7 +448,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, | |||
| 		struct btrfs_root *root, struct extent_buffer *eb); | ||||
| void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); | ||||
| bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info); | ||||
| void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes); | ||||
| int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, | ||||
| 			      const struct btrfs_squota_delta *delta); | ||||
| 
 | ||||
|  |  | |||
|  | @ -13,6 +13,39 @@ | |||
| #include "volumes.h" | ||||
| #include "print-tree.h" | ||||
| 
 | ||||
| static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, | ||||
| 					       struct btrfs_path *path, | ||||
| 					       const struct btrfs_key *oldkey, | ||||
| 					       u64 newlen, u64 frontpad) | ||||
| { | ||||
| 	struct btrfs_stripe_extent *extent; | ||||
| 	struct extent_buffer *leaf; | ||||
| 	int slot; | ||||
| 	size_t item_size; | ||||
| 	struct btrfs_key newkey = { | ||||
| 		.objectid = oldkey->objectid + frontpad, | ||||
| 		.type = BTRFS_RAID_STRIPE_KEY, | ||||
| 		.offset = newlen, | ||||
| 	}; | ||||
| 
 | ||||
| 	ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY); | ||||
| 
 | ||||
| 	leaf = path->nodes[0]; | ||||
| 	slot = path->slots[0]; | ||||
| 	item_size = btrfs_item_size(leaf, slot); | ||||
| 	extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); | ||||
| 
 | ||||
| 	for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { | ||||
| 		struct btrfs_raid_stride *stride = &extent->strides[i]; | ||||
| 		u64 phys; | ||||
| 
 | ||||
| 		phys = btrfs_raid_stride_physical(leaf, stride); | ||||
| 		btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad); | ||||
| 	} | ||||
| 
 | ||||
| 	btrfs_set_item_key_safe(trans, path, &newkey); | ||||
| } | ||||
| 
 | ||||
| int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = trans->fs_info; | ||||
|  | @ -36,23 +69,24 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le | |||
| 	while (1) { | ||||
| 		key.objectid = start; | ||||
| 		key.type = BTRFS_RAID_STRIPE_KEY; | ||||
| 		key.offset = length; | ||||
| 		key.offset = 0; | ||||
| 
 | ||||
| 		ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); | ||||
| 		if (ret < 0) | ||||
| 			break; | ||||
| 		if (ret > 0) { | ||||
| 			ret = 0; | ||||
| 			if (path->slots[0] == 0) | ||||
| 				break; | ||||
| 
 | ||||
| 		if (path->slots[0] == btrfs_header_nritems(path->nodes[0])) | ||||
| 			path->slots[0]--; | ||||
| 		} | ||||
| 
 | ||||
| 		leaf = path->nodes[0]; | ||||
| 		slot = path->slots[0]; | ||||
| 		btrfs_item_key_to_cpu(leaf, &key, slot); | ||||
| 		found_start = key.objectid; | ||||
| 		found_end = found_start + key.offset; | ||||
| 		ret = 0; | ||||
| 
 | ||||
| 		if (key.type != BTRFS_RAID_STRIPE_KEY) | ||||
| 			break; | ||||
| 
 | ||||
| 		/* That stripe ends before we start, we're done. */ | ||||
| 		if (found_end <= start) | ||||
|  | @ -61,7 +95,40 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le | |||
| 		trace_btrfs_raid_extent_delete(fs_info, start, end, | ||||
| 					       found_start, found_end); | ||||
| 
 | ||||
| 		ASSERT(found_start >= start && found_end <= end); | ||||
| 		/*
 | ||||
| 		 * The stripe extent starts before the range we want to delete: | ||||
| 		 * | ||||
| 		 * |--- RAID Stripe Extent ---| | ||||
| 		 * |--- keep  ---|--- drop ---| | ||||
| 		 * | ||||
| 		 * This means we keep the tree item and shrink it in place by | ||||
| 		 * truncating the length in its key to the part we keep. | ||||
| 		 */ | ||||
| 		if (found_start < start) { | ||||
| 			u64 diff = start - found_start; | ||||
| 
 | ||||
| 			btrfs_partially_delete_raid_extent(trans, path, &key, | ||||
| 							   diff, 0); | ||||
| 			break; | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * The stripe extent ends after the range we want to delete: | ||||
| 		 * | ||||
| 		 * |--- RAID Stripe Extent ---| | ||||
| 		 * |--- drop  ---|--- keep ---| | ||||
| 		 * | ||||
| 		 * This means we keep the tree item and shrink it in place: its | ||||
| 		 * key and each stride's physical address move past the dropped part. | ||||
| 		 */ | ||||
| 		if (found_end > end) { | ||||
| 			u64 diff = found_end - end; | ||||
| 
 | ||||
| 			btrfs_partially_delete_raid_extent(trans, path, &key, | ||||
| 							   diff, diff); | ||||
| 			break; | ||||
| 		} | ||||
| 
 | ||||
| 		ret = btrfs_del_item(trans, stripe_root, path); | ||||
| 		if (ret) | ||||
| 			break; | ||||
|  | @ -108,8 +175,9 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans, | |||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, | ||||
| 					struct btrfs_io_context *bioc) | ||||
| EXPORT_FOR_TESTS | ||||
| int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, | ||||
| 				 struct btrfs_io_context *bioc) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = trans->fs_info; | ||||
| 	struct btrfs_key stripe_key; | ||||
|  | @ -233,7 +301,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, | |||
| 		found_end = found_logical + found_length; | ||||
| 
 | ||||
| 		if (found_logical > end) { | ||||
| 			ret = -ENOENT; | ||||
| 			ret = -ENODATA; | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
|  | @ -279,10 +347,10 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, | |||
| 	} | ||||
| 
 | ||||
| 	/* If we're here, we haven't found the requested devid in the stripe. */ | ||||
| 	ret = -ENOENT; | ||||
| 	ret = -ENODATA; | ||||
| out: | ||||
| 	if (ret > 0) | ||||
| 		ret = -ENOENT; | ||||
| 		ret = -ENODATA; | ||||
| 	if (ret && ret != -EIO && !stripe->rst_search_commit_root) { | ||||
| 		btrfs_debug(fs_info, | ||||
| 		"cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", | ||||
|  |  | |||
|  | @ -28,6 +28,11 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, | |||
| int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, | ||||
| 			     struct btrfs_ordered_extent *ordered_extent); | ||||
| 
 | ||||
| #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS | ||||
| int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, | ||||
| 				 struct btrfs_io_context *bioc); | ||||
| #endif | ||||
| 
 | ||||
| static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info, | ||||
| 						 u64 map_type) | ||||
| { | ||||
|  |  | |||
|  | @ -1272,8 +1272,7 @@ static inline void bio_list_put(struct bio_list *bio_list) | |||
| 
 | ||||
| static void assert_rbio(struct btrfs_raid_bio *rbio) | ||||
| { | ||||
| 	if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) || | ||||
| 	    !IS_ENABLED(CONFIG_BTRFS_ASSERT)) | ||||
| 	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) | ||||
| 		return; | ||||
| 
 | ||||
| 	/*
 | ||||
|  |  | |||
|  | @ -1244,7 +1244,7 @@ again: | |||
| 		 * The real subtree rescan is delayed until we have new | ||||
| 		 * CoW on the subtree root node before transaction commit. | ||||
| 		 */ | ||||
| 		ret = btrfs_qgroup_add_swapped_blocks(trans, dest, | ||||
| 		ret = btrfs_qgroup_add_swapped_blocks(dest, | ||||
| 				rc->block_group, parent, slot, | ||||
| 				path->nodes[level], path->slots[level], | ||||
| 				last_snapshot); | ||||
|  |  | |||
|  | @ -1656,8 +1656,7 @@ static u32 stripe_length(const struct scrub_stripe *stripe) | |||
| 		   stripe->bg->start + stripe->bg->length - stripe->logical); | ||||
| } | ||||
| 
 | ||||
| static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, | ||||
| 					    struct scrub_stripe *stripe) | ||||
| static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = stripe->bg->fs_info; | ||||
| 	struct btrfs_bio *bbio = NULL; | ||||
|  | @ -1704,8 +1703,18 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, | |||
| 					      &stripe_len, &bioc, &io_stripe, &mirror); | ||||
| 			btrfs_put_bioc(bioc); | ||||
| 			if (err < 0) { | ||||
| 				set_bit(i, &stripe->io_error_bitmap); | ||||
| 				set_bit(i, &stripe->error_bitmap); | ||||
| 				if (err != -ENODATA) { | ||||
| 					/*
 | ||||
| 					 * Earlier btrfs_get_raid_extent_offset() | ||||
| 					 * returned -ENODATA, which means there's | ||||
| 					 * no entry for the corresponding range | ||||
| 					 * in the stripe tree.  But if it's in | ||||
| 					 * the extent tree, then it's a preallocated | ||||
| 					 * extent and not an error. | ||||
| 					 */ | ||||
| 					set_bit(i, &stripe->io_error_bitmap); | ||||
| 					set_bit(i, &stripe->error_bitmap); | ||||
| 				} | ||||
| 				continue; | ||||
| 			} | ||||
| 
 | ||||
|  | @ -1743,7 +1752,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, | |||
| 	ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); | ||||
| 
 | ||||
| 	if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) { | ||||
| 		scrub_submit_extent_sector_read(sctx, stripe); | ||||
| 		scrub_submit_extent_sector_read(stripe); | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
|  | @ -1954,7 +1963,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, | |||
| 	ASSERT(sctx->raid56_data_stripes); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * For data stripe search, we cannot re-use the same extent/csum paths, | ||||
| 	 * For data stripe search, we cannot reuse the same extent/csum paths, | ||||
| 	 * as the data stripe bytenr may be smaller than previous extent.  Thus | ||||
| 	 * we have to use our own extent/csum paths. | ||||
| 	 */ | ||||
|  | @ -2103,7 +2112,6 @@ out: | |||
|  */ | ||||
| static int scrub_simple_mirror(struct scrub_ctx *sctx, | ||||
| 			       struct btrfs_block_group *bg, | ||||
| 			       struct btrfs_chunk_map *map, | ||||
| 			       u64 logical_start, u64 logical_length, | ||||
| 			       struct btrfs_device *device, | ||||
| 			       u64 physical, int mirror_num) | ||||
|  | @ -2222,7 +2230,7 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, | |||
| 		 * just RAID1, so we can reuse scrub_simple_mirror() to scrub | ||||
| 		 * this stripe. | ||||
| 		 */ | ||||
| 		ret = scrub_simple_mirror(sctx, bg, map, cur_logical, | ||||
| 		ret = scrub_simple_mirror(sctx, bg, cur_logical, | ||||
| 					  BTRFS_STRIPE_LEN, device, cur_physical, | ||||
| 					  mirror_num); | ||||
| 		if (ret) | ||||
|  | @ -2256,7 +2264,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, | |||
| 	/* Offset inside the chunk */ | ||||
| 	u64 offset; | ||||
| 	u64 stripe_logical; | ||||
| 	int stop_loop = 0; | ||||
| 
 | ||||
| 	/* Extent_path should be released by now. */ | ||||
| 	ASSERT(sctx->extent_path.nodes[0] == NULL); | ||||
|  | @ -2307,7 +2314,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, | |||
| 		 * Only @physical and @mirror_num need to be calculated using | ||||
| 		 * @stripe_index. | ||||
| 		 */ | ||||
| 		ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length, | ||||
| 		ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length, | ||||
| 				scrub_dev, map->stripes[stripe_index].physical, | ||||
| 				stripe_index + 1); | ||||
| 		offset = 0; | ||||
|  | @ -2362,7 +2369,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, | |||
| 		 * We can reuse scrub_simple_mirror() here, as the repair part | ||||
| 		 * is still based on @mirror_num. | ||||
| 		 */ | ||||
| 		ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN, | ||||
| 		ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN, | ||||
| 					  scrub_dev, physical, 1); | ||||
| 		if (ret < 0) | ||||
| 			goto out; | ||||
|  | @ -2370,14 +2377,8 @@ next: | |||
| 		logical += increment; | ||||
| 		physical += BTRFS_STRIPE_LEN; | ||||
| 		spin_lock(&sctx->stat_lock); | ||||
| 		if (stop_loop) | ||||
| 			sctx->stat.last_physical = | ||||
| 				map->stripes[stripe_index].physical + dev_stripe_len; | ||||
| 		else | ||||
| 			sctx->stat.last_physical = physical; | ||||
| 		sctx->stat.last_physical = physical; | ||||
| 		spin_unlock(&sctx->stat_lock); | ||||
| 		if (stop_loop) | ||||
| 			break; | ||||
| 	} | ||||
| out: | ||||
| 	ret2 = flush_scrub_stripes(sctx); | ||||
|  |  | |||
|  | @ -980,9 +980,7 @@ static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) | |||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, | ||||
| 				   struct fs_path *p, | ||||
| 				   void *ctx); | ||||
| typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx); | ||||
| 
 | ||||
| /*
 | ||||
|  * Helper function to iterate the entries in ONE btrfs_inode_ref or | ||||
|  | @ -1007,8 +1005,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, | |||
| 	u32 name_len; | ||||
| 	char *start; | ||||
| 	int ret = 0; | ||||
| 	int num = 0; | ||||
| 	int index; | ||||
| 	u64 dir; | ||||
| 	unsigned long name_off; | ||||
| 	unsigned long elem_size; | ||||
|  | @ -1043,13 +1039,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, | |||
| 			iref = (struct btrfs_inode_ref *)(ptr + cur); | ||||
| 			name_len = btrfs_inode_ref_name_len(eb, iref); | ||||
| 			name_off = (unsigned long)(iref + 1); | ||||
| 			index = btrfs_inode_ref_index(eb, iref); | ||||
| 			dir = found_key->offset; | ||||
| 		} else { | ||||
| 			extref = (struct btrfs_inode_extref *)(ptr + cur); | ||||
| 			name_len = btrfs_inode_extref_name_len(eb, extref); | ||||
| 			name_off = (unsigned long)&extref->name; | ||||
| 			index = btrfs_inode_extref_index(eb, extref); | ||||
| 			dir = btrfs_inode_extref_parent(eb, extref); | ||||
| 		} | ||||
| 
 | ||||
|  | @ -1094,10 +1088,9 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, | |||
| 		} | ||||
| 
 | ||||
| 		cur += elem_size + name_len; | ||||
| 		ret = iterate(num, dir, index, p, ctx); | ||||
| 		ret = iterate(dir, p, ctx); | ||||
| 		if (ret) | ||||
| 			goto out; | ||||
| 		num++; | ||||
| 	} | ||||
| 
 | ||||
| out: | ||||
|  | @ -1227,8 +1220,7 @@ out: | |||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static int __copy_first_ref(int num, u64 dir, int index, | ||||
| 			    struct fs_path *p, void *ctx) | ||||
| static int __copy_first_ref(u64 dir, struct fs_path *p, void *ctx) | ||||
| { | ||||
| 	int ret; | ||||
| 	struct fs_path *pt = ctx; | ||||
|  | @ -3768,7 +3760,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, | |||
| 				  struct recorded_ref *parent_ref, | ||||
| 				  const bool is_orphan) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info; | ||||
| 	struct btrfs_path *path; | ||||
| 	struct btrfs_key key; | ||||
| 	struct btrfs_key di_key; | ||||
|  | @ -3797,7 +3788,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, | |||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name, | ||||
| 	di = btrfs_match_dir_item_name(path, parent_ref->name, | ||||
| 				       parent_ref->name_len); | ||||
| 	if (!di) { | ||||
| 		ret = 0; | ||||
|  | @ -4708,8 +4699,7 @@ out: | |||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static int record_new_ref_if_needed(int num, u64 dir, int index, | ||||
| 				    struct fs_path *name, void *ctx) | ||||
| static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) | ||||
| { | ||||
| 	int ret = 0; | ||||
| 	struct send_ctx *sctx = ctx; | ||||
|  | @ -4738,8 +4728,7 @@ out: | |||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static int record_deleted_ref_if_needed(int num, u64 dir, int index, | ||||
| 					struct fs_path *name, void *ctx) | ||||
| static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) | ||||
| { | ||||
| 	int ret = 0; | ||||
| 	struct send_ctx *sctx = ctx; | ||||
|  | @ -5677,10 +5666,11 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, | |||
| 	 * Note that send_buf is a mapping of send_buf_pages, so this is really | ||||
| 	 * reading into send_buf. | ||||
| 	 */ | ||||
| 	ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset, | ||||
| 	ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), | ||||
| 						    disk_bytenr, disk_num_bytes, | ||||
| 						    sctx->send_buf_pages + | ||||
| 						    (data_offset >> PAGE_SHIFT)); | ||||
| 						    (data_offset >> PAGE_SHIFT), | ||||
| 						    NULL); | ||||
| 	if (ret) | ||||
| 		goto out; | ||||
| 
 | ||||
|  | @ -8135,7 +8125,20 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a | |||
| 	 * making it RW. This also protects against deletion. | ||||
| 	 */ | ||||
| 	spin_lock(&send_root->root_item_lock); | ||||
| 	if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) { | ||||
| 	/*
 | ||||
| 	 * Unlikely but possible, if the subvolume is marked for deletion but | ||||
| 	 * is slow to remove the directory entry, send can still be started. | ||||
| 	 */ | ||||
| 	if (btrfs_root_dead(send_root)) { | ||||
| 		spin_unlock(&send_root->root_item_lock); | ||||
| 		return -EPERM; | ||||
| 	} | ||||
| 	/* Userspace tools do the checks and warn the user if it's not RO. */ | ||||
| 	if (!btrfs_root_readonly(send_root)) { | ||||
| 		spin_unlock(&send_root->root_item_lock); | ||||
| 		return -EPERM; | ||||
| 	} | ||||
| 	if (send_root->dedupe_in_progress) { | ||||
| 		dedupe_in_progress_warn(send_root); | ||||
| 		spin_unlock(&send_root->root_item_lock); | ||||
| 		return -EAGAIN; | ||||
|  | @ -8143,15 +8146,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a | |||
| 	send_root->send_in_progress++; | ||||
| 	spin_unlock(&send_root->root_item_lock); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Userspace tools do the checks and warn the user if it's | ||||
| 	 * not RO. | ||||
| 	 */ | ||||
| 	if (!btrfs_root_readonly(send_root)) { | ||||
| 		ret = -EPERM; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Check that we don't overflow at later allocations, we request | ||||
| 	 * clone_sources_count + 1 items, and compare to unsigned long inside | ||||
|  | @ -8217,15 +8211,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a | |||
| 	} | ||||
| 
 | ||||
| 	sctx->send_root = send_root; | ||||
| 	/*
 | ||||
| 	 * Unlikely but possible, if the subvolume is marked for deletion but | ||||
| 	 * is slow to remove the directory entry, send can still be started | ||||
| 	 */ | ||||
| 	if (btrfs_root_dead(sctx->send_root)) { | ||||
| 		ret = -EPERM; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	sctx->clone_roots_cnt = arg->clone_sources_count; | ||||
| 
 | ||||
| 	if (sctx->proto >= 2) { | ||||
|  |  | |||
|  | @ -16,7 +16,7 @@ struct btrfs_ioctl_send_args; | |||
| 
 | ||||
| #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" | ||||
| /* Conditional support for the upcoming protocol version. */ | ||||
| #ifdef CONFIG_BTRFS_DEBUG | ||||
| #ifdef CONFIG_BTRFS_EXPERIMENTAL | ||||
| #define BTRFS_SEND_STREAM_VERSION 3 | ||||
| #else | ||||
| #define BTRFS_SEND_STREAM_VERSION 2 | ||||
|  |  | |||
|  | @ -1279,7 +1279,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) | |||
|  *   If we are freeing inodes, we want to make sure all delayed iputs have | ||||
|  *   completed, because they could have been on an inode with i_nlink == 0, and | ||||
|  *   thus have been truncated and freed up space.  But again this space is not | ||||
|  *   immediately re-usable, it comes in the form of a delayed ref, which must be | ||||
|  *   immediately reusable, it comes in the form of a delayed ref, which must be | ||||
|  *   run and then the transaction must be committed. | ||||
|  * | ||||
|  * COMMIT_TRANS | ||||
|  | @ -1488,8 +1488,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, | |||
| 	spin_unlock(&space_info->lock); | ||||
| } | ||||
| 
 | ||||
| static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, | ||||
| 				struct btrfs_space_info *space_info, | ||||
| static void wait_reserve_ticket(struct btrfs_space_info *space_info, | ||||
| 				struct reserve_ticket *ticket) | ||||
| 
 | ||||
| { | ||||
|  | @ -1547,7 +1546,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, | |||
| 	case BTRFS_RESERVE_FLUSH_DATA: | ||||
| 	case BTRFS_RESERVE_FLUSH_ALL: | ||||
| 	case BTRFS_RESERVE_FLUSH_ALL_STEAL: | ||||
| 		wait_reserve_ticket(fs_info, space_info, ticket); | ||||
| 		wait_reserve_ticket(space_info, ticket); | ||||
| 		break; | ||||
| 	case BTRFS_RESERVE_FLUSH_LIMIT: | ||||
| 		priority_reclaim_metadata_space(fs_info, space_info, ticket, | ||||
|  | @ -1984,8 +1983,7 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info) | |||
| 	return unalloc < data_chunk_size; | ||||
| } | ||||
| 
 | ||||
| static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info, | ||||
| 			     struct btrfs_space_info *space_info, int raid) | ||||
| static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) | ||||
| { | ||||
| 	struct btrfs_block_group *bg; | ||||
| 	int thresh_pct; | ||||
|  | @ -2081,6 +2079,6 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) | |||
| 		if (!btrfs_should_periodic_reclaim(space_info)) | ||||
| 			continue; | ||||
| 		for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) | ||||
| 			do_reclaim_sweep(fs_info, space_info, raid); | ||||
| 			do_reclaim_sweep(space_info, raid); | ||||
| 	} | ||||
| } | ||||
|  |  | |||
|  | @ -140,12 +140,10 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, | |||
| 		return ERR_PTR(-ENOMEM); | ||||
| 
 | ||||
| 	spin_lock_init(&ret->lock); | ||||
| 	if (type == BTRFS_SUBPAGE_METADATA) { | ||||
| 	if (type == BTRFS_SUBPAGE_METADATA) | ||||
| 		atomic_set(&ret->eb_refs, 0); | ||||
| 	} else { | ||||
| 		atomic_set(&ret->readers, 0); | ||||
| 		atomic_set(&ret->writers, 0); | ||||
| 	} | ||||
| 	else | ||||
| 		atomic_set(&ret->nr_locked, 0); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
|  | @ -221,62 +219,6 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, | |||
| 	__start_bit;							\ | ||||
| }) | ||||
| 
 | ||||
| void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, | ||||
| 				struct folio *folio, u64 start, u32 len) | ||||
| { | ||||
| 	struct btrfs_subpage *subpage = folio_get_private(folio); | ||||
| 	const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); | ||||
| 	const int nbits = len >> fs_info->sectorsize_bits; | ||||
| 	unsigned long flags; | ||||
| 
 | ||||
| 
 | ||||
| 	btrfs_subpage_assert(fs_info, folio, start, len); | ||||
| 
 | ||||
| 	spin_lock_irqsave(&subpage->lock, flags); | ||||
| 	/*
 | ||||
| 	 * Even though it's just for reading the page, no one should have | ||||
| 	 * locked the subpage range. | ||||
| 	 */ | ||||
| 	ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); | ||||
| 	bitmap_set(subpage->bitmaps, start_bit, nbits); | ||||
| 	atomic_add(nbits, &subpage->readers); | ||||
| 	spin_unlock_irqrestore(&subpage->lock, flags); | ||||
| } | ||||
| 
 | ||||
| void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, | ||||
| 			      struct folio *folio, u64 start, u32 len) | ||||
| { | ||||
| 	struct btrfs_subpage *subpage = folio_get_private(folio); | ||||
| 	const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); | ||||
| 	const int nbits = len >> fs_info->sectorsize_bits; | ||||
| 	unsigned long flags; | ||||
| 	bool is_data; | ||||
| 	bool last; | ||||
| 
 | ||||
| 	btrfs_subpage_assert(fs_info, folio, start, len); | ||||
| 	is_data = is_data_inode(BTRFS_I(folio->mapping->host)); | ||||
| 
 | ||||
| 	spin_lock_irqsave(&subpage->lock, flags); | ||||
| 
 | ||||
| 	/* The range should have already been locked. */ | ||||
| 	ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits)); | ||||
| 	ASSERT(atomic_read(&subpage->readers) >= nbits); | ||||
| 
 | ||||
| 	bitmap_clear(subpage->bitmaps, start_bit, nbits); | ||||
| 	last = atomic_sub_and_test(nbits, &subpage->readers); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * For data we need to unlock the page if the last read has finished. | ||||
| 	 * | ||||
| 	 * And please don't replace @last with atomic_sub_and_test() call | ||||
| 	 * inside if () condition. | ||||
| 	 * As we want the atomic_sub_and_test() to be always executed. | ||||
| 	 */ | ||||
| 	if (is_data && last) | ||||
| 		folio_unlock(folio); | ||||
| 	spin_unlock_irqrestore(&subpage->lock, flags); | ||||
| } | ||||
| 
 | ||||
| static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) | ||||
| { | ||||
| 	u64 orig_start = *start; | ||||
|  | @ -295,28 +237,8 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) | |||
| 			     orig_start + orig_len) - *start; | ||||
| } | ||||
| 
 | ||||
| static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, | ||||
| 				       struct folio *folio, u64 start, u32 len) | ||||
| { | ||||
| 	struct btrfs_subpage *subpage = folio_get_private(folio); | ||||
| 	const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); | ||||
| 	const int nbits = (len >> fs_info->sectorsize_bits); | ||||
| 	unsigned long flags; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	btrfs_subpage_assert(fs_info, folio, start, len); | ||||
| 
 | ||||
| 	spin_lock_irqsave(&subpage->lock, flags); | ||||
| 	ASSERT(atomic_read(&subpage->readers) == 0); | ||||
| 	ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); | ||||
| 	bitmap_set(subpage->bitmaps, start_bit, nbits); | ||||
| 	ret = atomic_add_return(nbits, &subpage->writers); | ||||
| 	ASSERT(ret == nbits); | ||||
| 	spin_unlock_irqrestore(&subpage->lock, flags); | ||||
| } | ||||
| 
 | ||||
| static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, | ||||
| 					      struct folio *folio, u64 start, u32 len) | ||||
| static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, | ||||
| 					    struct folio *folio, u64 start, u32 len) | ||||
| { | ||||
| 	struct btrfs_subpage *subpage = folio_get_private(folio); | ||||
| 	const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); | ||||
|  | @ -334,9 +256,9 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf | |||
| 	 * extent_clear_unlock_delalloc() for compression path. | ||||
| 	 * | ||||
| 	 * This @locked_page is locked by plain lock_page(), thus its | ||||
| 	 * subpage::writers is 0.  Handle them in a special way. | ||||
| 	 * subpage::nr_locked is 0.  Handle them in a special way. | ||||
| 	 */ | ||||
| 	if (atomic_read(&subpage->writers) == 0) { | ||||
| 	if (atomic_read(&subpage->nr_locked) == 0) { | ||||
| 		spin_unlock_irqrestore(&subpage->lock, flags); | ||||
| 		return true; | ||||
| 	} | ||||
|  | @ -345,39 +267,12 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf | |||
| 		clear_bit(bit, subpage->bitmaps); | ||||
| 		cleared++; | ||||
| 	} | ||||
| 	ASSERT(atomic_read(&subpage->writers) >= cleared); | ||||
| 	last = atomic_sub_and_test(cleared, &subpage->writers); | ||||
| 	ASSERT(atomic_read(&subpage->nr_locked) >= cleared); | ||||
| 	last = atomic_sub_and_test(cleared, &subpage->nr_locked); | ||||
| 	spin_unlock_irqrestore(&subpage->lock, flags); | ||||
| 	return last; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Lock a folio for delalloc page writeback. | ||||
|  * | ||||
|  * Return -EAGAIN if the page is not properly initialized. | ||||
|  * Return 0 with the page locked, and writer counter updated. | ||||
|  * | ||||
|  * Even with 0 returned, the page still need extra check to make sure | ||||
|  * it's really the correct page, as the caller is using | ||||
|  * filemap_get_folios_contig(), which can race with page invalidating. | ||||
|  */ | ||||
| int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, | ||||
| 				  struct folio *folio, u64 start, u32 len) | ||||
| { | ||||
| 	if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { | ||||
| 		folio_lock(folio); | ||||
| 		return 0; | ||||
| 	} | ||||
| 	folio_lock(folio); | ||||
| 	if (!folio_test_private(folio) || !folio_get_private(folio)) { | ||||
| 		folio_unlock(folio); | ||||
| 		return -EAGAIN; | ||||
| 	} | ||||
| 	btrfs_subpage_clamp_range(folio, &start, &len); | ||||
| 	btrfs_subpage_start_writer(fs_info, folio, start, len); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Handle different locked folios: | ||||
|  * | ||||
|  | @ -394,8 +289,8 @@ int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, | |||
|  *   bitmap, reduce the writer lock number, and unlock the page if that's | ||||
|  *   the last locked range. | ||||
|  */ | ||||
| void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, | ||||
| 				 struct folio *folio, u64 start, u32 len) | ||||
| void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, | ||||
| 			  struct folio *folio, u64 start, u32 len) | ||||
| { | ||||
| 	struct btrfs_subpage *subpage = folio_get_private(folio); | ||||
| 
 | ||||
|  | @ -408,24 +303,24 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, | |||
| 
 | ||||
| 	/*
 | ||||
| 	 * For subpage case, there are two types of locked page.  With or | ||||
| 	 * without writers number. | ||||
| 	 * without locked number. | ||||
| 	 * | ||||
| 	 * Since we own the page lock, no one else could touch subpage::writers | ||||
| 	 * Since we own the page lock, no one else could touch subpage::nr_locked | ||||
| 	 * and we are safe to do several atomic operations without spinlock. | ||||
| 	 */ | ||||
| 	if (atomic_read(&subpage->writers) == 0) { | ||||
| 		/* No writers, locked by plain lock_page(). */ | ||||
| 	if (atomic_read(&subpage->nr_locked) == 0) { | ||||
| 		/* No subpage lock, locked by plain lock_page(). */ | ||||
| 		folio_unlock(folio); | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	btrfs_subpage_clamp_range(folio, &start, &len); | ||||
| 	if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len)) | ||||
| 	if (btrfs_subpage_end_and_test_lock(fs_info, folio, start, len)) | ||||
| 		folio_unlock(folio); | ||||
| } | ||||
| 
 | ||||
| void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, | ||||
| 					struct folio *folio, unsigned long bitmap) | ||||
| void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, | ||||
| 				 struct folio *folio, unsigned long bitmap) | ||||
| { | ||||
| 	struct btrfs_subpage *subpage = folio_get_private(folio); | ||||
| 	const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked; | ||||
|  | @ -434,13 +329,13 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, | |||
| 	int cleared = 0; | ||||
| 	int bit; | ||||
| 
 | ||||
| 	if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { | ||||
| 	if (!btrfs_is_subpage(fs_info, folio->mapping)) { | ||||
| 		folio_unlock(folio); | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	if (atomic_read(&subpage->writers) == 0) { | ||||
| 		/* No writers, locked by plain lock_page(). */ | ||||
| 	if (atomic_read(&subpage->nr_locked) == 0) { | ||||
| 		/* No subpage lock, locked by plain lock_page(). */ | ||||
| 		folio_unlock(folio); | ||||
| 		return; | ||||
| 	} | ||||
|  | @ -450,8 +345,8 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, | |||
| 		if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) | ||||
| 			cleared++; | ||||
| 	} | ||||
| 	ASSERT(atomic_read(&subpage->writers) >= cleared); | ||||
| 	last = atomic_sub_and_test(cleared, &subpage->writers); | ||||
| 	ASSERT(atomic_read(&subpage->nr_locked) >= cleared); | ||||
| 	last = atomic_sub_and_test(cleared, &subpage->nr_locked); | ||||
| 	spin_unlock_irqrestore(&subpage->lock, flags); | ||||
| 	if (last) | ||||
| 		folio_unlock(folio); | ||||
|  | @ -776,8 +671,8 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, | |||
|  * This populates the involved subpage ranges so that subpage helpers can | ||||
|  * properly unlock them. | ||||
|  */ | ||||
| void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, | ||||
| 				 struct folio *folio, u64 start, u32 len) | ||||
| void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, | ||||
| 			  struct folio *folio, u64 start, u32 len) | ||||
| { | ||||
| 	struct btrfs_subpage *subpage; | ||||
| 	unsigned long flags; | ||||
|  | @ -796,58 +691,11 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, | |||
| 	/* Target range should not yet be locked. */ | ||||
| 	ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); | ||||
| 	bitmap_set(subpage->bitmaps, start_bit, nbits); | ||||
| 	ret = atomic_add_return(nbits, &subpage->writers); | ||||
| 	ret = atomic_add_return(nbits, &subpage->nr_locked); | ||||
| 	ASSERT(ret <= fs_info->sectors_per_page); | ||||
| 	spin_unlock_irqrestore(&subpage->lock, flags); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Find any subpage writer locked range inside @folio, starting at file offset | ||||
|  * @search_start. The caller should ensure the folio is locked. | ||||
|  * | ||||
|  * Return true and update @found_start_ret and @found_len_ret to the first | ||||
|  * writer locked range. | ||||
|  * Return false if there is no writer locked range. | ||||
|  */ | ||||
| bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, | ||||
| 				      struct folio *folio, u64 search_start, | ||||
| 				      u64 *found_start_ret, u32 *found_len_ret) | ||||
| { | ||||
| 	struct btrfs_subpage *subpage = folio_get_private(folio); | ||||
| 	const u32 sectors_per_page = fs_info->sectors_per_page; | ||||
| 	const unsigned int len = PAGE_SIZE - offset_in_page(search_start); | ||||
| 	const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, | ||||
| 						locked, search_start, len); | ||||
| 	const unsigned int locked_bitmap_start = sectors_per_page * btrfs_bitmap_nr_locked; | ||||
| 	const unsigned int locked_bitmap_end = locked_bitmap_start + sectors_per_page; | ||||
| 	unsigned long flags; | ||||
| 	int first_zero; | ||||
| 	int first_set; | ||||
| 	bool found = false; | ||||
| 
 | ||||
| 	ASSERT(folio_test_locked(folio)); | ||||
| 	spin_lock_irqsave(&subpage->lock, flags); | ||||
| 	first_set = find_next_bit(subpage->bitmaps, locked_bitmap_end, start_bit); | ||||
| 	if (first_set >= locked_bitmap_end) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	found = true; | ||||
| 
 | ||||
| 	*found_start_ret = folio_pos(folio) + | ||||
| 		((first_set - locked_bitmap_start) << fs_info->sectorsize_bits); | ||||
| 	/*
 | ||||
| 	 * Since @first_set is ensured to be smaller than locked_bitmap_end | ||||
| 	 * here, @found_start_ret should be inside the folio. | ||||
| 	 */ | ||||
| 	ASSERT(*found_start_ret < folio_pos(folio) + PAGE_SIZE); | ||||
| 
 | ||||
| 	first_zero = find_next_zero_bit(subpage->bitmaps, locked_bitmap_end, first_set); | ||||
| 	*found_len_ret = (first_zero - first_set) << fs_info->sectorsize_bits; | ||||
| out: | ||||
| 	spin_unlock_irqrestore(&subpage->lock, flags); | ||||
| 	return found; | ||||
| } | ||||
| 
 | ||||
| #define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst)			\ | ||||
| {									\ | ||||
| 	const int sectors_per_page = fs_info->sectors_per_page;		\ | ||||
|  |  | |||
|  | @ -45,14 +45,6 @@ enum { | |||
| struct btrfs_subpage { | ||||
| 	/* Common members for both data and metadata pages */ | ||||
| 	spinlock_t lock; | ||||
| 	/*
 | ||||
| 	 * Both data and metadata needs to track how many readers are for the | ||||
| 	 * page. | ||||
| 	 * Data relies on @readers to unlock the page when last reader finished. | ||||
| 	 * While metadata doesn't need page unlock, it needs to prevent | ||||
| 	 * page::private get cleared before the last end_page_read(). | ||||
| 	 */ | ||||
| 	atomic_t readers; | ||||
| 	union { | ||||
| 		/*
 | ||||
| 		 * Structures only used by metadata | ||||
|  | @ -62,8 +54,12 @@ struct btrfs_subpage { | |||
| 		 */ | ||||
| 		atomic_t eb_refs; | ||||
| 
 | ||||
| 		/* Structures only used by data */ | ||||
| 		atomic_t writers; | ||||
| 		/*
 | ||||
| 		 * Structures only used by data, | ||||
| 		 * | ||||
| 		 * How many sectors inside the page are locked. | ||||
| 		 */ | ||||
| 		atomic_t nr_locked; | ||||
| 	}; | ||||
| 	unsigned long bitmaps[]; | ||||
| }; | ||||
|  | @ -95,23 +91,12 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage); | |||
| void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); | ||||
| void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); | ||||
| 
 | ||||
| void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, | ||||
| 				struct folio *folio, u64 start, u32 len); | ||||
| void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, | ||||
| 			      struct folio *folio, u64 start, u32 len); | ||||
| 
 | ||||
| int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, | ||||
| 				  struct folio *folio, u64 start, u32 len); | ||||
| void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, | ||||
| 				 struct folio *folio, u64 start, u32 len); | ||||
| void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, | ||||
| 				 struct folio *folio, u64 start, u32 len); | ||||
| void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, | ||||
| 					struct folio *folio, unsigned long bitmap); | ||||
| bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, | ||||
| 				      struct folio *folio, u64 search_start, | ||||
| 				      u64 *found_start_ret, u32 *found_len_ret); | ||||
| 
 | ||||
| void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, | ||||
| 			  struct folio *folio, u64 start, u32 len); | ||||
| void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, | ||||
| 			  struct folio *folio, u64 start, u32 len); | ||||
| void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, | ||||
| 				 struct folio *folio, unsigned long bitmap); | ||||
| /*
 | ||||
|  * Template for subpage related operations. | ||||
|  * | ||||
|  |  | |||
|  | @ -28,7 +28,6 @@ | |||
| #include <linux/btrfs.h> | ||||
| #include <linux/security.h> | ||||
| #include <linux/fs_parser.h> | ||||
| #include <linux/swap.h> | ||||
| #include "messages.h" | ||||
| #include "delayed-inode.h" | ||||
| #include "ctree.h" | ||||
|  | @ -946,8 +945,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec | |||
| } | ||||
| 
 | ||||
| static int btrfs_fill_super(struct super_block *sb, | ||||
| 			    struct btrfs_fs_devices *fs_devices, | ||||
| 			    void *data) | ||||
| 			    struct btrfs_fs_devices *fs_devices) | ||||
| { | ||||
| 	struct inode *inode; | ||||
| 	struct btrfs_fs_info *fs_info = btrfs_sb(sb); | ||||
|  | @ -971,7 +969,7 @@ static int btrfs_fill_super(struct super_block *sb, | |||
| 		return err; | ||||
| 	} | ||||
| 
 | ||||
| 	err = open_ctree(sb, fs_devices, (char *)data); | ||||
| 	err = open_ctree(sb, fs_devices); | ||||
| 	if (err) { | ||||
| 		btrfs_err(fs_info, "open_ctree failed"); | ||||
| 		return err; | ||||
|  | @ -1893,7 +1891,7 @@ static int btrfs_get_tree_super(struct fs_context *fc) | |||
| 		snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); | ||||
| 		shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); | ||||
| 		btrfs_sb(sb)->bdev_holder = &btrfs_fs_type; | ||||
| 		ret = btrfs_fill_super(sb, fs_devices, NULL); | ||||
| 		ret = btrfs_fill_super(sb, fs_devices); | ||||
| 	} | ||||
| 
 | ||||
| 	if (ret) { | ||||
|  | @ -2257,7 +2255,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, | |||
| 		device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); | ||||
| 		if (IS_ERR_OR_NULL(device)) { | ||||
| 			mutex_unlock(&uuid_mutex); | ||||
| 			ret = PTR_ERR(device); | ||||
| 			if (IS_ERR(device)) | ||||
| 				ret = PTR_ERR(device); | ||||
| 			else | ||||
| 				ret = 0; | ||||
| 			break; | ||||
| 		} | ||||
| 		ret = !(device->fs_devices->num_devices == | ||||
|  | @ -2396,13 +2397,7 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_contro | |||
| 
 | ||||
| 	trace_btrfs_extent_map_shrinker_count(fs_info, nr); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Only report the real number for DEBUG builds, as there are reports of | ||||
| 	 * serious performance degradation caused by too frequent shrinks. | ||||
| 	 */ | ||||
| 	if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) | ||||
| 		return nr; | ||||
| 	return 0; | ||||
| 	return nr; | ||||
| } | ||||
| 
 | ||||
| static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) | ||||
|  | @ -2410,16 +2405,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont | |||
| 	const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); | ||||
| 	struct btrfs_fs_info *fs_info = btrfs_sb(sb); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * We may be called from any task trying to allocate memory and we don't | ||||
| 	 * want to slow it down with scanning and dropping extent maps. It would | ||||
| 	 * also cause heavy lock contention if many tasks concurrently enter | ||||
| 	 * here. Therefore only allow kswapd tasks to scan and drop extent maps. | ||||
| 	 */ | ||||
| 	if (!current_is_kswapd()) | ||||
| 		return 0; | ||||
| 	btrfs_free_extent_maps(fs_info, nr_to_scan); | ||||
| 
 | ||||
| 	return btrfs_free_extent_maps(fs_info, nr_to_scan); | ||||
| 	/* The extent map shrinker runs asynchronously, so always return 0. */ | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static const struct super_operations btrfs_super_ops = { | ||||
|  |  | |||
|  | @ -1390,7 +1390,7 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, | |||
| BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, | ||||
| 	      btrfs_bg_reclaim_threshold_store); | ||||
| 
 | ||||
| #ifdef CONFIG_BTRFS_DEBUG | ||||
| #ifdef CONFIG_BTRFS_EXPERIMENTAL | ||||
| static ssize_t btrfs_offload_csum_show(struct kobject *kobj, | ||||
| 				       struct kobj_attribute *a, char *buf) | ||||
| { | ||||
|  | @ -1450,7 +1450,7 @@ static const struct attribute *btrfs_attrs[] = { | |||
| 	BTRFS_ATTR_PTR(, bg_reclaim_threshold), | ||||
| 	BTRFS_ATTR_PTR(, commit_stats), | ||||
| 	BTRFS_ATTR_PTR(, temp_fsid), | ||||
| #ifdef CONFIG_BTRFS_DEBUG | ||||
| #ifdef CONFIG_BTRFS_EXPERIMENTAL | ||||
| 	BTRFS_ATTR_PTR(, offload_csum), | ||||
| #endif | ||||
| 	NULL, | ||||
|  |  | |||
|  | @ -29,6 +29,7 @@ const char *test_error[] = { | |||
| 	[TEST_ALLOC_BLOCK_GROUP]     = "cannot allocate block group", | ||||
| 	[TEST_ALLOC_EXTENT_MAP]      = "cannot allocate extent map", | ||||
| 	[TEST_ALLOC_CHUNK_MAP]       = "cannot allocate chunk map", | ||||
| 	[TEST_ALLOC_IO_CONTEXT]	     = "cannot allocate io context", | ||||
| }; | ||||
| 
 | ||||
| static const struct super_operations btrfs_test_super_ops = { | ||||
|  | @ -291,6 +292,9 @@ int btrfs_run_sanity_tests(void) | |||
| 			ret = btrfs_test_free_space_tree(sectorsize, nodesize); | ||||
| 			if (ret) | ||||
| 				goto out; | ||||
| 			ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize); | ||||
| 			if (ret) | ||||
| 				goto out; | ||||
| 		} | ||||
| 	} | ||||
| 	ret = btrfs_test_extent_map(); | ||||
|  |  | |||
|  | @ -24,6 +24,7 @@ enum { | |||
| 	TEST_ALLOC_BLOCK_GROUP, | ||||
| 	TEST_ALLOC_EXTENT_MAP, | ||||
| 	TEST_ALLOC_CHUNK_MAP, | ||||
| 	TEST_ALLOC_IO_CONTEXT, | ||||
| }; | ||||
| 
 | ||||
| extern const char *test_error[]; | ||||
|  | @ -37,6 +38,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); | |||
| int btrfs_test_inodes(u32 sectorsize, u32 nodesize); | ||||
| int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); | ||||
| int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); | ||||
| int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize); | ||||
| int btrfs_test_extent_map(void); | ||||
| struct inode *btrfs_new_test_inode(void); | ||||
| struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); | ||||
|  |  | |||
							
								
								
									
										538
									
								
								fs/btrfs/tests/raid-stripe-tree-tests.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										538
									
								
								fs/btrfs/tests/raid-stripe-tree-tests.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,538 @@ | |||
| // SPDX-License-Identifier: GPL-2.0
 | ||||
| /*
 | ||||
|  * Copyright (C) 2024 Western Digital Corporation or its affiliates. | ||||
|  */ | ||||
| 
 | ||||
| #include <linux/sizes.h> | ||||
| #include "../fs.h" | ||||
| #include "../disk-io.h" | ||||
| #include "../transaction.h" | ||||
| #include "../volumes.h" | ||||
| #include "../raid-stripe-tree.h" | ||||
| #include "btrfs-tests.h" | ||||
| 
 | ||||
| #define RST_TEST_NUM_DEVICES	(2) | ||||
| #define RST_TEST_RAID1_TYPE	(BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1) | ||||
| 
 | ||||
| typedef int (*test_func_t)(struct btrfs_trans_handle *trans); | ||||
| 
 | ||||
| static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices, | ||||
| 						  u64 devid) | ||||
| { | ||||
| 	struct btrfs_device *dev; | ||||
| 
 | ||||
| 	list_for_each_entry(dev, &fs_devices->devices, dev_list) { | ||||
| 		if (dev->devid == devid) | ||||
| 			return dev; | ||||
| 	} | ||||
| 
 | ||||
| 	return NULL; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then | ||||
|  * delete the 1st 32K, making the new start address 1M+32K. | ||||
|  */ | ||||
| static int test_front_delete(struct btrfs_trans_handle *trans) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = trans->fs_info; | ||||
| 	struct btrfs_io_context *bioc; | ||||
| 	struct btrfs_io_stripe io_stripe = { 0 }; | ||||
| 	u64 map_type = RST_TEST_RAID1_TYPE; | ||||
| 	u64 logical = SZ_1M; | ||||
| 	u64 len = SZ_64K; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); | ||||
| 	if (!bioc) { | ||||
| 		test_std_err(TEST_ALLOC_IO_CONTEXT); | ||||
| 		ret = -ENOMEM; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); | ||||
| 	bioc->map_type = map_type; | ||||
| 	bioc->size = len; | ||||
| 
 | ||||
| 	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { | ||||
| 		struct btrfs_io_stripe *stripe = &bioc->stripes[i]; | ||||
| 
 | ||||
| 		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); | ||||
| 		if (!stripe->dev) { | ||||
| 			test_err("cannot find device with devid %d", i); | ||||
| 			ret = -EINVAL; | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		stripe->physical = logical + i * SZ_1G; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_insert_one_raid_extent(trans, bioc); | ||||
| 	if (ret) { | ||||
| 		test_err("inserting RAID extent failed: %d", ret); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); | ||||
| 	if (ret) { | ||||
| 		test_err("lookup of RAID extent [%llu, %llu] failed", logical, | ||||
| 			 logical + len); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (io_stripe.physical != logical) { | ||||
| 		test_err("invalid physical address, expected %llu got %llu", | ||||
| 			 logical, io_stripe.physical); | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (len != SZ_64K) { | ||||
| 		test_err("invalid stripe length, expected %llu got %llu", | ||||
| 			 (u64)SZ_64K, len); | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_delete_raid_extent(trans, logical, SZ_32K); | ||||
| 	if (ret) { | ||||
| 		test_err("deleting RAID extent [%llu, %llu] failed", logical, | ||||
| 			 logical + SZ_32K); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	len = SZ_32K; | ||||
| 	ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, &len, | ||||
| 					   map_type, 0, &io_stripe); | ||||
| 	if (ret) { | ||||
| 		test_err("lookup of RAID extent [%llu, %llu] failed", | ||||
| 			 logical + SZ_32K, logical + SZ_32K + len); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (io_stripe.physical != logical + SZ_32K) { | ||||
| 		test_err("invalid physical address, expected %llu, got %llu", | ||||
| 			 logical + SZ_32K, io_stripe.physical); | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (len != SZ_32K) { | ||||
| 		test_err("invalid stripe length, expected %llu, got %llu", | ||||
| 			 (u64)SZ_32K, len); | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); | ||||
| 	if (!ret) { | ||||
| 		ret = -EINVAL; | ||||
| 		test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail", | ||||
| 			 logical, logical + SZ_32K); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); | ||||
| out: | ||||
| 	btrfs_put_bioc(bioc); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then | ||||
|  * truncate the stripe extent down to 32K. | ||||
|  */ | ||||
| static int test_tail_delete(struct btrfs_trans_handle *trans) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = trans->fs_info; | ||||
| 	struct btrfs_io_context *bioc; | ||||
| 	struct btrfs_io_stripe io_stripe = { 0 }; | ||||
| 	u64 map_type = RST_TEST_RAID1_TYPE; | ||||
| 	u64 logical = SZ_1M; | ||||
| 	u64 len = SZ_64K; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); | ||||
| 	if (!bioc) { | ||||
| 		test_std_err(TEST_ALLOC_IO_CONTEXT); | ||||
| 		ret = -ENOMEM; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); | ||||
| 	bioc->map_type = map_type; | ||||
| 	bioc->size = len; | ||||
| 
 | ||||
| 	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { | ||||
| 		struct btrfs_io_stripe *stripe = &bioc->stripes[i]; | ||||
| 
 | ||||
| 		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); | ||||
| 		if (!stripe->dev) { | ||||
| 			test_err("cannot find device with devid %d", i); | ||||
| 			ret = -EINVAL; | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		stripe->physical = logical + i * SZ_1G; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_insert_one_raid_extent(trans, bioc); | ||||
| 	if (ret) { | ||||
| 		test_err("inserting RAID extent failed: %d", ret); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); | ||||
| 	if (!io_stripe.dev) { | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); | ||||
| 	if (ret) { | ||||
| 		test_err("lookup of RAID extent [%llu, %llu] failed", logical, | ||||
| 			 logical + len); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (io_stripe.physical != logical) { | ||||
| 		test_err("invalid physical address, expected %llu got %llu", | ||||
| 			 logical, io_stripe.physical); | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (len != SZ_64K) { | ||||
| 		test_err("invalid stripe length, expected %llu got %llu", | ||||
| 			 (u64)SZ_64K, len); | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K); | ||||
| 	if (ret) { | ||||
| 		test_err("deleting RAID extent [%llu, %llu] failed", | ||||
| 			 logical + SZ_32K, logical + SZ_64K); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	len = SZ_32K; | ||||
| 	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); | ||||
| 	if (ret) { | ||||
| 		test_err("lookup of RAID extent [%llu, %llu] failed", logical, | ||||
| 			 logical + len); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (io_stripe.physical != logical) { | ||||
| 		test_err("invalid physical address, expected %llu, got %llu", | ||||
| 			 logical, io_stripe.physical); | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (len != SZ_32K) { | ||||
| 		test_err("invalid stripe length, expected %llu, got %llu", | ||||
| 			 (u64)SZ_32K, len); | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_delete_raid_extent(trans, logical, len); | ||||
| 	if (ret) | ||||
| 		test_err("deleting RAID extent [%llu, %llu] failed", logical, | ||||
| 			 logical + len); | ||||
| 
 | ||||
| out: | ||||
| 	btrfs_put_bioc(bioc); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then | ||||
|  * overwrite the whole range giving it new physical address at an offset of 1G. | ||||
|  * The intent of this test is to exercise the 'update_raid_extent_item()' | ||||
|  * function called by btrfs_insert_one_raid_extent(). | ||||
|  */ | ||||
| static int test_create_update_delete(struct btrfs_trans_handle *trans) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = trans->fs_info; | ||||
| 	struct btrfs_io_context *bioc; | ||||
| 	struct btrfs_io_stripe io_stripe = { 0 }; | ||||
| 	u64 map_type = RST_TEST_RAID1_TYPE; | ||||
| 	u64 logical = SZ_1M; | ||||
| 	u64 len = SZ_64K; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); | ||||
| 	if (!bioc) { | ||||
| 		test_std_err(TEST_ALLOC_IO_CONTEXT); | ||||
| 		ret = -ENOMEM; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); | ||||
| 	bioc->map_type = map_type; | ||||
| 	bioc->size = len; | ||||
| 
 | ||||
| 	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { | ||||
| 		struct btrfs_io_stripe *stripe = &bioc->stripes[i]; | ||||
| 
 | ||||
| 		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); | ||||
| 		if (!stripe->dev) { | ||||
| 			test_err("cannot find device with devid %d", i); | ||||
| 			ret = -EINVAL; | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		stripe->physical = logical + i * SZ_1G; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_insert_one_raid_extent(trans, bioc); | ||||
| 	if (ret) { | ||||
| 		test_err("inserting RAID extent failed: %d", ret); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); | ||||
| 	if (!io_stripe.dev) { | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); | ||||
| 	if (ret) { | ||||
| 		test_err("lookup of RAID extent [%llu, %llu] failed", logical, | ||||
| 			 logical + len); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (io_stripe.physical != logical) { | ||||
| 		test_err("invalid physical address, expected %llu got %llu", | ||||
| 			 logical, io_stripe.physical); | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (len != SZ_64K) { | ||||
| 		test_err("invalid stripe length, expected %llu got %llu", | ||||
| 			 (u64)SZ_64K, len); | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { | ||||
| 		struct btrfs_io_stripe *stripe = &bioc->stripes[i]; | ||||
| 
 | ||||
| 		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); | ||||
| 		if (!stripe->dev) { | ||||
| 			test_err("cannot find device with devid %d", i); | ||||
| 			ret = -EINVAL; | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		stripe->physical = SZ_1G + logical + i * SZ_1G; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_insert_one_raid_extent(trans, bioc); | ||||
| 	if (ret) { | ||||
| 		test_err("updating RAID extent failed: %d", ret); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); | ||||
| 	if (ret) { | ||||
| 		test_err("lookup of RAID extent [%llu, %llu] failed", logical, | ||||
| 			 logical + len); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (io_stripe.physical != logical + SZ_1G) { | ||||
| 		test_err("invalid physical address, expected %llu, got %llu", | ||||
| 			 logical + SZ_1G, io_stripe.physical); | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (len != SZ_64K) { | ||||
| 		test_err("invalid stripe length, expected %llu, got %llu", | ||||
| 			 (u64)SZ_64K, len); | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_delete_raid_extent(trans, logical, len); | ||||
| 	if (ret) | ||||
| 		test_err("deleting RAID extent [%llu, %llu] failed", logical, | ||||
| 			 logical + len); | ||||
| 
 | ||||
| out: | ||||
| 	btrfs_put_bioc(bioc); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Test a simple 64K RST write on a 2 disk RAID1 at a logical address of 1M. | ||||
|  * The "physical" copy on device 0 is at 1M, on device 1 it is at 1G+1M. | ||||
|  */ | ||||
| static int test_simple_create_delete(struct btrfs_trans_handle *trans) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = trans->fs_info; | ||||
| 	struct btrfs_io_context *bioc; | ||||
| 	struct btrfs_io_stripe io_stripe = { 0 }; | ||||
| 	u64 map_type = RST_TEST_RAID1_TYPE; | ||||
| 	u64 logical = SZ_1M; | ||||
| 	u64 len = SZ_64K; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES); | ||||
| 	if (!bioc) { | ||||
| 		test_std_err(TEST_ALLOC_IO_CONTEXT); | ||||
| 		ret = -ENOMEM; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	bioc->map_type = map_type; | ||||
| 	bioc->size = SZ_64K; | ||||
| 
 | ||||
| 	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { | ||||
| 		struct btrfs_io_stripe *stripe = &bioc->stripes[i]; | ||||
| 
 | ||||
| 		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i); | ||||
| 		if (!stripe->dev) { | ||||
| 			test_err("cannot find device with devid %d", i); | ||||
| 			ret = -EINVAL; | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		stripe->physical = logical + i * SZ_1G; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_insert_one_raid_extent(trans, bioc); | ||||
| 	if (ret) { | ||||
| 		test_err("inserting RAID extent failed: %d", ret); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0); | ||||
| 	if (!io_stripe.dev) { | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe); | ||||
| 	if (ret)  { | ||||
| 		test_err("lookup of RAID extent [%llu, %llu] failed", logical, | ||||
| 			 logical + len); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (io_stripe.physical != logical) { | ||||
| 		test_err("invalid physical address, expected %llu got %llu", | ||||
| 			 logical, io_stripe.physical); | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	if (len != SZ_64K) { | ||||
| 		test_err("invalid stripe length, expected %llu got %llu", | ||||
| 			 (u64)SZ_64K, len); | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_delete_raid_extent(trans, logical, len); | ||||
| 	if (ret) | ||||
| 		test_err("deleting RAID extent [%llu, %llu] failed", logical, | ||||
| 			 logical + len); | ||||
| 
 | ||||
| out: | ||||
| 	btrfs_put_bioc(bioc); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static const test_func_t tests[] = { | ||||
| 	test_simple_create_delete, | ||||
| 	test_create_update_delete, | ||||
| 	test_tail_delete, | ||||
| 	test_front_delete, | ||||
| }; | ||||
| 
 | ||||
| static int run_test(test_func_t test, u32 sectorsize, u32 nodesize) | ||||
| { | ||||
| 	struct btrfs_trans_handle trans; | ||||
| 	struct btrfs_fs_info *fs_info; | ||||
| 	struct btrfs_root *root = NULL; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	fs_info = btrfs_alloc_dummy_fs_info(sectorsize, nodesize); | ||||
| 	if (!fs_info) { | ||||
| 		test_std_err(TEST_ALLOC_FS_INFO); | ||||
| 		ret = -ENOMEM; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	root = btrfs_alloc_dummy_root(fs_info); | ||||
| 	if (IS_ERR(root)) { | ||||
| 		test_std_err(TEST_ALLOC_ROOT); | ||||
| 		ret = PTR_ERR(root); | ||||
| 		goto out; | ||||
| 	} | ||||
| 	btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, | ||||
| 					BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE); | ||||
| 	root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; | ||||
| 	root->root_key.type = BTRFS_ROOT_ITEM_KEY; | ||||
| 	root->root_key.offset = 0; | ||||
| 	fs_info->stripe_root = root; | ||||
| 	root->fs_info->tree_root = root; | ||||
| 
 | ||||
| 	root->node = alloc_test_extent_buffer(root->fs_info, nodesize); | ||||
| 	if (IS_ERR(root->node)) { | ||||
| 		test_std_err(TEST_ALLOC_EXTENT_BUFFER); | ||||
| 		ret = PTR_ERR(root->node); | ||||
| 		goto out; | ||||
| 	} | ||||
| 	btrfs_set_header_level(root->node, 0); | ||||
| 	btrfs_set_header_nritems(root->node, 0); | ||||
| 	root->alloc_bytenr += 2 * nodesize; | ||||
| 
 | ||||
| 	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) { | ||||
| 		struct btrfs_device *dev; | ||||
| 
 | ||||
| 		dev = btrfs_alloc_dummy_device(fs_info); | ||||
| 		if (IS_ERR(dev)) { | ||||
| 			test_err("cannot allocate device"); | ||||
| 			ret = PTR_ERR(dev); | ||||
| 			goto out; | ||||
| 		} | ||||
| 		dev->devid = i; | ||||
| 	} | ||||
| 
 | ||||
| 	btrfs_init_dummy_trans(&trans, root->fs_info); | ||||
| 	ret = test(&trans); | ||||
| 	if (ret) | ||||
| 		goto out; | ||||
| 
 | ||||
| out: | ||||
| 	btrfs_free_dummy_root(root); | ||||
| 	btrfs_free_dummy_fs_info(fs_info); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize) | ||||
| { | ||||
| 	int ret = 0; | ||||
| 
 | ||||
| 	test_msg("running raid-stripe-tree tests"); | ||||
| 	for (int i = 0; i < ARRAY_SIZE(tests); i++) { | ||||
| 		ret = run_test(tests[i], sectorsize, nodesize); | ||||
| 		if (ret) { | ||||
| 			test_err("test-case %ps failed with %d\n", tests[i], ret); | ||||
| 			goto out; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| out: | ||||
| 	return ret; | ||||
| } | ||||
|  | @ -141,8 +141,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) | |||
| 	WARN_ON(refcount_read(&transaction->use_count) == 0); | ||||
| 	if (refcount_dec_and_test(&transaction->use_count)) { | ||||
| 		BUG_ON(!list_empty(&transaction->list)); | ||||
| 		WARN_ON(!RB_EMPTY_ROOT( | ||||
| 				&transaction->delayed_refs.href_root.rb_root)); | ||||
| 		WARN_ON(!xa_empty(&transaction->delayed_refs.head_refs)); | ||||
| 		WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents)); | ||||
| 		if (transaction->delayed_refs.pending_csums) | ||||
| 			btrfs_err(transaction->fs_info, | ||||
|  | @ -349,9 +348,8 @@ loop: | |||
| 
 | ||||
| 	memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); | ||||
| 
 | ||||
| 	cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; | ||||
| 	xa_init(&cur_trans->delayed_refs.head_refs); | ||||
| 	xa_init(&cur_trans->delayed_refs.dirty_extents); | ||||
| 	atomic_set(&cur_trans->delayed_refs.num_entries, 0); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * although the tree mod log is per file system and not per transaction, | ||||
|  | @ -2052,7 +2050,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) | |||
| 
 | ||||
| 	spin_unlock(&fs_info->trans_lock); | ||||
| 
 | ||||
| 	btrfs_cleanup_one_transaction(trans->transaction, fs_info); | ||||
| 	btrfs_cleanup_one_transaction(trans->transaction); | ||||
| 
 | ||||
| 	spin_lock(&fs_info->trans_lock); | ||||
| 	if (cur_trans == fs_info->running_transaction) | ||||
|  |  | |||
|  | @ -33,7 +33,7 @@ struct btrfs_path; | |||
|  */ | ||||
| #define BTRFS_TRANS_DIO_WRITE_STUB	((void *) 1) | ||||
| 
 | ||||
| /* Radix-tree tag for roots that are part of the trasaction. */ | ||||
| /* Radix-tree tag for roots that are part of the transaction. */ | ||||
| #define BTRFS_ROOT_TRANS_TAG			0 | ||||
| 
 | ||||
| enum btrfs_trans_state { | ||||
|  |  | |||
|  | @ -2183,8 +2183,8 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) | |||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| int btrfs_verify_level_key(struct extent_buffer *eb, int level, | ||||
| 			   struct btrfs_key *first_key, u64 parent_transid) | ||||
| int btrfs_verify_level_key(struct extent_buffer *eb, | ||||
| 			   const struct btrfs_tree_parent_check *check) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = eb->fs_info; | ||||
| 	int found_level; | ||||
|  | @ -2192,16 +2192,16 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, | |||
| 	int ret; | ||||
| 
 | ||||
| 	found_level = btrfs_header_level(eb); | ||||
| 	if (found_level != level) { | ||||
| 	if (found_level != check->level) { | ||||
| 		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), | ||||
| 		     KERN_ERR "BTRFS: tree level check failed\n"); | ||||
| 		btrfs_err(fs_info, | ||||
| "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", | ||||
| 			  eb->start, level, found_level); | ||||
| 			  eb->start, check->level, found_level); | ||||
| 		return -EIO; | ||||
| 	} | ||||
| 
 | ||||
| 	if (!first_key) | ||||
| 	if (!check->has_first_key) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	/*
 | ||||
|  | @ -2226,15 +2226,15 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, | |||
| 		btrfs_node_key_to_cpu(eb, &found_key, 0); | ||||
| 	else | ||||
| 		btrfs_item_key_to_cpu(eb, &found_key, 0); | ||||
| 	ret = btrfs_comp_cpu_keys(first_key, &found_key); | ||||
| 	ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); | ||||
| 
 | ||||
| 	if (ret) { | ||||
| 		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), | ||||
| 		     KERN_ERR "BTRFS: tree first key check failed\n"); | ||||
| 		btrfs_err(fs_info, | ||||
| "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", | ||||
| 			  eb->start, parent_transid, first_key->objectid, | ||||
| 			  first_key->type, first_key->offset, | ||||
| 			  eb->start, check->transid, check->first_key.objectid, | ||||
| 			  check->first_key.type, check->first_key.offset, | ||||
| 			  found_key.objectid, found_key.type, | ||||
| 			  found_key.offset); | ||||
| 	} | ||||
|  |  | |||
|  | @ -69,7 +69,7 @@ int btrfs_check_node(struct extent_buffer *node); | |||
| int btrfs_check_chunk_valid(struct extent_buffer *leaf, | ||||
| 			    struct btrfs_chunk *chunk, u64 logical); | ||||
| int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); | ||||
| int btrfs_verify_level_key(struct extent_buffer *eb, int level, | ||||
| 			   struct btrfs_key *first_key, u64 parent_transid); | ||||
| int btrfs_verify_level_key(struct extent_buffer *eb, | ||||
| 			   const struct btrfs_tree_parent_check *check); | ||||
| 
 | ||||
| #endif | ||||
|  |  | |||
|  | @ -6204,7 +6204,6 @@ static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, | |||
| static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, | ||||
| 					struct btrfs_inode *inode, | ||||
| 					struct btrfs_path *path, | ||||
| 					struct btrfs_log_ctx *ctx, | ||||
| 					const struct list_head *delayed_del_list, | ||||
| 					const struct btrfs_delayed_item *first, | ||||
| 					const struct btrfs_delayed_item **last_ret) | ||||
|  | @ -6265,7 +6264,7 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, | |||
| 		if (ret < 0) { | ||||
| 			return ret; | ||||
| 		} else if (ret == 0) { | ||||
| 			ret = batch_delete_dir_index_items(trans, inode, path, ctx, | ||||
| 			ret = batch_delete_dir_index_items(trans, inode, path, | ||||
| 							   delayed_del_list, curr, | ||||
| 							   &last); | ||||
| 			if (ret) | ||||
|  |  | |||
|  | @ -909,7 +909,6 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, | |||
|  * is freed (its refcount is decremented). | ||||
|  */ | ||||
| struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, | ||||
| 						struct btrfs_path *path, | ||||
| 						struct extent_buffer *eb, | ||||
| 						u64 time_seq) | ||||
| { | ||||
|  |  | |||
|  | @ -41,7 +41,6 @@ int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, | |||
| 				  enum btrfs_mod_log_op op); | ||||
| int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); | ||||
| struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, | ||||
| 						struct btrfs_path *path, | ||||
| 						struct extent_buffer *eb, | ||||
| 						u64 time_seq); | ||||
| struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); | ||||
|  |  | |||
|  | @ -732,6 +732,114 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) | |||
| 	return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * We can have very weird soft links passed in. | ||||
|  * One example is "/proc/self/fd/<fd>", which can be a soft link to | ||||
|  * a block device. | ||||
|  * | ||||
|  * But it's never a good idea to use those weird names. | ||||
|  * Here we check if the path (not following symlinks) is a good one inside | ||||
|  * "/dev/". | ||||
|  */ | ||||
| static bool is_good_dev_path(const char *dev_path) | ||||
| { | ||||
| 	struct path path = { .mnt = NULL, .dentry = NULL }; | ||||
| 	char *path_buf = NULL; | ||||
| 	char *resolved_path; | ||||
| 	bool is_good = false; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	if (!dev_path) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	path_buf = kmalloc(PATH_MAX, GFP_KERNEL); | ||||
| 	if (!path_buf) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Do not follow soft link, just check if the original path is inside | ||||
| 	 * "/dev/". | ||||
| 	 */ | ||||
| 	ret = kern_path(dev_path, 0, &path); | ||||
| 	if (ret) | ||||
| 		goto out; | ||||
| 	resolved_path = d_path(&path, path_buf, PATH_MAX); | ||||
| 	if (IS_ERR(resolved_path)) | ||||
| 		goto out; | ||||
| 	if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) | ||||
| 		goto out; | ||||
| 	is_good = true; | ||||
| out: | ||||
| 	kfree(path_buf); | ||||
| 	path_put(&path); | ||||
| 	return is_good; | ||||
| } | ||||
| 
 | ||||
| static int get_canonical_dev_path(const char *dev_path, char *canonical) | ||||
| { | ||||
| 	struct path path = { .mnt = NULL, .dentry = NULL }; | ||||
| 	char *path_buf = NULL; | ||||
| 	char *resolved_path; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	if (!dev_path) { | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	path_buf = kmalloc(PATH_MAX, GFP_KERNEL); | ||||
| 	if (!path_buf) { | ||||
| 		ret = -ENOMEM; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); | ||||
| 	if (ret) | ||||
| 		goto out; | ||||
| 	resolved_path = d_path(&path, path_buf, PATH_MAX); | ||||
| 	ret = strscpy(canonical, resolved_path, PATH_MAX); | ||||
| out: | ||||
| 	kfree(path_buf); | ||||
| 	path_put(&path); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static bool is_same_device(struct btrfs_device *device, const char *new_path) | ||||
| { | ||||
| 	struct path old = { .mnt = NULL, .dentry = NULL }; | ||||
| 	struct path new = { .mnt = NULL, .dentry = NULL }; | ||||
| 	char *old_path = NULL; | ||||
| 	bool is_same = false; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	if (!device->name) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	old_path = kzalloc(PATH_MAX, GFP_NOFS); | ||||
| 	if (!old_path) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	rcu_read_lock(); | ||||
| 	ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX); | ||||
| 	rcu_read_unlock(); | ||||
| 	if (ret < 0) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	ret = kern_path(old_path, LOOKUP_FOLLOW, &old); | ||||
| 	if (ret) | ||||
| 		goto out; | ||||
| 	ret = kern_path(new_path, LOOKUP_FOLLOW, &new); | ||||
| 	if (ret) | ||||
| 		goto out; | ||||
| 	if (path_equal(&old, &new)) | ||||
| 		is_same = true; | ||||
| out: | ||||
| 	kfree(old_path); | ||||
| 	path_put(&old); | ||||
| 	path_put(&new); | ||||
| 	return is_same; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Add new device to list of registered devices | ||||
|  * | ||||
|  | @ -852,7 +960,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, | |||
| 				MAJOR(path_devt), MINOR(path_devt), | ||||
| 				current->comm, task_pid_nr(current)); | ||||
| 
 | ||||
| 	} else if (!device->name || strcmp(device->name->str, path)) { | ||||
| 	} else if (!device->name || !is_same_device(device, path)) { | ||||
| 		/*
 | ||||
| 		 * When FS is already mounted. | ||||
| 		 * 1. If you are here and if the device->name is NULL that | ||||
|  | @ -1383,12 +1491,23 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, | |||
| 	bool new_device_added = false; | ||||
| 	struct btrfs_device *device = NULL; | ||||
| 	struct file *bdev_file; | ||||
| 	char *canonical_path = NULL; | ||||
| 	u64 bytenr; | ||||
| 	dev_t devt; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	lockdep_assert_held(&uuid_mutex); | ||||
| 
 | ||||
| 	if (!is_good_dev_path(path)) { | ||||
| 		canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); | ||||
| 		if (canonical_path) { | ||||
| 			ret = get_canonical_dev_path(path, canonical_path); | ||||
| 			if (ret < 0) { | ||||
| 				kfree(canonical_path); | ||||
| 				canonical_path = NULL; | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 	/*
 | ||||
| 	 * Avoid an exclusive open here, as the systemd-udev may initiate the | ||||
| 	 * device scan which may race with the user's mount or mkfs command, | ||||
|  | @ -1433,7 +1552,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, | |||
| 		goto free_disk_super; | ||||
| 	} | ||||
| 
 | ||||
| 	device = device_list_add(path, disk_super, &new_device_added); | ||||
| 	device = device_list_add(canonical_path ? : path, disk_super, | ||||
| 				 &new_device_added); | ||||
| 	if (!IS_ERR(device) && new_device_added) | ||||
| 		btrfs_free_stale_devices(device->devt, device); | ||||
| 
 | ||||
|  | @ -1442,6 +1562,7 @@ free_disk_super: | |||
| 
 | ||||
| error_bdev_put: | ||||
| 	fput(bdev_file); | ||||
| 	kfree(canonical_path); | ||||
| 
 | ||||
| 	return device; | ||||
| } | ||||
|  | @ -2721,8 +2842,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path | |||
| 	set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE); | ||||
| 
 | ||||
| 	if (seeding_dev) { | ||||
| 		btrfs_clear_sb_rdonly(sb); | ||||
| 
 | ||||
| 		/* GFP_KERNEL allocation must not be under device_list_mutex */ | ||||
| 		seed_devices = btrfs_init_sprout(fs_info); | ||||
| 		if (IS_ERR(seed_devices)) { | ||||
|  | @ -2865,8 +2984,6 @@ error_sysfs: | |||
| 	mutex_unlock(&fs_info->chunk_mutex); | ||||
| 	mutex_unlock(&fs_info->fs_devices->device_list_mutex); | ||||
| error_trans: | ||||
| 	if (seeding_dev) | ||||
| 		btrfs_set_sb_rdonly(sb); | ||||
| 	if (trans) | ||||
| 		btrfs_end_transaction(trans); | ||||
| error_free_zone: | ||||
|  | @ -5310,7 +5427,7 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, | |||
| 	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; | ||||
| 	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; | ||||
| 
 | ||||
| 	/* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ | ||||
| 	/* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */ | ||||
| 	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { | ||||
| 		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, | ||||
| 					     ctl->stripe_size) + ctl->nparity, | ||||
|  | @ -5842,24 +5959,6 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, | |||
| 	return len; | ||||
| } | ||||
| 
 | ||||
| int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) | ||||
| { | ||||
| 	struct btrfs_chunk_map *map; | ||||
| 	int ret = 0; | ||||
| 
 | ||||
| 	if (!btrfs_fs_incompat(fs_info, RAID56)) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	map = btrfs_get_chunk_map(fs_info, logical, len); | ||||
| 
 | ||||
| 	if (!WARN_ON(IS_ERR(map))) { | ||||
| 		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) | ||||
| 			ret = 1; | ||||
| 		btrfs_free_chunk_map(map); | ||||
| 	} | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static int find_live_mirror(struct btrfs_fs_info *fs_info, | ||||
| 			    struct btrfs_chunk_map *map, int first, | ||||
| 			    int dev_replace_is_ongoing) | ||||
|  | @ -5920,9 +6019,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, | |||
| 	return preferred_mirror; | ||||
| } | ||||
| 
 | ||||
| static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, | ||||
| 						       u64 logical, | ||||
| 						       u16 total_stripes) | ||||
| EXPORT_FOR_TESTS | ||||
| struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, | ||||
| 						u64 logical, u16 total_stripes) | ||||
| { | ||||
| 	struct btrfs_io_context *bioc; | ||||
| 
 | ||||
|  | @ -6481,13 +6580,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, | |||
| 	max_len = btrfs_max_io_len(map, map_offset, &io_geom); | ||||
| 	*length = min_t(u64, map->chunk_len - map_offset, max_len); | ||||
| 
 | ||||
| 	down_read(&dev_replace->rwsem); | ||||
| 	if (dev_replace->replace_task != current) | ||||
| 		down_read(&dev_replace->rwsem); | ||||
| 
 | ||||
| 	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); | ||||
| 	/*
 | ||||
| 	 * Hold the semaphore for read during the whole operation, write is | ||||
| 	 * requested at commit time but must wait. | ||||
| 	 */ | ||||
| 	if (!dev_replace_is_ongoing) | ||||
| 	if (!dev_replace_is_ongoing && dev_replace->replace_task != current) | ||||
| 		up_read(&dev_replace->rwsem); | ||||
| 
 | ||||
| 	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { | ||||
|  | @ -6627,7 +6728,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, | |||
| 	bioc->mirror_num = io_geom.mirror_num; | ||||
| 
 | ||||
| out: | ||||
| 	if (dev_replace_is_ongoing) { | ||||
| 	if (dev_replace_is_ongoing && dev_replace->replace_task != current) { | ||||
| 		lockdep_assert_held(&dev_replace->rwsem); | ||||
| 		/* Unlock and let waiting writers proceed */ | ||||
| 		up_read(&dev_replace->rwsem); | ||||
|  |  | |||
|  | @ -306,7 +306,7 @@ enum btrfs_read_policy { | |||
| 	BTRFS_NR_READ_POLICY, | ||||
| }; | ||||
| 
 | ||||
| #ifdef CONFIG_BTRFS_DEBUG | ||||
| #ifdef CONFIG_BTRFS_EXPERIMENTAL | ||||
| /*
 | ||||
|  * Checksum mode - offload it to workqueues or do it synchronously in | ||||
|  * btrfs_submit_chunk(). | ||||
|  | @ -430,7 +430,7 @@ struct btrfs_fs_devices { | |||
| 	/* Policy used to read the mirrored stripes. */ | ||||
| 	enum btrfs_read_policy read_policy; | ||||
| 
 | ||||
| #ifdef CONFIG_BTRFS_DEBUG | ||||
| #ifdef CONFIG_BTRFS_EXPERIMENTAL | ||||
| 	/* Checksum mode - offload it or do it synchronously. */ | ||||
| 	enum btrfs_offload_csum_mode offload_csum_mode; | ||||
| #endif | ||||
|  | @ -741,8 +741,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); | |||
| void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); | ||||
| void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); | ||||
| void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); | ||||
| int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, | ||||
| 			   u64 logical, u64 len); | ||||
| unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, | ||||
| 				    u64 logical); | ||||
| u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map); | ||||
|  | @ -840,4 +838,9 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); | |||
| bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); | ||||
| const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); | ||||
| 
 | ||||
| #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS | ||||
| struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, | ||||
| 						u64 logical, u16 total_stripes); | ||||
| #endif | ||||
| 
 | ||||
| #endif | ||||
|  |  | |||
|  | @ -85,7 +85,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, | |||
| { | ||||
| 	struct btrfs_dir_item *di = NULL; | ||||
| 	struct btrfs_root *root = BTRFS_I(inode)->root; | ||||
| 	struct btrfs_fs_info *fs_info = root->fs_info; | ||||
| 	struct btrfs_path *path; | ||||
| 	size_t name_len = strlen(name); | ||||
| 	int ret = 0; | ||||
|  | @ -143,14 +142,14 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, | |||
| 		 */ | ||||
| 		ret = 0; | ||||
| 		btrfs_assert_tree_write_locked(path->nodes[0]); | ||||
| 		di = btrfs_match_dir_item_name(fs_info, path, name, name_len); | ||||
| 		di = btrfs_match_dir_item_name(path, name, name_len); | ||||
| 		if (!di && !(flags & XATTR_REPLACE)) { | ||||
| 			ret = -ENOSPC; | ||||
| 			goto out; | ||||
| 		} | ||||
| 	} else if (ret == -EEXIST) { | ||||
| 		ret = 0; | ||||
| 		di = btrfs_match_dir_item_name(fs_info, path, name, name_len); | ||||
| 		di = btrfs_match_dir_item_name(path, name, name_len); | ||||
| 		ASSERT(di); /* logic error */ | ||||
| 	} else if (ret) { | ||||
| 		goto out; | ||||
|  |  | |||
|  | @ -194,7 +194,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, | |||
| 				pg_off = offset_in_page(start); | ||||
| 				cur_len = btrfs_calc_input_length(orig_end, start); | ||||
| 				data_in = kmap_local_folio(in_folio, pg_off); | ||||
| 				start += PAGE_SIZE; | ||||
| 				start += cur_len; | ||||
| 				workspace->strm.next_in = data_in; | ||||
| 				workspace->strm.avail_in = cur_len; | ||||
| 			} | ||||
|  |  | |||
|  | @ -1739,7 +1739,7 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) | |||
| 		return false; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the | ||||
| 	 * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the | ||||
| 	 * extent layout the relocation code has. | ||||
| 	 * Furthermore we have set aside own block-group from which only the | ||||
| 	 * relocation "process" can allocate and make sure only one process at a | ||||
|  | @ -1973,7 +1973,7 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, | |||
| 	if (block_group->meta_write_pointer > eb->start) | ||||
| 		return -EBUSY; | ||||
| 
 | ||||
| 	/* If for_sync, this hole will be filled with trasnsaction commit. */ | ||||
| 	/* If for_sync, this hole will be filled with transaction commit. */ | ||||
| 	if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) | ||||
| 		return -EAGAIN; | ||||
| 	return -EBUSY; | ||||
|  |  | |||
|  | @ -111,6 +111,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) | |||
| 	unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; | ||||
| 	struct list_head *pos, *next; | ||||
| 
 | ||||
| 	ASSERT(timer == &wsm.timer); | ||||
| 
 | ||||
| 	spin_lock(&wsm.lock); | ||||
| 
 | ||||
| 	if (list_empty(&wsm.lru_list)) { | ||||
|  | @ -495,7 +497,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, | |||
| 
 | ||||
| 		/* Check if we need more input */ | ||||
| 		if (workspace->in_buf.pos == workspace->in_buf.size) { | ||||
| 			tot_in += PAGE_SIZE; | ||||
| 			tot_in += workspace->in_buf.size; | ||||
| 			kunmap_local(workspace->in_buf.src); | ||||
| 			workspace->in_buf.src = NULL; | ||||
| 			folio_put(in_folio); | ||||
|  |  | |||
|  | @ -37,6 +37,7 @@ enum io_uring_cmd_flags { | |||
| 	/* set when uring wants to cancel a previously issued command */ | ||||
| 	IO_URING_F_CANCEL		= (1 << 11), | ||||
| 	IO_URING_F_COMPAT		= (1 << 12), | ||||
| 	IO_URING_F_TASK_DEAD		= (1 << 13), | ||||
| }; | ||||
| 
 | ||||
| struct io_wq_work_node { | ||||
|  |  | |||
|  | @ -1706,9 +1706,10 @@ DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data, | |||
| 
 | ||||
| DECLARE_EVENT_CLASS(btrfs_qgroup_extent, | ||||
| 	TP_PROTO(const struct btrfs_fs_info *fs_info, | ||||
| 		 const struct btrfs_qgroup_extent_record *rec), | ||||
| 		 const struct btrfs_qgroup_extent_record *rec, | ||||
| 		 u64 bytenr), | ||||
| 
 | ||||
| 	TP_ARGS(fs_info, rec), | ||||
| 	TP_ARGS(fs_info, rec, bytenr), | ||||
| 
 | ||||
| 	TP_STRUCT__entry_btrfs( | ||||
| 		__field(	u64,  bytenr		) | ||||
|  | @ -1716,7 +1717,7 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent, | |||
| 	), | ||||
| 
 | ||||
| 	TP_fast_assign_btrfs(fs_info, | ||||
| 		__entry->bytenr		= rec->bytenr; | ||||
| 		__entry->bytenr		= bytenr; | ||||
| 		__entry->num_bytes	= rec->num_bytes; | ||||
| 	), | ||||
| 
 | ||||
|  | @ -1727,17 +1728,19 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent, | |||
| DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents, | ||||
| 
 | ||||
| 	TP_PROTO(const struct btrfs_fs_info *fs_info, | ||||
| 		 const struct btrfs_qgroup_extent_record *rec), | ||||
| 		 const struct btrfs_qgroup_extent_record *rec, | ||||
| 		 u64 bytenr), | ||||
| 
 | ||||
| 	TP_ARGS(fs_info, rec) | ||||
| 	TP_ARGS(fs_info, rec, bytenr) | ||||
| ); | ||||
| 
 | ||||
| DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent, | ||||
| 
 | ||||
| 	TP_PROTO(const struct btrfs_fs_info *fs_info, | ||||
| 		 const struct btrfs_qgroup_extent_record *rec), | ||||
| 		 const struct btrfs_qgroup_extent_record *rec, | ||||
| 		 u64 bytenr), | ||||
| 
 | ||||
| 	TP_ARGS(fs_info, rec) | ||||
| 	TP_ARGS(fs_info, rec, bytenr) | ||||
| ); | ||||
| 
 | ||||
| TRACE_EVENT(qgroup_num_dirty_extents, | ||||
|  | @ -2341,7 +2344,6 @@ DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_unlock_blocking); | |||
| DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_read); | ||||
| DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_write); | ||||
| DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_read_lock); | ||||
| DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_write_lock); | ||||
| DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_lock_atomic); | ||||
| 
 | ||||
| DECLARE_EVENT_CLASS(btrfs__space_info_update, | ||||
|  | @ -2553,10 +2555,9 @@ TRACE_EVENT(btrfs_extent_map_shrinker_count, | |||
| 
 | ||||
| TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, | ||||
| 
 | ||||
| 	TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr, | ||||
| 		 u64 last_root_id, u64 last_ino), | ||||
| 	TP_PROTO(const struct btrfs_fs_info *fs_info, long nr), | ||||
| 
 | ||||
| 	TP_ARGS(fs_info, nr_to_scan, nr, last_root_id, last_ino), | ||||
| 	TP_ARGS(fs_info, nr), | ||||
| 
 | ||||
| 	TP_STRUCT__entry_btrfs( | ||||
| 		__field(	long,	nr_to_scan	) | ||||
|  | @ -2566,10 +2567,11 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, | |||
| 	), | ||||
| 
 | ||||
| 	TP_fast_assign_btrfs(fs_info, | ||||
| 		__entry->nr_to_scan	= nr_to_scan; | ||||
| 		__entry->nr_to_scan	= \ | ||||
| 		     atomic64_read(&fs_info->em_shrinker_nr_to_scan); | ||||
| 		__entry->nr		= nr; | ||||
| 		__entry->last_root_id	= last_root_id; | ||||
| 		__entry->last_ino	= last_ino; | ||||
| 		__entry->last_root_id	= fs_info->em_shrinker_last_root; | ||||
| 		__entry->last_ino	= fs_info->em_shrinker_last_ino; | ||||
| 	), | ||||
| 
 | ||||
| 	TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", | ||||
|  | @ -2579,10 +2581,9 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, | |||
| 
 | ||||
| TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, | ||||
| 
 | ||||
| 	TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr, | ||||
| 		 u64 last_root_id, u64 last_ino), | ||||
| 	TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr), | ||||
| 
 | ||||
| 	TP_ARGS(fs_info, nr_dropped, nr, last_root_id, last_ino), | ||||
| 	TP_ARGS(fs_info, nr_dropped, nr), | ||||
| 
 | ||||
| 	TP_STRUCT__entry_btrfs( | ||||
| 		__field(	long,	nr_dropped	) | ||||
|  | @ -2594,8 +2595,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, | |||
| 	TP_fast_assign_btrfs(fs_info, | ||||
| 		__entry->nr_dropped	= nr_dropped; | ||||
| 		__entry->nr		= nr; | ||||
| 		__entry->last_root_id	= last_root_id; | ||||
| 		__entry->last_ino	= last_ino; | ||||
| 		__entry->last_root_id	= fs_info->em_shrinker_last_root; | ||||
| 		__entry->last_ino	= fs_info->em_shrinker_last_ino; | ||||
| 	), | ||||
| 
 | ||||
| 	TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", | ||||
|  |  | |||
|  | @ -1049,6 +1049,29 @@ struct btrfs_ioctl_encoded_io_args { | |||
| #define BTRFS_ENCODED_IO_ENCRYPTION_NONE 0 | ||||
| #define BTRFS_ENCODED_IO_ENCRYPTION_TYPES 1 | ||||
| 
 | ||||
| /*
 | ||||
|  * Wait for subvolume cleaning process. This queries the kernel queue and it | ||||
|  * can change between the calls. | ||||
|  * | ||||
|  * - FOR_ONE	- specify the subvolid | ||||
|  * - FOR_QUEUED - wait for all currently queued | ||||
|  * - COUNT	- count number of queued | ||||
|  * - PEEK_FIRST - read the first subvolid in the queue (to be cleaned or being | ||||
|  * 		  cleaned already), or 0 if the queue is empty | ||||
|  * - PEEK_LAST  - read the last subvolid in the queue, or 0 if the queue is empty | ||||
|  */ | ||||
| struct btrfs_ioctl_subvol_wait { | ||||
| 	__u64 subvolid; | ||||
| 	__u32 mode; | ||||
| 	__u32 count; | ||||
| }; | ||||
| 
 | ||||
| #define BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE		(0) | ||||
| #define BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED	(1) | ||||
| #define BTRFS_SUBVOL_SYNC_COUNT			(2) | ||||
| #define BTRFS_SUBVOL_SYNC_PEEK_FIRST		(3) | ||||
| #define BTRFS_SUBVOL_SYNC_PEEK_LAST		(4) | ||||
| 
 | ||||
| /* Error codes as returned by the kernel */ | ||||
| enum btrfs_err_code { | ||||
| 	BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1, | ||||
|  | @ -1181,6 +1204,8 @@ enum btrfs_err_code { | |||
| 				    struct btrfs_ioctl_encoded_io_args) | ||||
| #define BTRFS_IOC_ENCODED_WRITE _IOW(BTRFS_IOCTL_MAGIC, 64, \ | ||||
| 				     struct btrfs_ioctl_encoded_io_args) | ||||
| #define BTRFS_IOC_SUBVOL_SYNC_WAIT _IOW(BTRFS_IOCTL_MAGIC, 65, \ | ||||
| 					struct btrfs_ioctl_subvol_wait) | ||||
| 
 | ||||
| #ifdef __cplusplus | ||||
| } | ||||
|  |  | |||
|  | @ -119,9 +119,13 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); | |||
| static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) | ||||
| { | ||||
| 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); | ||||
| 	unsigned int flags = IO_URING_F_COMPLETE_DEFER; | ||||
| 
 | ||||
| 	if (current->flags & (PF_EXITING | PF_KTHREAD)) | ||||
| 		flags |= IO_URING_F_TASK_DEAD; | ||||
| 
 | ||||
| 	/* task_work executor checks the deferred list completion */ | ||||
| 	ioucmd->task_work_cb(ioucmd, IO_URING_F_COMPLETE_DEFER); | ||||
| 	ioucmd->task_work_cb(ioucmd, flags); | ||||
| } | ||||
| 
 | ||||
| void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, | ||||
|  |  | |||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 Linus Torvalds
						Linus Torvalds