mirror of
				git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
				synced 2025-09-18 22:14:16 +00:00 
			
		
		
		
	for-5.20/io_uring-buffered-writes-2022-07-29
-----BEGIN PGP SIGNATURE----- iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmLkm7UQHGF4Ym9lQGtl cm5lbC5kawAKCRD301j7KXHgpldTEADTg/96R+eq78UZBNZmdifY9/qwQD+kzNiK ACDoYZFSbWUjMeOWqRxbYr6mXBKHnGHyTGlraTTpLDzhpB1xwoWfgOK9uOYXW/Ik eWfgTujPW/8v/l/z86khE+GH9b/maGCRqNZgS6uLVLzhxG6oCkoYTyOh1iHaF1VM Rma4nbJ8GSEDtiXNDl0Bznnyks/pzwoz/9slwzZ7PxtFwZsBxKuxgMUR5HIXdRp7 5iUoFJhZrGWyi/dbQZUsK/9VYVVnKkcBCz2pb4GEmC+3dS/vlPEoeWUpPHInNyd1 9NB9v8c+KFmQaWnCxuxcdHvCfmRRQrX8Pr8/OBNZKO6McYrKWKA+lurp4EGClE3m cZdK+P/9FS/Eeua8hum9UnbPAqsJPqLTbpbrySeBdd4iFA6u7rRqDX2+nz3PNe9U 1b7V1bWBIEY/Rsw/PKo59oIeV0auD8v9OCHJ0lF2pv6dRln2/W0y1Qfd1DI18xFG +9bBnQzhF7R0O8UP5ApVayQCYrd906YsSVUOqAiLmUs/BoOgRq6g/0BqSOVVKE2u 5iq8zTsVMkxY0ZpExwZST/700JwkPIV4SVPEYRC6QssFTcylvlisIek6XYSS9HX4 Z6gzMwJW1H47bEfG4JolTI8uBjp0hQLCPX0O0XFLVnbHQwN0kjIBmv3axAwJO2NV qrrHXjf09w== =hV7G -----END PGP SIGNATURE----- Merge tag 'for-5.20/io_uring-buffered-writes-2022-07-29' of git://git.kernel.dk/linux-block Pull io_uring buffered writes support from Jens Axboe: "This contains support for buffered writes, specifically for XFS. btrfs is in progress, will be coming in the next release. io_uring does support buffered writes on any file type, but since the buffered write path just always -EAGAIN (or -EOPNOTSUPP) any attempt to do so if IOCB_NOWAIT is set, any buffered write will effectively be handled by io-wq offload. This isn't very efficient, and we even have specific code in io-wq to serialize buffered writes to the same inode to avoid further inefficiencies with thread offload. This is particularly sad since most buffered writes don't block, they simply copy data to a page and dirty it. With this pull request, we can handle buffered writes a lot more efficiently. If balance_dirty_pages() needs to block, we back off on writes as indicated. This improves buffered write support by 2-3x. 
Jan Kara helped with the mm bits for this, and Stefan handled the fs/iomap/xfs/io_uring parts of it" * tag 'for-5.20/io_uring-buffered-writes-2022-07-29' of git://git.kernel.dk/linux-block: mm: honor FGP_NOWAIT for page cache page allocation xfs: Add async buffered write support xfs: Specify lockmode when calling xfs_ilock_for_iomap() io_uring: Add tracepoint for short writes io_uring: fix issue with io_write() not always undoing sb_start_write() io_uring: Add support for async buffered writes fs: Add async write file modification handling. fs: Split off inode_needs_update_time and __file_update_time fs: add __remove_file_privs() with flags parameter fs: add a FMODE_BUF_WASYNC flags for f_mode iomap: Return -EAGAIN from iomap_write_iter() iomap: Add async buffered write support iomap: Add flags parameter to iomap_page_create() mm: Add balance_dirty_pages_ratelimited_flags() function mm: Move updates of dirty_exceeded into one place mm: Move starting of background writeback into the main balancing loop
This commit is contained in:
		
						commit
						98e2474640
					
				
					 11 changed files with 324 additions and 109 deletions
				
			
		
							
								
								
									
										170
									
								
								fs/inode.c
									
										
									
									
									
								
							
							
						
						
									
										170
									
								
								fs/inode.c
									
										
									
									
									
								
							|  | @ -2010,67 +2010,57 @@ static int __remove_privs(struct user_namespace *mnt_userns, | |||
| 	return notify_change(mnt_userns, dentry, &newattrs, NULL); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Remove special file privileges (suid, capabilities) when file is written | ||||
|  * to or truncated. | ||||
|  */ | ||||
| int file_remove_privs(struct file *file) | ||||
| static int __file_remove_privs(struct file *file, unsigned int flags) | ||||
| { | ||||
| 	struct dentry *dentry = file_dentry(file); | ||||
| 	struct inode *inode = file_inode(file); | ||||
| 	int error; | ||||
| 	int kill; | ||||
| 	int error = 0; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Fast path for nothing security related. | ||||
| 	 * As well for non-regular files, e.g. blkdev inodes. | ||||
| 	 * For example, blkdev_write_iter() might get here | ||||
| 	 * trying to remove privs which it is not allowed to. | ||||
| 	 */ | ||||
| 	if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	kill = dentry_needs_remove_privs(dentry); | ||||
| 	if (kill < 0) | ||||
| 	if (kill <= 0) | ||||
| 		return kill; | ||||
| 	if (kill) | ||||
| 		error = __remove_privs(file_mnt_user_ns(file), dentry, kill); | ||||
| 
 | ||||
| 	if (flags & IOCB_NOWAIT) | ||||
| 		return -EAGAIN; | ||||
| 
 | ||||
| 	error = __remove_privs(file_mnt_user_ns(file), dentry, kill); | ||||
| 	if (!error) | ||||
| 		inode_has_no_xattr(inode); | ||||
| 
 | ||||
| 	return error; | ||||
| } | ||||
| EXPORT_SYMBOL(file_remove_privs); | ||||
| 
 | ||||
| /**
 | ||||
|  *	file_update_time	-	update mtime and ctime time | ||||
|  *	@file: file accessed | ||||
|  * file_remove_privs - remove special file privileges (suid, capabilities) | ||||
|  * @file: file to remove privileges from | ||||
|  * | ||||
|  *	Update the mtime and ctime members of an inode and mark the inode | ||||
|  *	for writeback.  Note that this function is meant exclusively for | ||||
|  *	usage in the file write path of filesystems, and filesystems may | ||||
|  *	choose to explicitly ignore update via this function with the | ||||
|  *	S_NOCMTIME inode flag, e.g. for network filesystem where these | ||||
|  *	timestamps are handled by the server.  This can return an error for | ||||
|  *	file systems who need to allocate space in order to update an inode. | ||||
|  * When file is modified by a write or truncation ensure that special | ||||
|  * file privileges are removed. | ||||
|  * | ||||
|  * Return: 0 on success, negative errno on failure. | ||||
|  */ | ||||
| 
 | ||||
| int file_update_time(struct file *file) | ||||
| int file_remove_privs(struct file *file) | ||||
| { | ||||
| 	return __file_remove_privs(file, 0); | ||||
| } | ||||
| EXPORT_SYMBOL(file_remove_privs); | ||||
| 
 | ||||
| static int inode_needs_update_time(struct inode *inode, struct timespec64 *now) | ||||
| { | ||||
| 	struct inode *inode = file_inode(file); | ||||
| 	struct timespec64 now; | ||||
| 	int sync_it = 0; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	/* First try to exhaust all avenues to not sync */ | ||||
| 	if (IS_NOCMTIME(inode)) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	now = current_time(inode); | ||||
| 	if (!timespec64_equal(&inode->i_mtime, &now)) | ||||
| 	if (!timespec64_equal(&inode->i_mtime, now)) | ||||
| 		sync_it = S_MTIME; | ||||
| 
 | ||||
| 	if (!timespec64_equal(&inode->i_ctime, &now)) | ||||
| 	if (!timespec64_equal(&inode->i_ctime, now)) | ||||
| 		sync_it |= S_CTIME; | ||||
| 
 | ||||
| 	if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) | ||||
|  | @ -2079,37 +2069,127 @@ int file_update_time(struct file *file) | |||
| 	if (!sync_it) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	/* Finally allowed to write? Takes lock. */ | ||||
| 	if (__mnt_want_write_file(file)) | ||||
| 		return 0; | ||||
| 	return sync_it; | ||||
| } | ||||
| 
 | ||||
| 	ret = inode_update_time(inode, &now, sync_it); | ||||
| 	__mnt_drop_write_file(file); | ||||
| static int __file_update_time(struct file *file, struct timespec64 *now, | ||||
| 			int sync_mode) | ||||
| { | ||||
| 	int ret = 0; | ||||
| 	struct inode *inode = file_inode(file); | ||||
| 
 | ||||
| 	/* try to update time settings */ | ||||
| 	if (!__mnt_want_write_file(file)) { | ||||
| 		ret = inode_update_time(inode, now, sync_mode); | ||||
| 		__mnt_drop_write_file(file); | ||||
| 	} | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * file_update_time - update mtime and ctime time | ||||
|  * @file: file accessed | ||||
|  * | ||||
|  * Update the mtime and ctime members of an inode and mark the inode for | ||||
|  * writeback. Note that this function is meant exclusively for usage in | ||||
|  * the file write path of filesystems, and filesystems may choose to | ||||
|  * explicitly ignore updates via this function with the S_NOCMTIME inode | ||||
|  * flag, e.g. for network filesystem where these timestamps are handled | ||||
|  * by the server. This can return an error for file systems who need to | ||||
|  * allocate space in order to update an inode. | ||||
|  * | ||||
|  * Return: 0 on success, negative errno on failure. | ||||
|  */ | ||||
| int file_update_time(struct file *file) | ||||
| { | ||||
| 	int ret; | ||||
| 	struct inode *inode = file_inode(file); | ||||
| 	struct timespec64 now = current_time(inode); | ||||
| 
 | ||||
| 	ret = inode_needs_update_time(inode, &now); | ||||
| 	if (ret <= 0) | ||||
| 		return ret; | ||||
| 
 | ||||
| 	return __file_update_time(file, &now, ret); | ||||
| } | ||||
| EXPORT_SYMBOL(file_update_time); | ||||
| 
 | ||||
| /* Caller must hold the file's inode lock */ | ||||
| int file_modified(struct file *file) | ||||
| /**
 | ||||
|  * file_modified_flags - handle mandated vfs changes when modifying a file | ||||
|  * @file: file that was modified | ||||
|  * @flags: kiocb flags | ||||
|  * | ||||
|  * When file has been modified ensure that special | ||||
|  * file privileges are removed and time settings are updated. | ||||
|  * | ||||
|  * If IOCB_NOWAIT is set, special file privileges will not be removed and | ||||
|  * time settings will not be updated. It will return -EAGAIN. | ||||
|  * | ||||
|  * Context: Caller must hold the file's inode lock. | ||||
|  * | ||||
|  * Return: 0 on success, negative errno on failure. | ||||
|  */ | ||||
| static int file_modified_flags(struct file *file, int flags) | ||||
| { | ||||
| 	int err; | ||||
| 	int ret; | ||||
| 	struct inode *inode = file_inode(file); | ||||
| 	struct timespec64 now = current_time(inode); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Clear the security bits if the process is not being run by root. | ||||
| 	 * This keeps people from modifying setuid and setgid binaries. | ||||
| 	 */ | ||||
| 	err = file_remove_privs(file); | ||||
| 	if (err) | ||||
| 		return err; | ||||
| 	ret = __file_remove_privs(file, flags); | ||||
| 	if (ret) | ||||
| 		return ret; | ||||
| 
 | ||||
| 	if (unlikely(file->f_mode & FMODE_NOCMTIME)) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	return file_update_time(file); | ||||
| 	ret = inode_needs_update_time(inode, &now); | ||||
| 	if (ret <= 0) | ||||
| 		return ret; | ||||
| 	if (flags & IOCB_NOWAIT) | ||||
| 		return -EAGAIN; | ||||
| 
 | ||||
| 	return __file_update_time(file, &now, ret); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * file_modified - handle mandated vfs changes when modifying a file | ||||
|  * @file: file that was modified | ||||
|  * | ||||
|  * When file has been modified ensure that special | ||||
|  * file privileges are removed and time settings are updated. | ||||
|  * | ||||
|  * Context: Caller must hold the file's inode lock. | ||||
|  * | ||||
|  * Return: 0 on success, negative errno on failure. | ||||
|  */ | ||||
| int file_modified(struct file *file) | ||||
| { | ||||
| 	return file_modified_flags(file, 0); | ||||
| } | ||||
| EXPORT_SYMBOL(file_modified); | ||||
| 
 | ||||
| /**
 | ||||
|  * kiocb_modified - handle mandated vfs changes when modifying a file | ||||
|  * @iocb: iocb that was modified | ||||
|  * | ||||
|  * When file has been modified ensure that special | ||||
|  * file privileges are removed and time settings are updated. | ||||
|  * | ||||
|  * Context: Caller must hold the file's inode lock. | ||||
|  * | ||||
|  * Return: 0 on success, negative errno on failure. | ||||
|  */ | ||||
| int kiocb_modified(struct kiocb *iocb) | ||||
| { | ||||
| 	return file_modified_flags(iocb->ki_filp, iocb->ki_flags); | ||||
| } | ||||
| EXPORT_SYMBOL_GPL(kiocb_modified); | ||||
| 
 | ||||
| int inode_needs_sync(struct inode *inode) | ||||
| { | ||||
| 	if (IS_SYNC(inode)) | ||||
|  |  | |||
|  | @ -44,20 +44,28 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio) | |||
| static struct bio_set iomap_ioend_bioset; | ||||
| 
 | ||||
| static struct iomap_page * | ||||
| iomap_page_create(struct inode *inode, struct folio *folio) | ||||
| iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags) | ||||
| { | ||||
| 	struct iomap_page *iop = to_iomap_page(folio); | ||||
| 	unsigned int nr_blocks = i_blocks_per_folio(inode, folio); | ||||
| 	gfp_t gfp; | ||||
| 
 | ||||
| 	if (iop || nr_blocks <= 1) | ||||
| 		return iop; | ||||
| 
 | ||||
| 	if (flags & IOMAP_NOWAIT) | ||||
| 		gfp = GFP_NOWAIT; | ||||
| 	else | ||||
| 		gfp = GFP_NOFS | __GFP_NOFAIL; | ||||
| 
 | ||||
| 	iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), | ||||
| 			GFP_NOFS | __GFP_NOFAIL); | ||||
| 	spin_lock_init(&iop->uptodate_lock); | ||||
| 	if (folio_test_uptodate(folio)) | ||||
| 		bitmap_fill(iop->uptodate, nr_blocks); | ||||
| 	folio_attach_private(folio, iop); | ||||
| 		      gfp); | ||||
| 	if (iop) { | ||||
| 		spin_lock_init(&iop->uptodate_lock); | ||||
| 		if (folio_test_uptodate(folio)) | ||||
| 			bitmap_fill(iop->uptodate, nr_blocks); | ||||
| 		folio_attach_private(folio, iop); | ||||
| 	} | ||||
| 	return iop; | ||||
| } | ||||
| 
 | ||||
|  | @ -226,7 +234,7 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, | |||
| 	if (WARN_ON_ONCE(size > iomap->length)) | ||||
| 		return -EIO; | ||||
| 	if (offset > 0) | ||||
| 		iop = iomap_page_create(iter->inode, folio); | ||||
| 		iop = iomap_page_create(iter->inode, folio, iter->flags); | ||||
| 	else | ||||
| 		iop = to_iomap_page(folio); | ||||
| 
 | ||||
|  | @ -264,7 +272,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, | |||
| 		return iomap_read_inline_data(iter, folio); | ||||
| 
 | ||||
| 	/* zero post-eof blocks as the page may be mapped */ | ||||
| 	iop = iomap_page_create(iter->inode, folio); | ||||
| 	iop = iomap_page_create(iter->inode, folio, iter->flags); | ||||
| 	iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); | ||||
| 	if (plen == 0) | ||||
| 		goto done; | ||||
|  | @ -547,10 +555,11 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, | |||
| 		size_t len, struct folio *folio) | ||||
| { | ||||
| 	const struct iomap *srcmap = iomap_iter_srcmap(iter); | ||||
| 	struct iomap_page *iop = iomap_page_create(iter->inode, folio); | ||||
| 	struct iomap_page *iop; | ||||
| 	loff_t block_size = i_blocksize(iter->inode); | ||||
| 	loff_t block_start = round_down(pos, block_size); | ||||
| 	loff_t block_end = round_up(pos + len, block_size); | ||||
| 	unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio); | ||||
| 	size_t from = offset_in_folio(folio, pos), to = from + len; | ||||
| 	size_t poff, plen; | ||||
| 
 | ||||
|  | @ -558,6 +567,10 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, | |||
| 		return 0; | ||||
| 	folio_clear_error(folio); | ||||
| 
 | ||||
| 	iop = iomap_page_create(iter->inode, folio, iter->flags); | ||||
| 	if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1) | ||||
| 		return -EAGAIN; | ||||
| 
 | ||||
| 	do { | ||||
| 		iomap_adjust_read_range(iter->inode, folio, &block_start, | ||||
| 				block_end - block_start, &poff, &plen); | ||||
|  | @ -574,7 +587,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, | |||
| 				return -EIO; | ||||
| 			folio_zero_segments(folio, poff, from, to, poff + plen); | ||||
| 		} else { | ||||
| 			int status = iomap_read_folio_sync(block_start, folio, | ||||
| 			int status; | ||||
| 
 | ||||
| 			if (iter->flags & IOMAP_NOWAIT) | ||||
| 				return -EAGAIN; | ||||
| 
 | ||||
| 			status = iomap_read_folio_sync(block_start, folio, | ||||
| 					poff, plen, srcmap); | ||||
| 			if (status) | ||||
| 				return status; | ||||
|  | @ -603,6 +621,9 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, | |||
| 	unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; | ||||
| 	int status = 0; | ||||
| 
 | ||||
| 	if (iter->flags & IOMAP_NOWAIT) | ||||
| 		fgp |= FGP_NOWAIT; | ||||
| 
 | ||||
| 	BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); | ||||
| 	if (srcmap != &iter->iomap) | ||||
| 		BUG_ON(pos + len > srcmap->offset + srcmap->length); | ||||
|  | @ -622,7 +643,7 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, | |||
| 	folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, | ||||
| 			fgp, mapping_gfp_mask(iter->inode->i_mapping)); | ||||
| 	if (!folio) { | ||||
| 		status = -ENOMEM; | ||||
| 		status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; | ||||
| 		goto out_no_page; | ||||
| 	} | ||||
| 	if (pos + len > folio_pos(folio) + folio_size(folio)) | ||||
|  | @ -740,6 +761,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) | |||
| 	loff_t pos = iter->pos; | ||||
| 	ssize_t written = 0; | ||||
| 	long status = 0; | ||||
| 	struct address_space *mapping = iter->inode->i_mapping; | ||||
| 	unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; | ||||
| 
 | ||||
| 	do { | ||||
| 		struct folio *folio; | ||||
|  | @ -752,6 +775,11 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) | |||
| 		bytes = min_t(unsigned long, PAGE_SIZE - offset, | ||||
| 						iov_iter_count(i)); | ||||
| again: | ||||
| 		status = balance_dirty_pages_ratelimited_flags(mapping, | ||||
| 							       bdp_flags); | ||||
| 		if (unlikely(status)) | ||||
| 			break; | ||||
| 
 | ||||
| 		if (bytes > length) | ||||
| 			bytes = length; | ||||
| 
 | ||||
|  | @ -760,6 +788,10 @@ again: | |||
| 		 * Otherwise there's a nasty deadlock on copying from the | ||||
| 		 * same page as we're writing to, without it being marked | ||||
| 		 * up-to-date. | ||||
| 		 * | ||||
| 		 * For async buffered writes the assumption is that the user | ||||
| 		 * page has already been faulted in. This can be optimized by | ||||
| 		 * faulting the user page. | ||||
| 		 */ | ||||
| 		if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { | ||||
| 			status = -EFAULT; | ||||
|  | @ -771,7 +803,7 @@ again: | |||
| 			break; | ||||
| 
 | ||||
| 		page = folio_file_page(folio, pos >> PAGE_SHIFT); | ||||
| 		if (mapping_writably_mapped(iter->inode->i_mapping)) | ||||
| 		if (mapping_writably_mapped(mapping)) | ||||
| 			flush_dcache_page(page); | ||||
| 
 | ||||
| 		copied = copy_page_from_iter_atomic(page, offset, bytes, i); | ||||
|  | @ -796,10 +828,12 @@ again: | |||
| 		pos += status; | ||||
| 		written += status; | ||||
| 		length -= status; | ||||
| 
 | ||||
| 		balance_dirty_pages_ratelimited(iter->inode->i_mapping); | ||||
| 	} while (iov_iter_count(i) && length); | ||||
| 
 | ||||
| 	if (status == -EAGAIN) { | ||||
| 		iov_iter_revert(i, written); | ||||
| 		return -EAGAIN; | ||||
| 	} | ||||
| 	return written ? written : status; | ||||
| } | ||||
| 
 | ||||
|  | @ -815,6 +849,9 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, | |||
| 	}; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	if (iocb->ki_flags & IOCB_NOWAIT) | ||||
| 		iter.flags |= IOMAP_NOWAIT; | ||||
| 
 | ||||
| 	while ((ret = iomap_iter(&iter, ops)) > 0) | ||||
| 		iter.processed = iomap_write_iter(&iter, i); | ||||
| 	if (iter.pos == iocb->ki_pos) | ||||
|  | @ -1329,7 +1366,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, | |||
| 		struct writeback_control *wbc, struct inode *inode, | ||||
| 		struct folio *folio, u64 end_pos) | ||||
| { | ||||
| 	struct iomap_page *iop = iomap_page_create(inode, folio); | ||||
| 	struct iomap_page *iop = iomap_page_create(inode, folio, 0); | ||||
| 	struct iomap_ioend *ioend, *next; | ||||
| 	unsigned len = i_blocksize(inode); | ||||
| 	unsigned nblocks = i_blocks_per_folio(inode, folio); | ||||
|  |  | |||
|  | @ -1663,7 +1663,9 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) | |||
| 	if (iocb->ki_flags & IOCB_APPEND) | ||||
| 		iocb->ki_pos = i_size_read(inode); | ||||
| 
 | ||||
| 	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) | ||||
| 	if ((iocb->ki_flags & IOCB_NOWAIT) && | ||||
| 	    !((iocb->ki_flags & IOCB_DIRECT) || | ||||
| 	      (file->f_mode & FMODE_BUF_WASYNC))) | ||||
| 		return -EINVAL; | ||||
| 
 | ||||
| 	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); | ||||
|  |  | |||
|  | @ -410,7 +410,7 @@ restart: | |||
| 		spin_unlock(&ip->i_flags_lock); | ||||
| 
 | ||||
| out: | ||||
| 	return file_modified(file); | ||||
| 	return kiocb_modified(iocb); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
|  | @ -700,12 +700,11 @@ xfs_file_buffered_write( | |||
| 	bool			cleared_space = false; | ||||
| 	unsigned int		iolock; | ||||
| 
 | ||||
| 	if (iocb->ki_flags & IOCB_NOWAIT) | ||||
| 		return -EOPNOTSUPP; | ||||
| 
 | ||||
| write_retry: | ||||
| 	iolock = XFS_IOLOCK_EXCL; | ||||
| 	xfs_ilock(ip, iolock); | ||||
| 	ret = xfs_ilock_iocb(iocb, iolock); | ||||
| 	if (ret) | ||||
| 		return ret; | ||||
| 
 | ||||
| 	ret = xfs_file_write_checks(iocb, from, &iolock); | ||||
| 	if (ret) | ||||
|  | @ -1165,7 +1164,7 @@ xfs_file_open( | |||
| { | ||||
| 	if (xfs_is_shutdown(XFS_M(inode->i_sb))) | ||||
| 		return -EIO; | ||||
| 	file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; | ||||
| 	file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; | ||||
| 	return generic_file_open(inode, file); | ||||
| } | ||||
| 
 | ||||
|  |  | |||
|  | @ -664,7 +664,7 @@ xfs_ilock_for_iomap( | |||
| 	unsigned		flags, | ||||
| 	unsigned		*lockmode) | ||||
| { | ||||
| 	unsigned		mode = XFS_ILOCK_SHARED; | ||||
| 	unsigned int		mode = *lockmode; | ||||
| 	bool			is_write = flags & (IOMAP_WRITE | IOMAP_ZERO); | ||||
| 
 | ||||
| 	/*
 | ||||
|  | @ -742,7 +742,7 @@ xfs_direct_write_iomap_begin( | |||
| 	int			nimaps = 1, error = 0; | ||||
| 	bool			shared = false; | ||||
| 	u16			iomap_flags = 0; | ||||
| 	unsigned		lockmode; | ||||
| 	unsigned int		lockmode = XFS_ILOCK_SHARED; | ||||
| 
 | ||||
| 	ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); | ||||
| 
 | ||||
|  | @ -886,6 +886,7 @@ xfs_buffered_write_iomap_begin( | |||
| 	bool			eof = false, cow_eof = false, shared = false; | ||||
| 	int			allocfork = XFS_DATA_FORK; | ||||
| 	int			error = 0; | ||||
| 	unsigned int		lockmode = XFS_ILOCK_EXCL; | ||||
| 
 | ||||
| 	if (xfs_is_shutdown(mp)) | ||||
| 		return -EIO; | ||||
|  | @ -897,7 +898,9 @@ xfs_buffered_write_iomap_begin( | |||
| 
 | ||||
| 	ASSERT(!XFS_IS_REALTIME_INODE(ip)); | ||||
| 
 | ||||
| 	xfs_ilock(ip, XFS_ILOCK_EXCL); | ||||
| 	error = xfs_ilock_for_iomap(ip, flags, &lockmode); | ||||
| 	if (error) | ||||
| 		return error; | ||||
| 
 | ||||
| 	if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || | ||||
| 	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { | ||||
|  | @ -1172,7 +1175,7 @@ xfs_read_iomap_begin( | |||
| 	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length); | ||||
| 	int			nimaps = 1, error = 0; | ||||
| 	bool			shared = false; | ||||
| 	unsigned		lockmode; | ||||
| 	unsigned int		lockmode = XFS_ILOCK_SHARED; | ||||
| 
 | ||||
| 	ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); | ||||
| 
 | ||||
|  |  | |||
|  | @ -180,6 +180,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, | |||
| /* File supports async buffered reads */ | ||||
| #define FMODE_BUF_RASYNC	((__force fmode_t)0x40000000) | ||||
| 
 | ||||
| /* File supports async nowait buffered writes */ | ||||
| #define FMODE_BUF_WASYNC	((__force fmode_t)0x80000000) | ||||
| 
 | ||||
| /*
 | ||||
|  * Attribute flags.  These should be or-ed together to figure out what | ||||
|  * has been changed! | ||||
|  | @ -2515,6 +2518,7 @@ static inline void file_accessed(struct file *file) | |||
| } | ||||
| 
 | ||||
| extern int file_modified(struct file *file); | ||||
| int kiocb_modified(struct kiocb *iocb); | ||||
| 
 | ||||
| int sync_inode_metadata(struct inode *inode, int wait); | ||||
| 
 | ||||
|  |  | |||
|  | @ -364,7 +364,14 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); | |||
| unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); | ||||
| 
 | ||||
| void wb_update_bandwidth(struct bdi_writeback *wb); | ||||
| 
 | ||||
| /* Invoke balance dirty pages in async mode. */ | ||||
| #define BDP_ASYNC 0x0001 | ||||
| 
 | ||||
| void balance_dirty_pages_ratelimited(struct address_space *mapping); | ||||
| int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, | ||||
| 		unsigned int flags); | ||||
| 
 | ||||
| bool wb_over_bg_thresh(struct bdi_writeback *wb); | ||||
| 
 | ||||
| typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, | ||||
|  |  | |||
|  | @ -630,6 +630,31 @@ TRACE_EVENT(io_uring_task_work_run, | |||
| 		 __entry->tctx, __entry->count, __entry->loops) | ||||
| ); | ||||
| 
 | ||||
| TRACE_EVENT(io_uring_short_write, | ||||
| 
 | ||||
| 	TP_PROTO(void *ctx, u64 fpos, u64 wanted, u64 got), | ||||
| 
 | ||||
| 	TP_ARGS(ctx, fpos, wanted, got), | ||||
| 
 | ||||
| 	TP_STRUCT__entry( | ||||
| 		__field(void *,	ctx) | ||||
| 		__field(u64,	fpos) | ||||
| 		__field(u64,	wanted) | ||||
| 		__field(u64,	got) | ||||
| 	), | ||||
| 
 | ||||
| 	TP_fast_assign( | ||||
| 		__entry->ctx	= ctx; | ||||
| 		__entry->fpos	= fpos; | ||||
| 		__entry->wanted	= wanted; | ||||
| 		__entry->got	= got; | ||||
| 	), | ||||
| 
 | ||||
| 	TP_printk("ring %p, fpos %lld, wanted %lld, got %lld", | ||||
| 			  __entry->ctx, __entry->fpos, | ||||
| 			  __entry->wanted, __entry->got) | ||||
| ); | ||||
| 
 | ||||
| #endif /* _TRACE_IO_URING_H */ | ||||
| 
 | ||||
| /* This part must be outside protection */ | ||||
|  |  | |||
|  | @ -641,7 +641,7 @@ static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) | |||
| 		return -EINVAL; | ||||
| } | ||||
| 
 | ||||
| static bool need_read_all(struct io_kiocb *req) | ||||
| static bool need_complete_io(struct io_kiocb *req) | ||||
| { | ||||
| 	return req->flags & REQ_F_ISREG || | ||||
| 		S_ISBLK(file_inode(req->file)->i_mode); | ||||
|  | @ -775,7 +775,7 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) | |||
| 			kfree(iovec); | ||||
| 		return IOU_ISSUE_SKIP_COMPLETE; | ||||
| 	} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || | ||||
| 		   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { | ||||
| 		   (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) { | ||||
| 		/* read all, failed, already did sync or don't want to retry */ | ||||
| 		goto done; | ||||
| 	} | ||||
|  | @ -870,9 +870,10 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) | |||
| 		if (unlikely(!io_file_supports_nowait(req))) | ||||
| 			goto copy_iov; | ||||
| 
 | ||||
| 		/* file path doesn't support NOWAIT for non-direct_IO */ | ||||
| 		if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && | ||||
| 		    (req->flags & REQ_F_ISREG)) | ||||
| 		/* File path supports NOWAIT for non-direct_IO only for block devices. */ | ||||
| 		if (!(kiocb->ki_flags & IOCB_DIRECT) && | ||||
| 			!(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) && | ||||
| 			(req->flags & REQ_F_ISREG)) | ||||
| 			goto copy_iov; | ||||
| 
 | ||||
| 		kiocb->ki_flags |= IOCB_NOWAIT; | ||||
|  | @ -928,13 +929,41 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) | |||
| 		/* IOPOLL retry should happen for io-wq threads */ | ||||
| 		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) | ||||
| 			goto copy_iov; | ||||
| 
 | ||||
| 		if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { | ||||
| 			struct io_async_rw *rw; | ||||
| 
 | ||||
| 			trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2, | ||||
| 						req->cqe.res, ret2); | ||||
| 
 | ||||
| 			/* This is a partial write. The file pos has already been
 | ||||
| 			 * updated, setup the async struct to complete the request | ||||
| 			 * in the worker. Also update bytes_done to account for | ||||
| 			 * the bytes already written. | ||||
| 			 */ | ||||
| 			iov_iter_save_state(&s->iter, &s->iter_state); | ||||
| 			ret = io_setup_async_rw(req, iovec, s, true); | ||||
| 
 | ||||
| 			rw = req->async_data; | ||||
| 			if (rw) | ||||
| 				rw->bytes_done += ret2; | ||||
| 
 | ||||
| 			if (kiocb->ki_flags & IOCB_WRITE) | ||||
| 				kiocb_end_write(req); | ||||
| 			return ret ? ret : -EAGAIN; | ||||
| 		} | ||||
| done: | ||||
| 		ret = kiocb_done(req, ret2, issue_flags); | ||||
| 	} else { | ||||
| copy_iov: | ||||
| 		iov_iter_restore(&s->iter, &s->iter_state); | ||||
| 		ret = io_setup_async_rw(req, iovec, s, false); | ||||
| 		return ret ?: -EAGAIN; | ||||
| 		if (!ret) { | ||||
| 			if (kiocb->ki_flags & IOCB_WRITE) | ||||
| 				kiocb_end_write(req); | ||||
| 			return -EAGAIN; | ||||
| 		} | ||||
| 		return ret; | ||||
| 	} | ||||
| 	/* it's reportedly faster than delegating the null check to kfree() */ | ||||
| 	if (iovec) | ||||
|  |  | |||
|  | @ -1988,6 +1988,10 @@ no_page: | |||
| 			gfp |= __GFP_WRITE; | ||||
| 		if (fgp_flags & FGP_NOFS) | ||||
| 			gfp &= ~__GFP_FS; | ||||
| 		if (fgp_flags & FGP_NOWAIT) { | ||||
| 			gfp &= ~GFP_KERNEL; | ||||
| 			gfp |= GFP_NOWAIT | __GFP_NOWARN; | ||||
| 		} | ||||
| 
 | ||||
| 		folio = filemap_alloc_folio(gfp, 0); | ||||
| 		if (!folio) | ||||
|  |  | |||
|  | @ -1554,8 +1554,8 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) | |||
|  * If we're over `background_thresh' then the writeback threads are woken to | ||||
|  * perform some writeout. | ||||
|  */ | ||||
| static void balance_dirty_pages(struct bdi_writeback *wb, | ||||
| 				unsigned long pages_dirtied) | ||||
| static int balance_dirty_pages(struct bdi_writeback *wb, | ||||
| 			       unsigned long pages_dirtied, unsigned int flags) | ||||
| { | ||||
| 	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; | ||||
| 	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; | ||||
|  | @ -1575,6 +1575,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, | |||
| 	struct backing_dev_info *bdi = wb->bdi; | ||||
| 	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; | ||||
| 	unsigned long start_time = jiffies; | ||||
| 	int ret = 0; | ||||
| 
 | ||||
| 	for (;;) { | ||||
| 		unsigned long now = jiffies; | ||||
|  | @ -1627,6 +1628,19 @@ static void balance_dirty_pages(struct bdi_writeback *wb, | |||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * In laptop mode, we wait until hitting the higher threshold | ||||
| 		 * before starting background writeout, and then write out all | ||||
| 		 * the way down to the lower threshold.  So slow writers cause | ||||
| 		 * minimal disk activity. | ||||
| 		 * | ||||
| 		 * In normal mode, we start background writeout at the lower | ||||
| 		 * background_thresh, to keep the amount of dirty memory low. | ||||
| 		 */ | ||||
| 		if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh && | ||||
| 		    !writeback_in_progress(wb)) | ||||
| 			wb_start_background_writeback(wb); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Throttle it only when the background writeback cannot | ||||
| 		 * catch-up. This avoids (excessively) small writeouts | ||||
|  | @ -1657,6 +1671,7 @@ free_running: | |||
| 			break; | ||||
| 		} | ||||
| 
 | ||||
| 		/* Start writeback even when in laptop mode */ | ||||
| 		if (unlikely(!writeback_in_progress(wb))) | ||||
| 			wb_start_background_writeback(wb); | ||||
| 
 | ||||
|  | @ -1715,8 +1730,8 @@ free_running: | |||
| 				sdtc = mdtc; | ||||
| 		} | ||||
| 
 | ||||
| 		if (dirty_exceeded && !wb->dirty_exceeded) | ||||
| 			wb->dirty_exceeded = 1; | ||||
| 		if (dirty_exceeded != wb->dirty_exceeded) | ||||
| 			wb->dirty_exceeded = dirty_exceeded; | ||||
| 
 | ||||
| 		if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + | ||||
| 					   BANDWIDTH_INTERVAL)) | ||||
|  | @ -1789,6 +1804,10 @@ pause: | |||
| 					  period, | ||||
| 					  pause, | ||||
| 					  start_time); | ||||
| 		if (flags & BDP_ASYNC) { | ||||
| 			ret = -EAGAIN; | ||||
| 			break; | ||||
| 		} | ||||
| 		__set_current_state(TASK_KILLABLE); | ||||
| 		wb->dirty_sleep = now; | ||||
| 		io_schedule_timeout(pause); | ||||
|  | @ -1820,26 +1839,7 @@ pause: | |||
| 		if (fatal_signal_pending(current)) | ||||
| 			break; | ||||
| 	} | ||||
| 
 | ||||
| 	if (!dirty_exceeded && wb->dirty_exceeded) | ||||
| 		wb->dirty_exceeded = 0; | ||||
| 
 | ||||
| 	if (writeback_in_progress(wb)) | ||||
| 		return; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * In laptop mode, we wait until hitting the higher threshold before | ||||
| 	 * starting background writeout, and then write out all the way down | ||||
| 	 * to the lower threshold.  So slow writers cause minimal disk activity. | ||||
| 	 * | ||||
| 	 * In normal mode, we start background writeout at the lower | ||||
| 	 * background_thresh, to keep the amount of dirty memory low. | ||||
| 	 */ | ||||
| 	if (laptop_mode) | ||||
| 		return; | ||||
| 
 | ||||
| 	if (nr_reclaimable > gdtc->bg_thresh) | ||||
| 		wb_start_background_writeback(wb); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static DEFINE_PER_CPU(int, bdp_ratelimits); | ||||
|  | @ -1861,27 +1861,34 @@ static DEFINE_PER_CPU(int, bdp_ratelimits); | |||
| DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; | ||||
| 
 | ||||
| /**
 | ||||
|  * balance_dirty_pages_ratelimited - balance dirty memory state | ||||
|  * @mapping: address_space which was dirtied | ||||
|  * balance_dirty_pages_ratelimited_flags - Balance dirty memory state. | ||||
|  * @mapping: address_space which was dirtied. | ||||
|  * @flags: BDP flags. | ||||
|  * | ||||
|  * Processes which are dirtying memory should call in here once for each page | ||||
|  * which was newly dirtied.  The function will periodically check the system's | ||||
|  * dirty state and will initiate writeback if needed. | ||||
|  * | ||||
|  * Once we're over the dirty memory limit we decrease the ratelimiting | ||||
|  * by a lot, to prevent individual processes from overshooting the limit | ||||
|  * by (ratelimit_pages) each. | ||||
|  * See balance_dirty_pages_ratelimited() for details. | ||||
|  * | ||||
|  * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to | ||||
|  * indicate that memory is out of balance and the caller must wait | ||||
|  * for I/O to complete.  Otherwise, it will return 0 to indicate | ||||
|  * that either memory was already in balance, or it was able to sleep | ||||
|  * until the amount of dirty memory returned to balance. | ||||
|  */ | ||||
| void balance_dirty_pages_ratelimited(struct address_space *mapping) | ||||
| int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, | ||||
| 					unsigned int flags) | ||||
| { | ||||
| 	struct inode *inode = mapping->host; | ||||
| 	struct backing_dev_info *bdi = inode_to_bdi(inode); | ||||
| 	struct bdi_writeback *wb = NULL; | ||||
| 	int ratelimit; | ||||
| 	int ret = 0; | ||||
| 	int *p; | ||||
| 
 | ||||
| 	if (!(bdi->capabilities & BDI_CAP_WRITEBACK)) | ||||
| 		return; | ||||
| 		return ret; | ||||
| 
 | ||||
| 	if (inode_cgwb_enabled(inode)) | ||||
| 		wb = wb_get_create_current(bdi, GFP_KERNEL); | ||||
|  | @ -1921,9 +1928,27 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) | |||
| 	preempt_enable(); | ||||
| 
 | ||||
| 	if (unlikely(current->nr_dirtied >= ratelimit)) | ||||
| 		balance_dirty_pages(wb, current->nr_dirtied); | ||||
| 		ret = balance_dirty_pages(wb, current->nr_dirtied, flags); | ||||
| 
 | ||||
| 	wb_put(wb); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * balance_dirty_pages_ratelimited - balance dirty memory state. | ||||
|  * @mapping: address_space which was dirtied. | ||||
|  * | ||||
|  * Processes which are dirtying memory should call in here once for each page | ||||
|  * which was newly dirtied.  The function will periodically check the system's | ||||
|  * dirty state and will initiate writeback if needed. | ||||
|  * | ||||
|  * Once we're over the dirty memory limit we decrease the ratelimiting | ||||
|  * by a lot, to prevent individual processes from overshooting the limit | ||||
|  * by (ratelimit_pages) each. | ||||
|  */ | ||||
| void balance_dirty_pages_ratelimited(struct address_space *mapping) | ||||
| { | ||||
| 	balance_dirty_pages_ratelimited_flags(mapping, 0); | ||||
| } | ||||
| EXPORT_SYMBOL(balance_dirty_pages_ratelimited); | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 Linus Torvalds
						Linus Torvalds