mirror of
				git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
				synced 2025-10-31 16:54:21 +00:00 
			
		
		
		
	[PATCH] Add vector AIO support
This work was initially done by Zach Brown to add support for vectored aio. These are the core changes for AIO to support IOCB_CMD_PREADV/IOCB_CMD_PWRITEV. [akpm@osdl.org: huge build fix] Signed-off-by: Zach Brown <zach.brown@oracle.com> Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com> Acked-by: Benjamin LaHaise <bcrl@kvack.org> Acked-by: James Morris <jmorris@namei.org> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
		
							parent
							
								
									543ade1fc9
								
							
						
					
					
						commit
						eed4e51fb6
					
				
					 5 changed files with 203 additions and 108 deletions
				
			
		
							
								
								
									
										169
									
								
								fs/aio.c
									
										
									
									
									
								
							
							
						
						
									
										169
									
								
								fs/aio.c
									
										
									
									
									
								
							|  | @ -415,6 +415,7 @@ static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx) | |||
| 	req->ki_retry = NULL; | ||||
| 	req->ki_dtor = NULL; | ||||
| 	req->private = NULL; | ||||
| 	req->ki_iovec = NULL; | ||||
| 	INIT_LIST_HEAD(&req->ki_run_list); | ||||
| 
 | ||||
| 	/* Check if the completion queue has enough free space to
 | ||||
|  | @ -460,6 +461,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) | |||
| 
 | ||||
| 	if (req->ki_dtor) | ||||
| 		req->ki_dtor(req); | ||||
| 	if (req->ki_iovec != &req->ki_inline_vec) | ||||
| 		kfree(req->ki_iovec); | ||||
| 	kmem_cache_free(kiocb_cachep, req); | ||||
| 	ctx->reqs_active--; | ||||
| 
 | ||||
|  | @ -1301,42 +1304,60 @@ asmlinkage long sys_io_destroy(aio_context_t ctx) | |||
| 	return -EINVAL; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * aio_p{read,write} are the default  ki_retry methods for | ||||
|  * IO_CMD_P{READ,WRITE}.  They maintain kiocb retry state around potentially | ||||
|  * multiple calls to f_op->aio_read().  They loop around partial progress | ||||
|  * instead of returning -EIOCBRETRY because they don't have the means to call | ||||
|  * kick_iocb(). | ||||
|  */ | ||||
| static ssize_t aio_pread(struct kiocb *iocb) | ||||
| static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) | ||||
| { | ||||
| 	struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg]; | ||||
| 
 | ||||
| 	BUG_ON(ret <= 0); | ||||
| 
 | ||||
| 	while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) { | ||||
| 		ssize_t this = min((ssize_t)iov->iov_len, ret); | ||||
| 		iov->iov_base += this; | ||||
| 		iov->iov_len -= this; | ||||
| 		iocb->ki_left -= this; | ||||
| 		ret -= this; | ||||
| 		if (iov->iov_len == 0) { | ||||
| 			iocb->ki_cur_seg++; | ||||
| 			iov++; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	/* the caller should not have done more io than what fit in
 | ||||
| 	 * the remaining iovecs */ | ||||
| 	BUG_ON(ret > 0 && iocb->ki_left == 0); | ||||
| } | ||||
| 
 | ||||
| static ssize_t aio_rw_vect_retry(struct kiocb *iocb) | ||||
| { | ||||
| 	struct file *file = iocb->ki_filp; | ||||
| 	struct address_space *mapping = file->f_mapping; | ||||
| 	struct inode *inode = mapping->host; | ||||
| 	ssize_t (*rw_op)(struct kiocb *, const struct iovec *, | ||||
| 			 unsigned long, loff_t); | ||||
| 	ssize_t ret = 0; | ||||
| 	unsigned short opcode; | ||||
| 
 | ||||
| 	if ((iocb->ki_opcode == IOCB_CMD_PREADV) || | ||||
| 		(iocb->ki_opcode == IOCB_CMD_PREAD)) { | ||||
| 		rw_op = file->f_op->aio_read; | ||||
| 		opcode = IOCB_CMD_PREADV; | ||||
| 	} else { | ||||
| 		rw_op = file->f_op->aio_write; | ||||
| 		opcode = IOCB_CMD_PWRITEV; | ||||
| 	} | ||||
| 
 | ||||
| 	do { | ||||
| 		iocb->ki_inline_vec.iov_base = iocb->ki_buf; | ||||
| 		iocb->ki_inline_vec.iov_len = iocb->ki_left; | ||||
| 		ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], | ||||
| 			    iocb->ki_nr_segs - iocb->ki_cur_seg, | ||||
| 			    iocb->ki_pos); | ||||
| 		if (ret > 0) | ||||
| 			aio_advance_iovec(iocb, ret); | ||||
| 
 | ||||
| 		ret = file->f_op->aio_read(iocb, &iocb->ki_inline_vec, | ||||
| 						1, iocb->ki_pos); | ||||
| 		/*
 | ||||
| 		 * Can't just depend on iocb->ki_left to determine | ||||
| 		 * whether we are done. This may have been a short read. | ||||
| 		 */ | ||||
| 		if (ret > 0) { | ||||
| 			iocb->ki_buf += ret; | ||||
| 			iocb->ki_left -= ret; | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * For pipes and sockets we return once we have some data; for | ||||
| 		 * regular files we retry till we complete the entire read or | ||||
| 		 * find that we can't read any more data (e.g short reads). | ||||
| 		 */ | ||||
| 	/* retry all partial writes.  retry partial reads as long as it's a
 | ||||
| 	 * regular file. */ | ||||
| 	} while (ret > 0 && iocb->ki_left > 0 && | ||||
| 		 !S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)); | ||||
| 		 (opcode == IOCB_CMD_PWRITEV || | ||||
| 		  (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); | ||||
| 
 | ||||
| 	/* This means we must have transferred all that we could */ | ||||
| 	/* No need to retry anymore */ | ||||
|  | @ -1346,30 +1367,6 @@ static ssize_t aio_pread(struct kiocb *iocb) | |||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /* see aio_pread() */ | ||||
| static ssize_t aio_pwrite(struct kiocb *iocb) | ||||
| { | ||||
| 	struct file *file = iocb->ki_filp; | ||||
| 	ssize_t ret = 0; | ||||
| 
 | ||||
| 	do { | ||||
| 		iocb->ki_inline_vec.iov_base = iocb->ki_buf; | ||||
| 		iocb->ki_inline_vec.iov_len = iocb->ki_left; | ||||
| 
 | ||||
| 		ret = file->f_op->aio_write(iocb, &iocb->ki_inline_vec, | ||||
| 						1, iocb->ki_pos); | ||||
| 		if (ret > 0) { | ||||
| 			iocb->ki_buf += ret; | ||||
| 			iocb->ki_left -= ret; | ||||
| 		} | ||||
| 	} while (ret > 0 && iocb->ki_left > 0); | ||||
| 
 | ||||
| 	if ((ret == 0) || (iocb->ki_left == 0)) | ||||
| 		ret = iocb->ki_nbytes - iocb->ki_left; | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static ssize_t aio_fdsync(struct kiocb *iocb) | ||||
| { | ||||
| 	struct file *file = iocb->ki_filp; | ||||
|  | @ -1390,6 +1387,38 @@ static ssize_t aio_fsync(struct kiocb *iocb) | |||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb) | ||||
| { | ||||
| 	ssize_t ret; | ||||
| 
 | ||||
| 	ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, | ||||
| 				    kiocb->ki_nbytes, 1, | ||||
| 				    &kiocb->ki_inline_vec, &kiocb->ki_iovec); | ||||
| 	if (ret < 0) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	kiocb->ki_nr_segs = kiocb->ki_nbytes; | ||||
| 	kiocb->ki_cur_seg = 0; | ||||
| 	/* ki_nbytes/left now reflect bytes instead of segs */ | ||||
| 	kiocb->ki_nbytes = ret; | ||||
| 	kiocb->ki_left = ret; | ||||
| 
 | ||||
| 	ret = 0; | ||||
| out: | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static ssize_t aio_setup_single_vector(struct kiocb *kiocb) | ||||
| { | ||||
| 	kiocb->ki_iovec = &kiocb->ki_inline_vec; | ||||
| 	kiocb->ki_iovec->iov_base = kiocb->ki_buf; | ||||
| 	kiocb->ki_iovec->iov_len = kiocb->ki_left; | ||||
| 	kiocb->ki_nr_segs = 1; | ||||
| 	kiocb->ki_cur_seg = 0; | ||||
| 	kiocb->ki_nbytes = kiocb->ki_left; | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * aio_setup_iocb: | ||||
|  *	Performs the initial checks and aio retry method | ||||
|  | @ -1412,9 +1441,12 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb) | |||
| 		ret = security_file_permission(file, MAY_READ); | ||||
| 		if (unlikely(ret)) | ||||
| 			break; | ||||
| 		ret = aio_setup_single_vector(kiocb); | ||||
| 		if (ret) | ||||
| 			break; | ||||
| 		ret = -EINVAL; | ||||
| 		if (file->f_op->aio_read) | ||||
| 			kiocb->ki_retry = aio_pread; | ||||
| 			kiocb->ki_retry = aio_rw_vect_retry; | ||||
| 		break; | ||||
| 	case IOCB_CMD_PWRITE: | ||||
| 		ret = -EBADF; | ||||
|  | @ -1427,9 +1459,40 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb) | |||
| 		ret = security_file_permission(file, MAY_WRITE); | ||||
| 		if (unlikely(ret)) | ||||
| 			break; | ||||
| 		ret = aio_setup_single_vector(kiocb); | ||||
| 		if (ret) | ||||
| 			break; | ||||
| 		ret = -EINVAL; | ||||
| 		if (file->f_op->aio_write) | ||||
| 			kiocb->ki_retry = aio_pwrite; | ||||
| 			kiocb->ki_retry = aio_rw_vect_retry; | ||||
| 		break; | ||||
| 	case IOCB_CMD_PREADV: | ||||
| 		ret = -EBADF; | ||||
| 		if (unlikely(!(file->f_mode & FMODE_READ))) | ||||
| 			break; | ||||
| 		ret = security_file_permission(file, MAY_READ); | ||||
| 		if (unlikely(ret)) | ||||
| 			break; | ||||
| 		ret = aio_setup_vectored_rw(READ, kiocb); | ||||
| 		if (ret) | ||||
| 			break; | ||||
| 		ret = -EINVAL; | ||||
| 		if (file->f_op->aio_read) | ||||
| 			kiocb->ki_retry = aio_rw_vect_retry; | ||||
| 		break; | ||||
| 	case IOCB_CMD_PWRITEV: | ||||
| 		ret = -EBADF; | ||||
| 		if (unlikely(!(file->f_mode & FMODE_WRITE))) | ||||
| 			break; | ||||
| 		ret = security_file_permission(file, MAY_WRITE); | ||||
| 		if (unlikely(ret)) | ||||
| 			break; | ||||
| 		ret = aio_setup_vectored_rw(WRITE, kiocb); | ||||
| 		if (ret) | ||||
| 			break; | ||||
| 		ret = -EINVAL; | ||||
| 		if (file->f_op->aio_write) | ||||
| 			kiocb->ki_retry = aio_rw_vect_retry; | ||||
| 		break; | ||||
| 	case IOCB_CMD_FDSYNC: | ||||
| 		ret = -EINVAL; | ||||
|  |  | |||
							
								
								
									
										131
									
								
								fs/read_write.c
									
										
									
									
									
								
							
							
						
						
									
										131
									
								
								fs/read_write.c
									
										
									
									
									
								
							|  | @ -511,6 +511,74 @@ ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, | |||
| /* A write operation does a read from user space and vice versa */ | ||||
| #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) | ||||
| 
 | ||||
| ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, | ||||
| 			      unsigned long nr_segs, unsigned long fast_segs, | ||||
| 			      struct iovec *fast_pointer, | ||||
| 			      struct iovec **ret_pointer) | ||||
|   { | ||||
| 	unsigned long seg; | ||||
|   	ssize_t ret; | ||||
| 	struct iovec *iov = fast_pointer; | ||||
| 
 | ||||
|   	/*
 | ||||
|   	 * SuS says "The readv() function *may* fail if the iovcnt argument | ||||
|   	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has | ||||
|   	 * traditionally returned zero for zero segments, so... | ||||
|   	 */ | ||||
| 	if (nr_segs == 0) { | ||||
| 		ret = 0; | ||||
|   		goto out; | ||||
| 	} | ||||
| 
 | ||||
|   	/*
 | ||||
|   	 * First get the "struct iovec" from user memory and | ||||
|   	 * verify all the pointers | ||||
|   	 */ | ||||
| 	if (nr_segs > UIO_MAXIOV) { | ||||
| 		ret = -EINVAL; | ||||
|   		goto out; | ||||
| 	} | ||||
| 	if (nr_segs > fast_segs) { | ||||
|   		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); | ||||
| 		if (iov == NULL) { | ||||
| 			ret = -ENOMEM; | ||||
|   			goto out; | ||||
| 		} | ||||
|   	} | ||||
| 	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) { | ||||
| 		ret = -EFAULT; | ||||
|   		goto out; | ||||
| 	} | ||||
| 
 | ||||
|   	/*
 | ||||
| 	 * According to the Single Unix Specification we should return EINVAL | ||||
| 	 * if an element length is < 0 when cast to ssize_t or if the | ||||
| 	 * total length would overflow the ssize_t return value of the | ||||
| 	 * system call. | ||||
|   	 */ | ||||
| 	ret = 0; | ||||
|   	for (seg = 0; seg < nr_segs; seg++) { | ||||
|   		void __user *buf = iov[seg].iov_base; | ||||
|   		ssize_t len = (ssize_t)iov[seg].iov_len; | ||||
| 
 | ||||
| 		/* see if we're about to use an invalid len or if
 | ||||
| 		 * it's about to overflow ssize_t */ | ||||
| 		if (len < 0 || (ret + len < ret)) { | ||||
| 			ret = -EINVAL; | ||||
|   			goto out; | ||||
| 		} | ||||
| 		if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { | ||||
| 			ret = -EFAULT; | ||||
|   			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		ret += len; | ||||
|   	} | ||||
| out: | ||||
| 	*ret_pointer = iov; | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static ssize_t do_readv_writev(int type, struct file *file, | ||||
| 			       const struct iovec __user * uvector, | ||||
| 			       unsigned long nr_segs, loff_t *pos) | ||||
|  | @ -519,64 +587,20 @@ static ssize_t do_readv_writev(int type, struct file *file, | |||
| 	struct iovec iovstack[UIO_FASTIOV]; | ||||
| 	struct iovec *iov = iovstack; | ||||
| 	ssize_t ret; | ||||
| 	int seg; | ||||
| 	io_fn_t fn; | ||||
| 	iov_fn_t fnv; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * SuS says "The readv() function *may* fail if the iovcnt argument | ||||
| 	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has | ||||
| 	 * traditionally returned zero for zero segments, so... | ||||
| 	 */ | ||||
| 	ret = 0; | ||||
| 	if (nr_segs == 0) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * First get the "struct iovec" from user memory and | ||||
| 	 * verify all the pointers | ||||
| 	 */ | ||||
| 	ret = -EINVAL; | ||||
| 	if (nr_segs > UIO_MAXIOV) | ||||
| 		goto out; | ||||
| 	if (!file->f_op) | ||||
| 		goto out; | ||||
| 	if (nr_segs > UIO_FASTIOV) { | ||||
| 		ret = -ENOMEM; | ||||
| 		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); | ||||
| 		if (!iov) | ||||
| 			goto out; | ||||
| 	} | ||||
| 	ret = -EFAULT; | ||||
| 	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Single unix specification: | ||||
| 	 * We should -EINVAL if an element length is not >= 0 and fitting an | ||||
| 	 * ssize_t.  The total length is fitting an ssize_t | ||||
| 	 * | ||||
| 	 * Be careful here because iov_len is a size_t not an ssize_t | ||||
| 	 */ | ||||
| 	tot_len = 0; | ||||
| 	ret = -EINVAL; | ||||
| 	for (seg = 0; seg < nr_segs; seg++) { | ||||
| 		void __user *buf = iov[seg].iov_base; | ||||
| 		ssize_t len = (ssize_t)iov[seg].iov_len; | ||||
| 
 | ||||
| 		if (len < 0)	/* size_t not fitting an ssize_t .. */ | ||||
| 			goto out; | ||||
| 		if (unlikely(!access_ok(vrfy_dir(type), buf, len))) | ||||
| 			goto Efault; | ||||
| 		tot_len += len; | ||||
| 		if ((ssize_t)tot_len < 0) /* maths overflow on the ssize_t */ | ||||
| 			goto out; | ||||
| 	} | ||||
| 	if (tot_len == 0) { | ||||
| 		ret = 0; | ||||
| 	if (!file->f_op) { | ||||
| 		ret = -EINVAL; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = rw_copy_check_uvector(type, uvector, nr_segs, | ||||
| 			ARRAY_SIZE(iovstack), iovstack, &iov); | ||||
| 	if (ret <= 0) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	tot_len = ret; | ||||
| 	ret = rw_verify_area(type, file, pos, tot_len); | ||||
| 	if (ret < 0) | ||||
| 		goto out; | ||||
|  | @ -609,9 +633,6 @@ out: | |||
| 			fsnotify_modify(file->f_dentry); | ||||
| 	} | ||||
| 	return ret; | ||||
| Efault: | ||||
| 	ret = -EFAULT; | ||||
| 	goto out; | ||||
| } | ||||
| 
 | ||||
| ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, | ||||
|  |  | |||
|  | @ -7,6 +7,7 @@ | |||
| #include <linux/uio.h> | ||||
| 
 | ||||
| #include <asm/atomic.h> | ||||
| #include <linux/uio.h> | ||||
| 
 | ||||
| #define AIO_MAXSEGS		4 | ||||
| #define AIO_KIOGRP_NR_ATOMIC	8 | ||||
|  | @ -114,6 +115,9 @@ struct kiocb { | |||
| 	long			ki_kicked; 	/* just for testing */ | ||||
| 	long			ki_queued; 	/* just for testing */ | ||||
| 	struct iovec		ki_inline_vec;	/* inline vector */ | ||||
|  	struct iovec		*ki_iovec; | ||||
|  	unsigned long		ki_nr_segs; | ||||
|  	unsigned long		ki_cur_seg; | ||||
| 
 | ||||
| 	struct list_head	ki_list;	/* the aio core uses this
 | ||||
| 						 * for cancellation */ | ||||
|  |  | |||
|  | @ -41,6 +41,8 @@ enum { | |||
| 	 * IOCB_CMD_POLL = 5, | ||||
| 	 */ | ||||
| 	IOCB_CMD_NOOP = 6, | ||||
| 	IOCB_CMD_PREADV = 7, | ||||
| 	IOCB_CMD_PWRITEV = 8, | ||||
| }; | ||||
| 
 | ||||
| /* read() from /dev/aio returns these structures. */ | ||||
|  |  | |||
|  | @ -1150,6 +1150,11 @@ struct inode_operations { | |||
| 
 | ||||
| struct seq_file; | ||||
| 
 | ||||
| ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, | ||||
| 				unsigned long nr_segs, unsigned long fast_segs, | ||||
| 				struct iovec *fast_pointer, | ||||
| 				struct iovec **ret_pointer); | ||||
| 
 | ||||
| extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); | ||||
| extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); | ||||
| extern ssize_t vfs_readv(struct file *, const struct iovec __user *, | ||||
|  |  | |||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 Badari Pulavarty
						Badari Pulavarty