mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git (synced 2025-10-31 08:44:41 +00:00)

	block: Limit number of items taken from the I/O scheduler in one go
Flushes bypass the I/O scheduler and get added to hctx->dispatch in
blk_mq_sched_bypass_insert. This can happen while a kworker is running
the hctx->run_work work item and is past the point in
blk_mq_sched_dispatch_requests where hctx->dispatch is checked.

The blk_mq_do_dispatch_sched call is not guaranteed to end in bounded
time, because the I/O scheduler can feed an arbitrary number of commands.

Since we have only one hctx->run_work, the commands waiting in
hctx->dispatch will wait an arbitrary length of time for run_work to be
rerun.

A similar phenomenon exists with dispatches from the software queue.

The solution is to poll hctx->dispatch in blk_mq_do_dispatch_sched and
blk_mq_do_dispatch_ctx and, if it is non-empty, return from the run_work
handler and let it rerun.

Signed-off-by: Salman Qazi <sqazi@google.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
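The shape of the fix is easier to see outside the kernel. Below is a minimal, hypothetical userspace C sketch of the pattern this patch adopts, not kernel code: sched_queue, bypass_list, dispatch_one() and run_work() are stand-ins invented for this example, and the inline retry loop stands in for the asynchronous rerun the kernel gets via blk_mq_run_hw_queue().

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the two queues involved: sched_queue plays the role of
 * the I/O scheduler's internal queue, bypass_list plays the role of
 * hctx->dispatch, where flushes land without going through the scheduler. */
static int sched_queue;
static int bypass_list;

/* Hand one request to the driver; a flush "arrives" partway through to
 * simulate a bypass insert happening concurrently with the dispatch loop. */
static bool dispatch_one(void)
{
	if (sched_queue <= 0)
		return false;
	sched_queue--;
	if (sched_queue == 2)
		bypass_list++;
	return true;
}

/* The patched loop shape of blk_mq_do_dispatch_sched(): poll the bypass
 * list on every iteration and bail out with -EAGAIN instead of letting
 * the scheduler feed requests indefinitely. */
static int do_dispatch_sched(void)
{
	do {
		if (bypass_list > 0)
			return -EAGAIN;
	} while (dispatch_one());
	return 0;
}

/* Serve the bypass list first, then the scheduler; on -EAGAIN the kernel
 * requeues run_work asynchronously, which we approximate with an inline
 * retry loop for brevity. */
static void run_work(void)
{
	int ret;

	do {
		while (bypass_list > 0) {
			bypass_list--;
			printf("served bypass (flush) request\n");
		}
		ret = do_dispatch_sched();
	} while (ret == -EAGAIN);
}

int main(void)
{
	sched_queue = 5;	/* scheduler has plenty of work queued */
	run_work();
	return 0;
}

Compiled with any C compiler, this prints one served-bypass line: the flush that arrives mid-dispatch is handled on the rerun instead of waiting for the scheduler queue to drain, which is exactly the starvation the patch eliminates.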
commit 28d65729b0
parent 895d47759b

1 changed file with 51 additions and 13 deletions
block/blk-mq-sched.c

@@ -86,12 +86,16 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  * its queue by itself in its completion handler, so we don't need to
  * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * be run again.  This is necessary to avoid starving flushes.
  */
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	LIST_HEAD(rq_list);
+	int ret = 0;
 
 	do {
 		struct request *rq;
@@ -99,6 +103,11 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
 			break;
 
+		if (!list_empty_careful(&hctx->dispatch)) {
+			ret = -EAGAIN;
+			break;
+		}
+
 		if (!blk_mq_get_dispatch_budget(hctx))
 			break;
 
@@ -123,6 +132,8 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		 */
 		list_add(&rq->queuelist, &rq_list);
 	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+	return ret;
 }
 
 static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -140,16 +151,25 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  * its queue by itself in its completion handler, so we don't need to
  * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * be run again.  This is necessary to avoid starving flushes.
  */
-static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	LIST_HEAD(rq_list);
 	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+	int ret = 0;
 
 	do {
 		struct request *rq;
 
+		if (!list_empty_careful(&hctx->dispatch)) {
+			ret = -EAGAIN;
+			break;
+		}
+
 		if (!sbitmap_any_bit_set(&hctx->ctx_map))
 			break;
 
@@ -183,21 +203,17 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
 
 	WRITE_ONCE(hctx->dispatch_from, ctx);
+	return ret;
 }
 
-void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
+	int ret = 0;
 	LIST_HEAD(rq_list);
 
-	/* RCU or SRCU read lock is needed before checking quiesced flag */
-	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
-		return;
-
-	hctx->run++;
-
 	/*
 	 * If we have previous entries on our dispatch list, grab them first for
 	 * more fair dispatch.
@@ -226,19 +242,41 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 		blk_mq_sched_mark_restart_hctx(hctx);
 		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
 			if (has_sched_dispatch)
-				blk_mq_do_dispatch_sched(hctx);
+				ret = blk_mq_do_dispatch_sched(hctx);
 			else
-				blk_mq_do_dispatch_ctx(hctx);
+				ret = blk_mq_do_dispatch_ctx(hctx);
 		}
 	} else if (has_sched_dispatch) {
-		blk_mq_do_dispatch_sched(hctx);
+		ret = blk_mq_do_dispatch_sched(hctx);
 	} else if (hctx->dispatch_busy) {
 		/* dequeue request one by one from sw queue if queue is busy */
-		blk_mq_do_dispatch_ctx(hctx);
+		ret = blk_mq_do_dispatch_ctx(hctx);
 	} else {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
 		blk_mq_dispatch_rq_list(q, &rq_list, false);
 	}
+
+	return ret;
+}
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+{
+	struct request_queue *q = hctx->queue;
+
+	/* RCU or SRCU read lock is needed before checking quiesced flag */
+	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
+		return;
+
+	hctx->run++;
+
+	/*
+	 * A return of -EAGAIN is an indication that hctx->dispatch is not
+	 * empty and we must run again in order to avoid starving flushes.
+	 */
+	if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
+		if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
+			blk_mq_run_hw_queue(hctx, true);
+	}
+}
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
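One design choice worth noting in the new blk_mq_sched_dispatch_requests() wrapper above: on the first -EAGAIN it simply retries the dispatch pass inline, and only if the second pass also finds hctx->dispatch non-empty does it hand off to blk_mq_run_hw_queue(hctx, true) for an asynchronous rerun. Each run_work invocation therefore does a bounded amount of work, while a dispatch list that empties quickly does not trigger a needless requeue of the work item.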