linux/drivers/gpu/drm/i915/gt/intel_gt_requests.c

247 lines
5.9 KiB
C
Raw Normal View History

/*
* SPDX-License-Identifier: MIT
*
* Copyright © 2019 Intel Corporation
*/
drm/i915/gt: Schedule request retirement when timeline idles The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA") is that it disables RC6 while Skylake (and friends) is active, and we do not consider the GPU idle until all outstanding requests have been retired and the engine switched over to the kernel context. If userspace is idle, this task falls onto our background idle worker, which only runs roughly once a second, meaning that userspace has to have been idle for a couple of seconds before we enable RC6 again. Naturally, this causes us to consume considerably more energy than before as powersaving is effectively disabled while a display server (here's looking at you Xorg) is running. As execlists will get a completion event as each context is completed, we can use this interrupt to queue a retire worker bound to this engine to cleanup idle timelines. We will then immediately notice the idle engine (without userspace intervention or the aid of the background retire worker) and start parking the GPU. Thus during light workloads, we will do much more work to idle the GPU faster... Hopefully with commensurate power saving! v2: Watch context completions and only look at those local to the engine when retiring to reduce the amount of excess work we perform. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=112315 References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA") References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191125105858.1718307-3-chris@chris-wilson.co.uk
2019-11-25 10:58:58 +00:00
#include <linux/workqueue.h>
#include "i915_drv.h" /* for_each_engine() */
#include "i915_request.h"
#include "intel_engine_heartbeat.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_timeline.h"
static bool retire_requests(struct intel_timeline *tl)
{
struct i915_request *rq, *rn;
list_for_each_entry_safe(rq, rn, &tl->requests, link)
if (!i915_request_retire(rq))
return false;
/* And check nothing new was submitted */
return !i915_active_fence_isset(&tl->last_request);
}
static bool engine_active(const struct intel_engine_cs *engine)
{
return !list_empty(&engine->kernel_context->timeline->requests);
}
static bool flush_submission(struct intel_gt *gt, long timeout)
{
struct intel_engine_cs *engine;
enum intel_engine_id id;
bool active = false;
if (!timeout)
return false;
if (!intel_gt_pm_is_awake(gt))
return false;
for_each_engine(engine, gt, id) {
intel_engine_flush_submission(engine);
/* Flush the background retirement and idle barriers */
flush_work(&engine->retire_work);
flush_delayed_work(&engine->wakeref.work);
/* Is the idle barrier still outstanding? */
active |= engine_active(engine);
}
return active;
}
drm/i915/gt: Schedule request retirement when timeline idles The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA") is that it disables RC6 while Skylake (and friends) is active, and we do not consider the GPU idle until all outstanding requests have been retired and the engine switched over to the kernel context. If userspace is idle, this task falls onto our background idle worker, which only runs roughly once a second, meaning that userspace has to have been idle for a couple of seconds before we enable RC6 again. Naturally, this causes us to consume considerably more energy than before as powersaving is effectively disabled while a display server (here's looking at you Xorg) is running. As execlists will get a completion event as each context is completed, we can use this interrupt to queue a retire worker bound to this engine to cleanup idle timelines. We will then immediately notice the idle engine (without userspace intervention or the aid of the background retire worker) and start parking the GPU. Thus during light workloads, we will do much more work to idle the GPU faster... Hopefully with commensurate power saving! v2: Watch context completions and only look at those local to the engine when retiring to reduce the amount of excess work we perform. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=112315 References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA") References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191125105858.1718307-3-chris@chris-wilson.co.uk
2019-11-25 10:58:58 +00:00
static void engine_retire(struct work_struct *work)
{
struct intel_engine_cs *engine =
container_of(work, typeof(*engine), retire_work);
struct intel_timeline *tl = xchg(&engine->retire, NULL);
do {
struct intel_timeline *next = xchg(&tl->retire, NULL);
/*
* Our goal here is to retire _idle_ timelines as soon as
* possible (as they are idle, we do not expect userspace
* to be cleaning up anytime soon).
*
* If the timeline is currently locked, either it is being
* retired elsewhere or about to be!
*/
if (mutex_trylock(&tl->mutex)) {
retire_requests(tl);
mutex_unlock(&tl->mutex);
}
intel_timeline_put(tl);
GEM_BUG_ON(!next);
tl = ptr_mask_bits(next, 1);
} while (tl);
}
static bool add_retire(struct intel_engine_cs *engine,
struct intel_timeline *tl)
{
#define STUB ((struct intel_timeline *)1)
drm/i915/gt: Schedule request retirement when timeline idles The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA") is that it disables RC6 while Skylake (and friends) is active, and we do not consider the GPU idle until all outstanding requests have been retired and the engine switched over to the kernel context. If userspace is idle, this task falls onto our background idle worker, which only runs roughly once a second, meaning that userspace has to have been idle for a couple of seconds before we enable RC6 again. Naturally, this causes us to consume considerably more energy than before as powersaving is effectively disabled while a display server (here's looking at you Xorg) is running. As execlists will get a completion event as each context is completed, we can use this interrupt to queue a retire worker bound to this engine to cleanup idle timelines. We will then immediately notice the idle engine (without userspace intervention or the aid of the background retire worker) and start parking the GPU. Thus during light workloads, we will do much more work to idle the GPU faster... Hopefully with commensurate power saving! v2: Watch context completions and only look at those local to the engine when retiring to reduce the amount of excess work we perform. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=112315 References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA") References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191125105858.1718307-3-chris@chris-wilson.co.uk
2019-11-25 10:58:58 +00:00
struct intel_timeline *first;
/*
* We open-code a llist here to include the additional tag [BIT(0)]
* so that we know when the timeline is already on a
* retirement queue: either this engine or another.
*/
if (cmpxchg(&tl->retire, NULL, STUB)) /* already queued */
drm/i915/gt: Schedule request retirement when timeline idles The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA") is that it disables RC6 while Skylake (and friends) is active, and we do not consider the GPU idle until all outstanding requests have been retired and the engine switched over to the kernel context. If userspace is idle, this task falls onto our background idle worker, which only runs roughly once a second, meaning that userspace has to have been idle for a couple of seconds before we enable RC6 again. Naturally, this causes us to consume considerably more energy than before as powersaving is effectively disabled while a display server (here's looking at you Xorg) is running. As execlists will get a completion event as each context is completed, we can use this interrupt to queue a retire worker bound to this engine to cleanup idle timelines. We will then immediately notice the idle engine (without userspace intervention or the aid of the background retire worker) and start parking the GPU. Thus during light workloads, we will do much more work to idle the GPU faster... Hopefully with commensurate power saving! v2: Watch context completions and only look at those local to the engine when retiring to reduce the amount of excess work we perform. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=112315 References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA") References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191125105858.1718307-3-chris@chris-wilson.co.uk
2019-11-25 10:58:58 +00:00
return false;
intel_timeline_get(tl);
first = READ_ONCE(engine->retire);
do
tl->retire = ptr_pack_bits(first, 1, 1);
while (!try_cmpxchg(&engine->retire, &first, tl));
return !first;
}
void intel_engine_add_retire(struct intel_engine_cs *engine,
struct intel_timeline *tl)
{
/* We don't deal well with the engine disappearing beneath us */
GEM_BUG_ON(intel_engine_is_virtual(engine));
drm/i915/gt: Schedule request retirement when timeline idles The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA") is that it disables RC6 while Skylake (and friends) is active, and we do not consider the GPU idle until all outstanding requests have been retired and the engine switched over to the kernel context. If userspace is idle, this task falls onto our background idle worker, which only runs roughly once a second, meaning that userspace has to have been idle for a couple of seconds before we enable RC6 again. Naturally, this causes us to consume considerably more energy than before as powersaving is effectively disabled while a display server (here's looking at you Xorg) is running. As execlists will get a completion event as each context is completed, we can use this interrupt to queue a retire worker bound to this engine to cleanup idle timelines. We will then immediately notice the idle engine (without userspace intervention or the aid of the background retire worker) and start parking the GPU. Thus during light workloads, we will do much more work to idle the GPU faster... Hopefully with commensurate power saving! v2: Watch context completions and only look at those local to the engine when retiring to reduce the amount of excess work we perform. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=112315 References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA") References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191125105858.1718307-3-chris@chris-wilson.co.uk
2019-11-25 10:58:58 +00:00
if (add_retire(engine, tl))
schedule_work(&engine->retire_work);
}
void intel_engine_init_retire(struct intel_engine_cs *engine)
{
INIT_WORK(&engine->retire_work, engine_retire);
}
void intel_engine_fini_retire(struct intel_engine_cs *engine)
{
flush_work(&engine->retire_work);
GEM_BUG_ON(engine->retire);
}
long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
{
struct intel_gt_timelines *timelines = &gt->timelines;
struct intel_timeline *tl, *tn;
Revert "drm/i915/gt: Wait for new requests in intel_gt_retire_requests()" From inside an active timeline in the execbuf ioctl, we may try to reclaim some space in the GGTT. We need GGTT space for all objects on !full-ppgtt platforms, and for context images everywhere. However, to free up space in the GGTT we may need to remove some pinned objects (e.g. context images) that require flushing the idle barriers to remove. For this we use the big hammer of intel_gt_wait_for_idle() However, commit 7936a22dd466 ("drm/i915/gt: Wait for new requests in intel_gt_retire_requests()") will continue spinning on the wait if a timeline is active but lacks requests, as is the case during execbuf reservation. Spinning forever is quite time consuming, so revert that commit and start again. In practice, the effect commit 7936a22dd466 was trying to achieve is accomplished by commit 1683d24c1470 ("drm/i915/gt: Move new timelines to the end of active_list"), so there is no immediate rush to replace the looping. Testcase: igt/gem_exec_reloc/basic-range Fixes: 7936a22dd466 ("drm/i915/gt: Wait for new requests in intel_gt_retire_requests()") References: 1683d24c1470 ("drm/i915/gt: Move new timelines to the end of active_list") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191121071044.97798-1-chris@chris-wilson.co.uk
2019-11-21 07:10:40 +00:00
unsigned long active_count = 0;
LIST_HEAD(free);
flush_submission(gt, timeout); /* kick the ksoftirqd tasklets */
spin_lock(&timelines->lock);
list_for_each_entry_safe(tl, tn, &timelines->active_list, link) {
if (!mutex_trylock(&tl->mutex)) {
active_count++; /* report busy to caller, try again? */
continue;
}
intel_timeline_get(tl);
drm/i915/gt: Close race between engine_park and intel_gt_retire_requests The general concept was that intel_timeline.active_count was locked by the intel_timeline.mutex. The exception was for power management, where the engine->kernel_context->timeline could be manipulated under the global wakeref.mutex. This was quite solid, as we always manipulated the timeline only while we held an engine wakeref. And then we started retiring requests outside of struct_mutex, only using the timelines.active_list and the timeline->mutex. There we started manipulating intel_timeline.active_count outside of an engine wakeref, and so introduced a race between __engine_park() and intel_gt_retire_requests(), a race that could result in the engine->kernel_context not being added to the active timelines and so losing requests, which caused us to keep the system permanently powered up [and unloadable]. The race would be easy to close if we could take the engine wakeref for the timeline before we retire -- except timelines are not bound to any engine and so we would need to keep all active engines awake. The alternative is to guard intel_timeline_enter/intel_timeline_exit for use outside of the timeline->mutex. Fixes: e5dadff4b093 ("drm/i915: Protect request retirement with timeline->mutex") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Matthew Auld <matthew.auld@intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191120165514.3955081-1-chris@chris-wilson.co.uk
2019-11-20 16:55:13 +00:00
GEM_BUG_ON(!atomic_read(&tl->active_count));
atomic_inc(&tl->active_count); /* pin the list element */
spin_unlock(&timelines->lock);
if (timeout > 0) {
struct dma_fence *fence;
fence = i915_active_fence_get(&tl->last_request);
if (fence) {
mutex_unlock(&tl->mutex);
timeout = dma_fence_wait_timeout(fence,
true,
timeout);
dma_fence_put(fence);
/* Retirement is best effort */
if (!mutex_trylock(&tl->mutex)) {
active_count++;
goto out_active;
}
}
}
if (!retire_requests(tl))
active_count++;
mutex_unlock(&tl->mutex);
out_active: spin_lock(&timelines->lock);
/* Resume list iteration after reacquiring spinlock */
list_safe_reset_next(tl, tn, link);
if (atomic_dec_and_test(&tl->active_count))
list_del(&tl->link);
/* Defer the final release to after the spinlock */
if (refcount_dec_and_test(&tl->kref.refcount)) {
drm/i915/gt: Close race between engine_park and intel_gt_retire_requests The general concept was that intel_timeline.active_count was locked by the intel_timeline.mutex. The exception was for power management, where the engine->kernel_context->timeline could be manipulated under the global wakeref.mutex. This was quite solid, as we always manipulated the timeline only while we held an engine wakeref. And then we started retiring requests outside of struct_mutex, only using the timelines.active_list and the timeline->mutex. There we started manipulating intel_timeline.active_count outside of an engine wakeref, and so introduced a race between __engine_park() and intel_gt_retire_requests(), a race that could result in the engine->kernel_context not being added to the active timelines and so losing requests, which caused us to keep the system permanently powered up [and unloadable]. The race would be easy to close if we could take the engine wakeref for the timeline before we retire -- except timelines are not bound to any engine and so we would need to keep all active engines awake. The alternative is to guard intel_timeline_enter/intel_timeline_exit for use outside of the timeline->mutex. Fixes: e5dadff4b093 ("drm/i915: Protect request retirement with timeline->mutex") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Matthew Auld <matthew.auld@intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191120165514.3955081-1-chris@chris-wilson.co.uk
2019-11-20 16:55:13 +00:00
GEM_BUG_ON(atomic_read(&tl->active_count));
list_add(&tl->link, &free);
}
}
spin_unlock(&timelines->lock);
list_for_each_entry_safe(tl, tn, &free, link)
__intel_timeline_free(&tl->kref);
if (flush_submission(gt, timeout)) /* Wait, there's more! */
active_count++;
Revert "drm/i915/gt: Wait for new requests in intel_gt_retire_requests()" From inside an active timeline in the execbuf ioctl, we may try to reclaim some space in the GGTT. We need GGTT space for all objects on !full-ppgtt platforms, and for context images everywhere. However, to free up space in the GGTT we may need to remove some pinned objects (e.g. context images) that require flushing the idle barriers to remove. For this we use the big hammer of intel_gt_wait_for_idle() However, commit 7936a22dd466 ("drm/i915/gt: Wait for new requests in intel_gt_retire_requests()") will continue spinning on the wait if a timeline is active but lacks requests, as is the case during execbuf reservation. Spinning forever is quite time consuming, so revert that commit and start again. In practice, the effect commit 7936a22dd466 was trying to achieve is accomplished by commit 1683d24c1470 ("drm/i915/gt: Move new timelines to the end of active_list"), so there is no immediate rush to replace the looping. Testcase: igt/gem_exec_reloc/basic-range Fixes: 7936a22dd466 ("drm/i915/gt: Wait for new requests in intel_gt_retire_requests()") References: 1683d24c1470 ("drm/i915/gt: Move new timelines to the end of active_list") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191121071044.97798-1-chris@chris-wilson.co.uk
2019-11-21 07:10:40 +00:00
return active_count ? timeout : 0;
}
int intel_gt_wait_for_idle(struct intel_gt *gt, long timeout)
{
/* If the device is asleep, we have no requests outstanding */
if (!intel_gt_pm_is_awake(gt))
return 0;
while ((timeout = intel_gt_retire_requests_timeout(gt, timeout)) > 0) {
cond_resched();
if (signal_pending(current))
return -EINTR;
}
return timeout;
}
static void retire_work_handler(struct work_struct *work)
{
struct intel_gt *gt =
container_of(work, typeof(*gt), requests.retire_work.work);
schedule_delayed_work(&gt->requests.retire_work,
round_jiffies_up_relative(HZ));
intel_gt_retire_requests(gt);
}
void intel_gt_init_requests(struct intel_gt *gt)
{
INIT_DELAYED_WORK(&gt->requests.retire_work, retire_work_handler);
}
void intel_gt_park_requests(struct intel_gt *gt)
{
cancel_delayed_work(&gt->requests.retire_work);
}
void intel_gt_unpark_requests(struct intel_gt *gt)
{
schedule_delayed_work(&gt->requests.retire_work,
round_jiffies_up_relative(HZ));
}
void intel_gt_fini_requests(struct intel_gt *gt)
{
/* Wait until the work is marked as finished before unloading! */
cancel_delayed_work_sync(&gt->requests.retire_work);
}