linux/drivers/gpu/drm/etnaviv/etnaviv_sched.c

160 lines
4.3 KiB
C
Raw Permalink Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2017 Etnaviv Project
*/
#include <linux/moduleparam.h>
#include "etnaviv_drv.h"
#include "etnaviv_dump.h"
#include "etnaviv_gem.h"
#include "etnaviv_gpu.h"
#include "etnaviv_sched.h"
#include "state.xml.h"
#include "state_hi.xml.h"
static int etnaviv_job_hang_limit = 0;
module_param_named(job_hang_limit, etnaviv_job_hang_limit, int , 0444);
static int etnaviv_hw_jobs_limit = 4;
module_param_named(hw_job_limit, etnaviv_hw_jobs_limit, int , 0444);
static struct dma_fence *etnaviv_sched_run_job(struct drm_sched_job *sched_job)
{
struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
struct dma_fence *fence = NULL;
if (likely(!sched_job->s_fence->finished.error))
fence = etnaviv_gpu_submit(submit);
else
dev_dbg(submit->gpu->dev, "skipping bad job\n");
return fence;
}
drm/scheduler: Job timeout handler returns status (v3) This patch does not change current behaviour. The driver's job timeout handler now returns status indicating back to the DRM layer whether the device (GPU) is no longer available, such as after it's been unplugged, or whether all is normal, i.e. current behaviour. All drivers which make use of the drm_sched_backend_ops' .timedout_job() callback have been accordingly renamed and return the would've-been default value of DRM_GPU_SCHED_STAT_NOMINAL to restart the task's timeout timer--this is the old behaviour, and is preserved by this patch. v2: Use enum as the status of a driver's job timeout callback method. v3: Return scheduler/device information, rather than task information. Cc: Alexander Deucher <Alexander.Deucher@amd.com> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com> Cc: Christian König <christian.koenig@amd.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: Lucas Stach <l.stach@pengutronix.de> Cc: Russell King <linux+etnaviv@armlinux.org.uk> Cc: Christian Gmeiner <christian.gmeiner@gmail.com> Cc: Qiang Yu <yuq825@gmail.com> Cc: Rob Herring <robh@kernel.org> Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com> Cc: Steven Price <steven.price@arm.com> Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> Cc: Eric Anholt <eric@anholt.net> Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com> Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> Acked-by: Christian König <christian.koenig@amd.com> Acked-by: Steven Price <steven.price@arm.com> Signed-off-by: Christian König <christian.koenig@amd.com> Link: https://patchwork.freedesktop.org/patch/415095/
2021-01-20 15:09:59 -05:00
static enum drm_gpu_sched_stat etnaviv_sched_timedout_job(struct drm_sched_job
*sched_job)
{
struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
struct etnaviv_gpu *gpu = submit->gpu;
u32 dma_addr, primid = 0;
int change;
/*
* If the GPU managed to complete this jobs fence, the timeout has
* fired before free-job worker. The timeout is spurious, so bail out.
*/
if (dma_fence_is_signaled(submit->out_fence))
return DRM_GPU_SCHED_STAT_NO_HANG;
/*
* If the GPU is still making forward progress on the front-end (which
* should never loop) we shift out the timeout to give it a chance to
* finish the job.
*/
dma_addr = gpu_read(gpu, VIVS_FE_DMA_ADDRESS);
change = dma_addr - gpu->hangcheck_dma_addr;
if (submit->exec_state == ETNA_PIPE_3D) {
/* guard against concurrent usage from perfmon_sample */
mutex_lock(&gpu->lock);
gpu_write(gpu, VIVS_MC_PROFILE_CONFIG0,
VIVS_MC_PROFILE_CONFIG0_FE_CURRENT_PRIM <<
VIVS_MC_PROFILE_CONFIG0_FE__SHIFT);
primid = gpu_read(gpu, VIVS_MC_PROFILE_FE_READ);
mutex_unlock(&gpu->lock);
}
if (gpu->state == ETNA_GPU_STATE_RUNNING &&
(gpu->completed_fence != gpu->hangcheck_fence ||
change < 0 || change > 16 ||
(submit->exec_state == ETNA_PIPE_3D &&
gpu->hangcheck_primid != primid))) {
gpu->hangcheck_dma_addr = dma_addr;
gpu->hangcheck_primid = primid;
gpu->hangcheck_fence = gpu->completed_fence;
return DRM_GPU_SCHED_STAT_NO_HANG;
}
/* block scheduler */
drm_sched_stop(&gpu->sched, sched_job);
if(sched_job)
drm_sched_increase_karma(sched_job);
/* get the GPU back into the init state */
etnaviv_core_dump(submit);
etnaviv_gpu_recover_hang(submit);
drm_sched_resubmit_jobs(&gpu->sched);
drm/sched: add optional errno to drm_sched_start() The current implementation of drm_sched_start uses a hardcoded -ECANCELED to dispose of a job when the parent/hw fence is NULL. This results in drm_sched_job_done being called with -ECANCELED for each job with a NULL parent in the pending list, making it difficult to distinguish between recovery methods, whether a queue reset or a full GPU reset was used. To improve this, we first try a soft recovery for timeout jobs and use the error code -ENODATA. If soft recovery fails, we proceed with a queue reset, where the error code remains -ENODATA for the job. Finally, for a full GPU reset, we use error codes -ECANCELED or -ETIME. This patch adds an error code parameter to drm_sched_start, allowing us to differentiate between queue reset and GPU reset failures. This enables user mode and test applications to validate the expected correctness of the requested operation. After a successful queue reset, the only way to continue normal operation is to call drm_sched_job_done with the specific error code -ENODATA. v1: Initial implementation by Jesse utilized amdgpu_device_lock_reset_domain and amdgpu_device_unlock_reset_domain to allow user mode to track the queue reset status and distinguish between queue reset and GPU reset. v2: Christian suggested using the error codes -ENODATA for queue reset and -ECANCELED or -ETIME for GPU reset, returned to amdgpu_cs_wait_ioctl. v3: To meet the requirements, we introduce a new function drm_sched_start_ex with an additional parameter to set dma_fence_set_error, allowing us to handle the specific error codes appropriately and dispose of bad jobs with the selected error code depending on whether it was a queue reset or GPU reset. v4: Alex suggested using a new name, drm_sched_start_with_recovery_error, which more accurately describes the function's purpose. Additionally, it was recommended to add documentation details about the new method. v5: Fixed declaration of new function drm_sched_start_with_recovery_error.(Alex) v6 (chk): rebase on upstream changes, cleanup the commit message, drop the new function again and update all callers, apply the errno also to scheduler fences with hw fences v7 (chk): rebased Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com> Signed-off-by: Vitaly Prosyak <vitaly.prosyak@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20240826122541.85663-1-christian.koenig@amd.com
2024-08-26 14:25:38 +02:00
drm_sched_start(&gpu->sched, 0);
return DRM_GPU_SCHED_STAT_RESET;
}
static void etnaviv_sched_free_job(struct drm_sched_job *sched_job)
{
struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
drm_sched_job_cleanup(sched_job);
etnaviv_submit_put(submit);
}
static const struct drm_sched_backend_ops etnaviv_sched_ops = {
.run_job = etnaviv_sched_run_job,
.timedout_job = etnaviv_sched_timedout_job,
.free_job = etnaviv_sched_free_job,
};
int etnaviv_sched_push_job(struct etnaviv_gem_submit *submit)
{
struct etnaviv_gpu *gpu = submit->gpu;
int ret;
/*
* Hold the sched lock across the whole operation to avoid jobs being
* pushed out of order with regard to their sched fence seqnos as
* allocated in drm_sched_job_arm.
*/
mutex_lock(&gpu->sched_lock);
drm/sched: Split drm_sched_job_init This is a very confusingly named function, because not just does it init an object, it arms it and provides a point of no return for pushing a job into the scheduler. It would be nice if that's a bit clearer in the interface. But the real reason is that I want to push the dependency tracking helpers into the scheduler code, and that means drm_sched_job_init must be called a lot earlier, without arming the job. v2: - don't change .gitignore (Steven) - don't forget v3d (Emma) v3: Emma noticed that I leak the memory allocated in drm_sched_job_init if we bail out before the point of no return in subsequent driver patches. To be able to fix this change drm_sched_job_cleanup() so it can handle being called both before and after drm_sched_job_arm(). Also improve the kerneldoc for this. v4: - Fix the drm_sched_job_cleanup logic, I inverted the booleans, as usual (Melissa) - Christian pointed out that drm_sched_entity_select_rq() also needs to be moved into drm_sched_job_arm, which made me realize that the job->id definitely needs to be moved too. Shuffle things to fit between job_init and job_arm. v5: Reshuffle the split between init/arm once more, amdgpu abuses drm_sched.ready to signal gpu reset failures. Also document this somewhat. (Christian) v6: Rebase on top of the msm drm/sched support. Note that the drm_sched_job_init() call is completely misplaced, and hence also the split-out drm_sched_entity_push_job(). I've put in a FIXME which the next patch will address. v7: Drop the FIXME in msm, after discussions with Rob I agree it shouldn't be a problem where it is now. Acked-by: Christian König <christian.koenig@amd.com> Acked-by: Melissa Wen <mwen@igalia.com> Cc: Melissa Wen <melissa.srw@gmail.com> Acked-by: Emma Anholt <emma@anholt.net> Acked-by: Steven Price <steven.price@arm.com> (v2) Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com> (v5) Signed-off-by: Daniel Vetter <daniel.vetter@intel.com> Cc: Lucas Stach <l.stach@pengutronix.de> Cc: Russell King <linux+etnaviv@armlinux.org.uk> Cc: Christian Gmeiner <christian.gmeiner@gmail.com> Cc: Qiang Yu <yuq825@gmail.com> Cc: Rob Herring <robh@kernel.org> Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com> Cc: Steven Price <steven.price@arm.com> Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> Cc: David Airlie <airlied@linux.ie> Cc: Daniel Vetter <daniel@ffwll.ch> Cc: Sumit Semwal <sumit.semwal@linaro.org> Cc: "Christian König" <christian.koenig@amd.com> Cc: Masahiro Yamada <masahiroy@kernel.org> Cc: Kees Cook <keescook@chromium.org> Cc: Adam Borowski <kilobyte@angband.pl> Cc: Nick Terrell <terrelln@fb.com> Cc: Mauro Carvalho Chehab <mchehab+huawei@kernel.org> Cc: Paul Menzel <pmenzel@molgen.mpg.de> Cc: Sami Tolvanen <samitolvanen@google.com> Cc: Viresh Kumar <viresh.kumar@linaro.org> Cc: Alex Deucher <alexander.deucher@amd.com> Cc: Dave Airlie <airlied@redhat.com> Cc: Nirmoy Das <nirmoy.das@amd.com> Cc: Deepak R Varma <mh12gx2825@gmail.com> Cc: Lee Jones <lee.jones@linaro.org> Cc: Kevin Wang <kevin1.wang@amd.com> Cc: Chen Li <chenli@uniontech.com> Cc: Luben Tuikov <luben.tuikov@amd.com> Cc: "Marek Olšák" <marek.olsak@amd.com> Cc: Dennis Li <Dennis.Li@amd.com> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com> Cc: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Cc: Sonny Jiang <sonny.jiang@amd.com> Cc: Boris Brezillon <boris.brezillon@collabora.com> Cc: Tian Tao <tiantao6@hisilicon.com> Cc: etnaviv@lists.freedesktop.org Cc: lima@lists.freedesktop.org Cc: linux-media@vger.kernel.org Cc: linaro-mm-sig@lists.linaro.org Cc: Emma Anholt <emma@anholt.net> Cc: Rob Clark <robdclark@gmail.com> Cc: Sean Paul <sean@poorly.run> Cc: linux-arm-msm@vger.kernel.org Cc: freedreno@lists.freedesktop.org Link: https://patchwork.freedesktop.org/patch/msgid/20210817084917.3555822-1-daniel.vetter@ffwll.ch
2021-08-17 10:49:16 +02:00
drm_sched_job_arm(&submit->sched_job);
submit->out_fence = dma_fence_get(&submit->sched_job.s_fence->finished);
ret = xa_alloc_cyclic(&gpu->user_fences, &submit->out_fence_id,
submit->out_fence, xa_limit_32b,
&gpu->next_user_fence, GFP_KERNEL);
if (ret < 0) {
drm_sched_job_cleanup(&submit->sched_job);
goto out_unlock;
}
/* the scheduler holds on to the job now */
kref_get(&submit->refcount);
drm/sched: drop entity parameter from drm_sched_push_job Originally a job was only bound to the queue when we pushed this, but now that's done in drm_sched_job_init, making that parameter entirely redundant. Remove it. The same applies to the context parameter in lima_sched_context_queue_task, simplify that too. v2: Rebase on top of msm adopting drm/sched Reviewed-by: Christian König <christian.koenig@amd.com> Acked-by: Emma Anholt <emma@anholt.net> Acked-by: Melissa Wen <mwen@igalia.com> Reviewed-by: Steven Price <steven.price@arm.com> (v1) Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com> (v1) Signed-off-by: Daniel Vetter <daniel.vetter@intel.com> Cc: Lucas Stach <l.stach@pengutronix.de> Cc: Russell King <linux+etnaviv@armlinux.org.uk> Cc: Christian Gmeiner <christian.gmeiner@gmail.com> Cc: Qiang Yu <yuq825@gmail.com> Cc: Rob Herring <robh@kernel.org> Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com> Cc: Steven Price <steven.price@arm.com> Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> Cc: Emma Anholt <emma@anholt.net> Cc: David Airlie <airlied@linux.ie> Cc: Daniel Vetter <daniel@ffwll.ch> Cc: Sumit Semwal <sumit.semwal@linaro.org> Cc: "Christian König" <christian.koenig@amd.com> Cc: Alex Deucher <alexander.deucher@amd.com> Cc: Nirmoy Das <nirmoy.das@amd.com> Cc: Dave Airlie <airlied@redhat.com> Cc: Chen Li <chenli@uniontech.com> Cc: Lee Jones <lee.jones@linaro.org> Cc: Deepak R Varma <mh12gx2825@gmail.com> Cc: Kevin Wang <kevin1.wang@amd.com> Cc: Luben Tuikov <luben.tuikov@amd.com> Cc: "Marek Olšák" <marek.olsak@amd.com> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com> Cc: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Cc: Dennis Li <Dennis.Li@amd.com> Cc: Boris Brezillon <boris.brezillon@collabora.com> Cc: etnaviv@lists.freedesktop.org Cc: lima@lists.freedesktop.org Cc: linux-media@vger.kernel.org Cc: linaro-mm-sig@lists.linaro.org Cc: Rob Clark <robdclark@gmail.com> Cc: Sean Paul <sean@poorly.run> Cc: Melissa Wen <mwen@igalia.com> Cc: linux-arm-msm@vger.kernel.org Cc: freedreno@lists.freedesktop.org Link: https://patchwork.freedesktop.org/patch/msgid/20210805104705.862416-6-daniel.vetter@ffwll.ch
2021-08-05 12:46:50 +02:00
drm_sched_entity_push_job(&submit->sched_job);
out_unlock:
mutex_unlock(&gpu->sched_lock);
return ret;
}
int etnaviv_sched_init(struct etnaviv_gpu *gpu)
{
const struct drm_sched_init_args args = {
.ops = &etnaviv_sched_ops,
.num_rqs = DRM_SCHED_PRIORITY_COUNT,
.credit_limit = etnaviv_hw_jobs_limit,
.hang_limit = etnaviv_job_hang_limit,
.timeout = msecs_to_jiffies(500),
.name = dev_name(gpu->dev),
.dev = gpu->dev,
};
return drm_sched_init(&gpu->sched, &args);
}
void etnaviv_sched_fini(struct etnaviv_gpu *gpu)
{
drm_sched_fini(&gpu->sched);
}