2019-08-12 09:29:35 +00:00
|
|
|
/* SPDX-License-Identifier: MIT */
|
2017-10-04 18:13:41 +00:00
|
|
|
/*
|
2019-08-12 09:29:35 +00:00
|
|
|
* Copyright © 2014-2019 Intel Corporation
|
2017-10-04 18:13:41 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _INTEL_GUC_H_
|
|
|
|
#define _INTEL_GUC_H_
|
|
|
|
|
2021-07-21 14:50:49 -07:00
|
|
|
#include <linux/delay.h>
|
2022-02-16 09:41:35 -08:00
|
|
|
#include <linux/iosys-map.h>
|
|
|
|
#include <linux/xarray.h>
|
2021-07-21 14:50:46 -07:00
|
|
|
|
2022-03-21 09:45:15 -07:00
|
|
|
#include "intel_guc_ct.h"
|
2017-10-16 14:47:14 +00:00
|
|
|
#include "intel_guc_fw.h"
|
2017-10-04 18:13:41 +00:00
|
|
|
#include "intel_guc_fwif.h"
|
|
|
|
#include "intel_guc_log.h"
|
2017-11-24 09:53:40 +00:00
|
|
|
#include "intel_guc_reg.h"
|
2021-07-30 13:21:06 -07:00
|
|
|
#include "intel_guc_slpc_types.h"
|
2017-10-04 18:13:41 +00:00
|
|
|
#include "intel_uc_fw.h"
|
2022-03-21 09:45:15 -07:00
|
|
|
#include "intel_uncore.h"
|
2019-03-08 13:25:17 +00:00
|
|
|
#include "i915_utils.h"
|
2017-10-04 18:13:41 +00:00
|
|
|
#include "i915_vma.h"
|
|
|
|
|
2019-07-01 11:04:51 +01:00
|
|
|
struct __guc_ads_blob;
|
2022-03-21 09:45:15 -07:00
|
|
|
struct intel_guc_state_capture;
|
2019-07-01 11:04:51 +01:00
|
|
|
|
2021-09-09 09:47:44 -07:00
|
|
|
/**
|
|
|
|
* struct intel_guc - Top level structure of GuC.
|
|
|
|
*
|
|
|
|
* It handles firmware loading and manages client pool. intel_guc owns an
|
|
|
|
* i915_sched_engine for submission.
|
2017-10-16 14:47:13 +00:00
|
|
|
*/
|
2017-10-04 18:13:41 +00:00
|
|
|
struct intel_guc {
|
2021-09-09 09:47:44 -07:00
|
|
|
/** @fw: the GuC firmware */
|
2017-10-04 18:13:41 +00:00
|
|
|
struct intel_uc_fw fw;
|
2021-09-09 09:47:44 -07:00
|
|
|
/** @log: sub-structure containing GuC log related data and objects */
|
2017-10-04 18:13:41 +00:00
|
|
|
struct intel_guc_log log;
|
2021-09-09 09:47:44 -07:00
|
|
|
/** @ct: the command transport communication channel */
|
2017-10-04 18:13:41 +00:00
|
|
|
struct intel_guc_ct ct;
|
2021-09-09 09:47:44 -07:00
|
|
|
/** @slpc: sub-structure containing SLPC related data and objects */
|
2021-07-30 13:21:06 -07:00
|
|
|
struct intel_guc_slpc slpc;
|
2022-03-21 09:45:15 -07:00
|
|
|
/** @capture: the error-state-capture module's data and objects */
|
|
|
|
struct intel_guc_state_capture *capture;
|
2017-10-04 18:13:41 +00:00
|
|
|
|
2023-05-02 18:37:33 +03:00
|
|
|
/** @dbgfs_node: debugfs node */
|
2023-03-18 21:36:14 +01:00
|
|
|
struct dentry *dbgfs_node;
|
|
|
|
|
2021-09-09 09:47:44 -07:00
|
|
|
/** @sched_engine: Global engine used to submit requests to GuC */
|
2021-07-21 14:50:47 -07:00
|
|
|
struct i915_sched_engine *sched_engine;
|
2021-09-09 09:47:44 -07:00
|
|
|
/**
|
|
|
|
* @stalled_request: if GuC can't process a request for any reason, we
|
|
|
|
* save it until GuC restarts processing. No other request can be
|
|
|
|
* submitted until the stalled request is processed.
|
|
|
|
*/
|
2021-07-21 14:50:47 -07:00
|
|
|
struct i915_request *stalled_request;
|
2021-10-14 10:19:52 -07:00
|
|
|
/**
|
|
|
|
* @submission_stall_reason: reason why submission is stalled
|
|
|
|
*/
|
|
|
|
enum {
|
|
|
|
STALL_NONE,
|
|
|
|
STALL_REGISTER_CONTEXT,
|
|
|
|
STALL_MOVE_LRC_TAIL,
|
|
|
|
STALL_ADD_REQUEST,
|
|
|
|
} submission_stall_reason;
|
2021-07-21 14:50:47 -07:00
|
|
|
|
2017-10-04 18:13:41 +00:00
|
|
|
/* intel_guc_recv interrupt related state */
|
2021-09-09 09:47:44 -07:00
|
|
|
/** @irq_lock: protects GuC irq state */
|
2018-03-19 10:53:36 +01:00
|
|
|
spinlock_t irq_lock;
|
2021-09-09 09:47:44 -07:00
|
|
|
/**
|
|
|
|
* @msg_enabled_mask: mask of events that are processed when receiving
|
|
|
|
* an INTEL_GUC_ACTION_DEFAULT G2H message.
|
|
|
|
*/
|
2018-03-19 10:53:36 +01:00
|
|
|
unsigned int msg_enabled_mask;
|
2017-10-04 18:13:41 +00:00
|
|
|
|
2021-09-09 09:47:44 -07:00
|
|
|
/**
|
|
|
|
* @outstanding_submission_g2h: number of outstanding GuC to Host
|
|
|
|
* responses related to GuC submission, used to determine if the GT is
|
|
|
|
* idle
|
|
|
|
*/
|
2021-07-21 14:50:58 -07:00
|
|
|
atomic_t outstanding_submission_g2h;
|
|
|
|
|
2023-10-17 11:08:02 -07:00
|
|
|
/** @tlb_lookup: xarray to store all pending TLB invalidation requests */
|
|
|
|
struct xarray tlb_lookup;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @serial_slot: id to the initial waiter created in tlb_lookup,
|
|
|
|
* which is used only when failed to allocate new waiter.
|
|
|
|
*/
|
|
|
|
u32 serial_slot;
|
|
|
|
|
|
|
|
/** @next_seqno: the next id (sequence number) to allocate. */
|
|
|
|
u32 next_seqno;
|
|
|
|
|
2021-09-09 09:47:44 -07:00
|
|
|
/** @interrupts: pointers to GuC interrupt-managing functions. */
|
2019-05-27 18:36:07 +00:00
|
|
|
struct {
|
2022-11-07 18:06:00 -08:00
|
|
|
bool enabled;
|
2019-07-13 11:00:09 +01:00
|
|
|
void (*reset)(struct intel_guc *guc);
|
|
|
|
void (*enable)(struct intel_guc *guc);
|
|
|
|
void (*disable)(struct intel_guc *guc);
|
2019-05-27 18:36:07 +00:00
|
|
|
} interrupts;
|
|
|
|
|
2021-09-09 09:47:44 -07:00
|
|
|
/**
|
2021-10-14 10:19:41 -07:00
|
|
|
* @submission_state: sub-structure for submission state protected by
|
|
|
|
* single lock
|
2021-07-21 14:50:49 -07:00
|
|
|
*/
|
2021-10-14 10:19:41 -07:00
|
|
|
struct {
|
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @submission_state.lock: protects everything in
|
|
|
|
* submission_state, ce->guc_id.id, and ce->guc_id.ref
|
|
|
|
* when transitioning in and out of zero
|
2021-10-14 10:19:41 -07:00
|
|
|
*/
|
|
|
|
spinlock_t lock;
|
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @submission_state.guc_ids: used to allocate new
|
|
|
|
* guc_ids, single-lrc
|
2021-10-14 10:19:41 -07:00
|
|
|
*/
|
|
|
|
struct ida guc_ids;
|
2021-12-14 09:05:00 -08:00
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @submission_state.num_guc_ids: Number of guc_ids, selftest
|
|
|
|
* feature to be able to reduce this number while testing.
|
2021-12-14 09:05:00 -08:00
|
|
|
*/
|
|
|
|
int num_guc_ids;
|
2021-10-14 10:19:50 -07:00
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @submission_state.guc_ids_bitmap: used to allocate
|
|
|
|
* new guc_ids, multi-lrc
|
2021-10-14 10:19:50 -07:00
|
|
|
*/
|
|
|
|
unsigned long *guc_ids_bitmap;
|
2021-10-14 10:19:41 -07:00
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @submission_state.guc_id_list: list of intel_context
|
|
|
|
* with valid guc_ids but no refs
|
2021-10-14 10:19:41 -07:00
|
|
|
*/
|
|
|
|
struct list_head guc_id_list;
|
drm/i915/guc: Delay disabling guc_id scheduling for better hysteresis
Add a delay, configurable via debugfs (default 34ms), to disable
scheduling of a context after the pin count goes to zero. Disable
scheduling is a costly operation as it requires synchronizing with
the GuC. So the idea is that a delay allows the user to resubmit
something before doing this operation. This delay is only done if
the context isn't closed and less than a given threshold
(default is 3/4) of the guc_ids are in use.
Alan Previn: Matt Brost first introduced this patch back in Oct 2021.
However no real world workload with measured performance impact was
available to prove the intended results. Today, this series is being
republished in response to a real world workload that benefited greatly
from it along with measured performance improvement.
Workload description: 36 containers were created on a DG2 device where
each container was performing a combination of 720p 3d game rendering
and 30fps video encoding. The workload density was configured in a way
that guaranteed each container to ALWAYS be able to render and
encode no less than 30fps with a predefined maximum render + encode
latency time. That means the totality of all 36 containers and their
workloads were not saturating the engines to their max (in order to
maintain just enough headroom to meet the min fps and max latencies
of incoming container submissions).
Problem statement: It was observed that the CPU core processing the i915
soft IRQ work was experiencing severe load. Using tracelogs and an
instrumentation patch to count specific i915 IRQ events, it was confirmed
that the majority of the CPU cycles were caused by the
gen11_other_irq_handler() -> guc_irq_handler() code path. The vast
majority of the cycles was determined to be processing a specific G2H
IRQ: i.e. INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_DONE. These IRQs are sent
by GuC in response to i915 KMD sending H2G requests:
INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_SET. Those H2G requests are sent
whenever a context goes idle so that we can unpin the context from GuC.
The high CPU utilization % symptom was limiting density scaling.
Root Cause Analysis: Because the incoming execution buffers were spread
across 36 different containers (each with multiple contexts) but the
system in totality was NOT saturated to the max, it was assumed that each
context was constantly idling between submissions. This was causing
a thrashing of unpinning contexts from GuC at one moment, followed quickly
by repinning them due to incoming workload the very next moment. These
event-pairs were being triggered across multiple contexts per container,
across all containers at the rate of > 30 times per sec per context.
Metrics: When running this workload without this patch, we measured an
average of ~69K INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_DONE events every 10
seconds or ~10 million times over ~25+ mins. With this patch, the count
reduced to ~480 every 10 seconds or about ~28K over ~10 mins. The
improvement observed is ~99% for the average counts per 10 seconds.
Design awareness: Selftest impact.
As a temporary WA, disable this feature for the selftests. Selftests are
very timing sensitive and any change in timing can cause failure. A
follow up patch will fixup the selftests to understand this delay.
Design awareness: Race between guc_request_alloc and guc_context_close.
If a context close is issued while there is a request submission in
flight and a delayed schedule disable is pending, guc_context_close
and guc_request_alloc will race to cancel the delayed disable.
To close the race, make sure that guc_request_alloc waits for
guc_context_close to finish running before checking any state.
Design awareness: GT Reset event.
If a gt reset is triggered, as preparation steps, add an additional step
to ensure all contexts that have a pending delay-disable-schedule task
be flushed of it. Move them directly into the closed state after cancelling
the worker. This is okay because the existing flow flushes all
yet-to-arrive G2H's dropping them anyway.
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20221006225121.826257-2-alan.previn.teres.alexis@intel.com
2022-10-06 15:51:20 -07:00
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @submission_state.guc_ids_in_use: Number single-lrc
|
|
|
|
* guc_ids in use
|
drm/i915/guc: Delay disabling guc_id scheduling for better hysteresis
Add a delay, configurable via debugfs (default 34ms), to disable
scheduling of a context after the pin count goes to zero. Disable
scheduling is a costly operation as it requires synchronizing with
the GuC. So the idea is that a delay allows the user to resubmit
something before doing this operation. This delay is only done if
the context isn't closed and less than a given threshold
(default is 3/4) of the guc_ids are in use.
Alan Previn: Matt Brost first introduced this patch back in Oct 2021.
However no real world workload with measured performance impact was
available to prove the intended results. Today, this series is being
republished in response to a real world workload that benefited greatly
from it along with measured performance improvement.
Workload description: 36 containers were created on a DG2 device where
each container was performing a combination of 720p 3d game rendering
and 30fps video encoding. The workload density was configured in a way
that guaranteed each container to ALWAYS be able to render and
encode no less than 30fps with a predefined maximum render + encode
latency time. That means the totality of all 36 containers and their
workloads were not saturating the engines to their max (in order to
maintain just enough headroom to meet the min fps and max latencies
of incoming container submissions).
Problem statement: It was observed that the CPU core processing the i915
soft IRQ work was experiencing severe load. Using tracelogs and an
instrumentation patch to count specific i915 IRQ events, it was confirmed
that the majority of the CPU cycles were caused by the
gen11_other_irq_handler() -> guc_irq_handler() code path. The vast
majority of the cycles was determined to be processing a specific G2H
IRQ: i.e. INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_DONE. These IRQs are sent
by GuC in response to i915 KMD sending H2G requests:
INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_SET. Those H2G requests are sent
whenever a context goes idle so that we can unpin the context from GuC.
The high CPU utilization % symptom was limiting density scaling.
Root Cause Analysis: Because the incoming execution buffers were spread
across 36 different containers (each with multiple contexts) but the
system in totality was NOT saturated to the max, it was assumed that each
context was constantly idling between submissions. This was causing
a thrashing of unpinning contexts from GuC at one moment, followed quickly
by repinning them due to incoming workload the very next moment. These
event-pairs were being triggered across multiple contexts per container,
across all containers at the rate of > 30 times per sec per context.
Metrics: When running this workload without this patch, we measured an
average of ~69K INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_DONE events every 10
seconds or ~10 million times over ~25+ mins. With this patch, the count
reduced to ~480 every 10 seconds or about ~28K over ~10 mins. The
improvement observed is ~99% for the average counts per 10 seconds.
Design awareness: Selftest impact.
As a temporary WA, disable this feature for the selftests. Selftests are
very timing sensitive and any change in timing can cause failure. A
follow up patch will fixup the selftests to understand this delay.
Design awareness: Race between guc_request_alloc and guc_context_close.
If a context close is issued while there is a request submission in
flight and a delayed schedule disable is pending, guc_context_close
and guc_request_alloc will race to cancel the delayed disable.
To close the race, make sure that guc_request_alloc waits for
guc_context_close to finish running before checking any state.
Design awareness: GT Reset event.
If a gt reset is triggered, as preparation steps, add an additional step
to ensure all contexts that have a pending delay-disable-schedule task
be flushed of it. Move them directly into the closed state after cancelling
the worker. This is okay because the existing flow flushes all
yet-to-arrive G2H's dropping them anyway.
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20221006225121.826257-2-alan.previn.teres.alexis@intel.com
2022-10-06 15:51:20 -07:00
|
|
|
*/
|
|
|
|
unsigned int guc_ids_in_use;
|
2021-10-14 10:19:42 -07:00
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @submission_state.destroyed_contexts: list of contexts
|
|
|
|
* waiting to be destroyed (deregistered with the GuC)
|
2021-10-14 10:19:42 -07:00
|
|
|
*/
|
|
|
|
struct list_head destroyed_contexts;
|
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @submission_state.destroyed_worker: worker to deregister
|
|
|
|
* contexts, need as we need to take a GT PM reference and
|
|
|
|
* can't from destroy function as it might be in an atomic
|
|
|
|
* context (no sleeping)
|
2021-10-14 10:19:42 -07:00
|
|
|
*/
|
|
|
|
struct work_struct destroyed_worker;
|
2022-01-20 20:31:17 -08:00
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @submission_state.reset_fail_worker: worker to trigger
|
|
|
|
* a GT reset after an engine reset fails
|
2022-01-20 20:31:17 -08:00
|
|
|
*/
|
|
|
|
struct work_struct reset_fail_worker;
|
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @submission_state.reset_fail_mask: mask of engines that
|
|
|
|
* failed to reset
|
2022-01-20 20:31:17 -08:00
|
|
|
*/
|
|
|
|
intel_engine_mask_t reset_fail_mask;
|
drm/i915/guc: Delay disabling guc_id scheduling for better hysteresis
Add a delay, configurable via debugfs (default 34ms), to disable
scheduling of a context after the pin count goes to zero. Disable
scheduling is a costly operation as it requires synchronizing with
the GuC. So the idea is that a delay allows the user to resubmit
something before doing this operation. This delay is only done if
the context isn't closed and less than a given threshold
(default is 3/4) of the guc_ids are in use.
Alan Previn: Matt Brost first introduced this patch back in Oct 2021.
However no real world workload with measured performance impact was
available to prove the intended results. Today, this series is being
republished in response to a real world workload that benefited greatly
from it along with measured performance improvement.
Workload description: 36 containers were created on a DG2 device where
each container was performing a combination of 720p 3d game rendering
and 30fps video encoding. The workload density was configured in a way
that guaranteed each container to ALWAYS be able to render and
encode no less than 30fps with a predefined maximum render + encode
latency time. That means the totality of all 36 containers and their
workloads were not saturating the engines to their max (in order to
maintain just enough headroom to meet the min fps and max latencies
of incoming container submissions).
Problem statement: It was observed that the CPU core processing the i915
soft IRQ work was experiencing severe load. Using tracelogs and an
instrumentation patch to count specific i915 IRQ events, it was confirmed
that the majority of the CPU cycles were caused by the
gen11_other_irq_handler() -> guc_irq_handler() code path. The vast
majority of the cycles was determined to be processing a specific G2H
IRQ: i.e. INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_DONE. These IRQs are sent
by GuC in response to i915 KMD sending H2G requests:
INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_SET. Those H2G requests are sent
whenever a context goes idle so that we can unpin the context from GuC.
The high CPU utilization % symptom was limiting density scaling.
Root Cause Analysis: Because the incoming execution buffers were spread
across 36 different containers (each with multiple contexts) but the
system in totality was NOT saturated to the max, it was assumed that each
context was constantly idling between submissions. This was causing
a thrashing of unpinning contexts from GuC at one moment, followed quickly
by repinning them due to incoming workload the very next moment. These
event-pairs were being triggered across multiple contexts per container,
across all containers at the rate of > 30 times per sec per context.
Metrics: When running this workload without this patch, we measured an
average of ~69K INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_DONE events every 10
seconds or ~10 million times over ~25+ mins. With this patch, the count
reduced to ~480 every 10 seconds or about ~28K over ~10 mins. The
improvement observed is ~99% for the average counts per 10 seconds.
Design awareness: Selftest impact.
As a temporary WA, disable this feature for the selftests. Selftests are
very timing sensitive and any change in timing can cause failure. A
follow up patch will fixup the selftests to understand this delay.
Design awareness: Race between guc_request_alloc and guc_context_close.
If a context close is issued while there is a request submission in
flight and a delayed schedule disable is pending, guc_context_close
and guc_request_alloc will race to cancel the delayed disable.
To close the race, make sure that guc_request_alloc waits for
guc_context_close to finish running before checking any state.
Design awareness: GT Reset event.
If a gt reset is triggered, as preparation steps, add an additional step
to ensure all contexts that have a pending delay-disable-schedule task
be flushed of it. Move them directly into the closed state after cancelling
the worker. This is okay because the existing flow flushes all
yet-to-arrive G2H's dropping them anyway.
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20221006225121.826257-2-alan.previn.teres.alexis@intel.com
2022-10-06 15:51:20 -07:00
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @submission_state.sched_disable_delay_ms: schedule
|
|
|
|
* disable delay, in ms, for contexts
|
drm/i915/guc: Delay disabling guc_id scheduling for better hysteresis
Add a delay, configurable via debugfs (default 34ms), to disable
scheduling of a context after the pin count goes to zero. Disable
scheduling is a costly operation as it requires synchronizing with
the GuC. So the idea is that a delay allows the user to resubmit
something before doing this operation. This delay is only done if
the context isn't closed and less than a given threshold
(default is 3/4) of the guc_ids are in use.
Alan Previn: Matt Brost first introduced this patch back in Oct 2021.
However no real world workload with measured performance impact was
available to prove the intended results. Today, this series is being
republished in response to a real world workload that benefited greatly
from it along with measured performance improvement.
Workload description: 36 containers were created on a DG2 device where
each container was performing a combination of 720p 3d game rendering
and 30fps video encoding. The workload density was configured in a way
that guaranteed each container to ALWAYS be able to render and
encode no less than 30fps with a predefined maximum render + encode
latency time. That means the totality of all 36 containers and their
workloads were not saturating the engines to their max (in order to
maintain just enough headroom to meet the min fps and max latencies
of incoming container submissions).
Problem statement: It was observed that the CPU core processing the i915
soft IRQ work was experiencing severe load. Using tracelogs and an
instrumentation patch to count specific i915 IRQ events, it was confirmed
that the majority of the CPU cycles were caused by the
gen11_other_irq_handler() -> guc_irq_handler() code path. The vast
majority of the cycles was determined to be processing a specific G2H
IRQ: i.e. INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_DONE. These IRQs are sent
by GuC in response to i915 KMD sending H2G requests:
INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_SET. Those H2G requests are sent
whenever a context goes idle so that we can unpin the context from GuC.
The high CPU utilization % symptom was limiting density scaling.
Root Cause Analysis: Because the incoming execution buffers were spread
across 36 different containers (each with multiple contexts) but the
system in totality was NOT saturated to the max, it was assumed that each
context was constantly idling between submissions. This was causing
a thrashing of unpinning contexts from GuC at one moment, followed quickly
by repinning them due to incoming workload the very next moment. These
event-pairs were being triggered across multiple contexts per container,
across all containers at the rate of > 30 times per sec per context.
Metrics: When running this workload without this patch, we measured an
average of ~69K INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_DONE events every 10
seconds or ~10 million times over ~25+ mins. With this patch, the count
reduced to ~480 every 10 seconds or about ~28K over ~10 mins. The
improvement observed is ~99% for the average counts per 10 seconds.
Design awareness: Selftest impact.
As a temporary WA, disable this feature for the selftests. Selftests are
very timing sensitive and any change in timing can cause failure. A
follow up patch will fixup the selftests to understand this delay.
Design awareness: Race between guc_request_alloc and guc_context_close.
If a context close is issued while there is a request submission in
flight and a delayed schedule disable is pending, guc_context_close
and guc_request_alloc will race to cancel the delayed disable.
To close the race, make sure that guc_request_alloc waits for
guc_context_close to finish running before checking any state.
Design awareness: GT Reset event.
If a gt reset is triggered, as preparation steps, add an additional step
to ensure all contexts that have a pending delay-disable-schedule task
be flushed of it. Move them directly into the closed state after cancelling
the worker. This is okay because the existing flow flushes all
yet-to-arrive G2H's dropping them anyway.
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20221006225121.826257-2-alan.previn.teres.alexis@intel.com
2022-10-06 15:51:20 -07:00
|
|
|
*/
|
|
|
|
unsigned int sched_disable_delay_ms;
|
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @submission_state.sched_disable_gucid_threshold:
|
|
|
|
* threshold of min remaining available guc_ids before
|
|
|
|
* we start bypassing the schedule disable delay
|
drm/i915/guc: Delay disabling guc_id scheduling for better hysteresis
Add a delay, configurable via debugfs (default 34ms), to disable
scheduling of a context after the pin count goes to zero. Disable
scheduling is a costly operation as it requires synchronizing with
the GuC. So the idea is that a delay allows the user to resubmit
something before doing this operation. This delay is only done if
the context isn't closed and less than a given threshold
(default is 3/4) of the guc_ids are in use.
Alan Previn: Matt Brost first introduced this patch back in Oct 2021.
However no real world workload with measured performance impact was
available to prove the intended results. Today, this series is being
republished in response to a real world workload that benefited greatly
from it along with measured performance improvement.
Workload description: 36 containers were created on a DG2 device where
each container was performing a combination of 720p 3d game rendering
and 30fps video encoding. The workload density was configured in a way
that guaranteed each container to ALWAYS be able to render and
encode no less than 30fps with a predefined maximum render + encode
latency time. That means the totality of all 36 containers and their
workloads were not saturating the engines to their max (in order to
maintain just enough headrooom to meet the min fps and max latencies
of incoming container submissions).
Problem statement: It was observed that the CPU core processing the i915
soft IRQ work was experiencing severe load. Using tracelogs and an
instrumentation patch to count specific i915 IRQ events, it was confirmed
that the majority of the CPU cycles were caused by the
gen11_other_irq_handler() -> guc_irq_handler() code path. The vast
majority of the cycles was determined to be processing a specific G2H
IRQ: i.e. INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_DONE. These IRQs are sent
by GuC in response to i915 KMD sending H2G requests:
INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_SET. Those H2G requests are sent
whenever a context goes idle so that we can unpin the context from GuC.
The high CPU utilization % symptom was limiting density scaling.
Root Cause Analysis: Because the incoming execution buffers were spread
across 36 different containers (each with multiple contexts) but the
system in totality was NOT saturated to the max, it was assumed that each
context was constantly idling between submissions. This was causing
a thrashing of unpinning contexts from GuC at one moment, followed quickly
by repinning them due to incoming workload the very next moment. These
event-pairs were being triggered across multiple contexts per container,
across all containers at the rate of > 30 times per sec per context.
Metrics: When running this workload without this patch, we measured an
average of ~69K INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_DONE events every 10
seconds or ~10 million times over ~25+ mins. With this patch, the count
reduced to ~480 every 10 seconds or about ~28K over ~10 mins. The
improvement observed is ~99% for the average counts per 10 seconds.
Design awareness: Selftest impact.
As temporary WA disable this feature for the selftests. Selftests are
very timing sensitive and any change in timing can cause failure. A
follow up patch will fixup the selftests to understand this delay.
Design awareness: Race between guc_request_alloc and guc_context_close.
If a context close is issued while there is a request submission in
flight and a delayed schedule disable is pending, guc_context_close
and guc_request_alloc will race to cancel the delayed disable.
To close the race, make sure that guc_request_alloc waits for
guc_context_close to finish running before checking any state.
Design awareness: GT Reset event.
If a gt reset is triggered, as preparation steps, add an additional step
to ensure all contexts that have a pending delay-disable-schedule task
be flushed of it. Move them directly into the closed state after cancelling
the worker. This is okay because the existing flow flushes all
yet-to-arrive G2H's dropping them anyway.
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20221006225121.826257-2-alan.previn.teres.alexis@intel.com
2022-10-06 15:51:20 -07:00
|
|
|
*/
|
|
|
|
unsigned int sched_disable_gucid_threshold;
|
2021-10-14 10:19:41 -07:00
|
|
|
} submission_state;
|
2021-07-21 14:50:49 -07:00
|
|
|
|
2021-09-09 09:47:44 -07:00
|
|
|
/**
|
|
|
|
* @submission_supported: tracks whether we support GuC submission on
|
|
|
|
* the current platform
|
|
|
|
*/
|
2021-07-26 17:23:48 -07:00
|
|
|
bool submission_supported;
|
2021-09-09 09:47:44 -07:00
|
|
|
/** @submission_selected: tracks whether the user enabled GuC submission */
|
2020-02-18 14:33:24 -08:00
|
|
|
bool submission_selected;
|
2022-03-01 16:33:51 -08:00
|
|
|
/** @submission_initialized: tracks whether GuC submission has been initialised */
|
|
|
|
bool submission_initialized;
|
2022-11-29 15:20:31 -08:00
|
|
|
/** @submission_version: Submission API version of the currently loaded firmware */
|
|
|
|
struct intel_uc_fw_ver submission_version;
|
|
|
|
|
2021-09-09 09:47:44 -07:00
|
|
|
/**
|
|
|
|
* @rc_supported: tracks whether we support GuC rc on the current platform
|
|
|
|
*/
|
2021-07-30 13:21:19 -07:00
|
|
|
bool rc_supported;
|
2021-09-09 09:47:44 -07:00
|
|
|
/** @rc_selected: tracks whether the user enabled GuC rc */
|
2021-07-30 13:21:19 -07:00
|
|
|
bool rc_selected;
|
2019-07-31 22:33:20 +00:00
|
|
|
|
2021-09-09 09:47:44 -07:00
|
|
|
/** @ads_vma: object allocated to hold the GuC ADS */
|
2017-10-04 18:13:41 +00:00
|
|
|
struct i915_vma *ads_vma;
|
2022-02-16 09:41:47 -08:00
|
|
|
/** @ads_map: contents of the GuC ADS */
|
2022-02-16 09:41:35 -08:00
|
|
|
struct iosys_map ads_map;
|
2021-09-09 09:47:44 -07:00
|
|
|
/** @ads_regset_size: size of the save/restore regsets in the ADS */
|
2021-07-26 17:23:30 -07:00
|
|
|
u32 ads_regset_size;
|
2022-02-07 23:01:41 -08:00
|
|
|
/**
|
|
|
|
* @ads_regset_count: number of save/restore registers in the ADS for
|
|
|
|
* each engine
|
|
|
|
*/
|
|
|
|
u32 ads_regset_count[I915_NUM_ENGINES];
|
|
|
|
/** @ads_regset: save/restore regsets in the ADS */
|
|
|
|
struct guc_mmio_reg *ads_regset;
|
2021-09-09 09:47:44 -07:00
|
|
|
/** @ads_golden_ctxt_size: size of the golden contexts in the ADS */
|
2021-07-26 17:23:38 -07:00
|
|
|
u32 ads_golden_ctxt_size;
|
2024-02-23 12:56:31 -08:00
|
|
|
/** @ads_waklv_size: size of workaround KLVs */
|
|
|
|
u32 ads_waklv_size;
|
2022-03-21 09:45:15 -07:00
|
|
|
/** @ads_capture_size: size of register lists in the ADS used for error capture */
|
|
|
|
u32 ads_capture_size;
|
2019-07-01 11:04:51 +01:00
|
|
|
|
2022-07-18 16:07:32 -07:00
|
|
|
/** @lrc_desc_pool_v69: object allocated to hold the GuC LRC descriptor pool */
|
|
|
|
struct i915_vma *lrc_desc_pool_v69;
|
|
|
|
/** @lrc_desc_pool_vaddr_v69: contents of the GuC LRC descriptor pool */
|
|
|
|
void *lrc_desc_pool_vaddr_v69;
|
|
|
|
|
2021-09-09 09:47:44 -07:00
|
|
|
/**
|
|
|
|
* @context_lookup: used to resolve intel_context from guc_id, if a
|
|
|
|
* context is present in this structure it is registered with the GuC
|
|
|
|
*/
|
2021-07-21 14:50:46 -07:00
|
|
|
struct xarray context_lookup;
|
|
|
|
|
2021-09-09 09:47:44 -07:00
|
|
|
/** @params: Control params for fw initialization */
|
2019-07-24 09:58:49 +01:00
|
|
|
u32 params[GUC_CTL_MAX_DWORDS];
|
|
|
|
|
2021-09-09 09:47:44 -07:00
|
|
|
/** @send_regs: GuC's FW specific registers used for sending MMIO H2G */
|
2017-10-04 18:13:41 +00:00
|
|
|
struct {
|
|
|
|
u32 base;
|
|
|
|
unsigned int count;
|
|
|
|
enum forcewake_domains fw_domains;
|
|
|
|
} send_regs;
|
|
|
|
|
2021-09-09 09:47:44 -07:00
|
|
|
/** @notify_reg: register used to send interrupts to the GuC FW */
|
2019-12-16 17:23:15 -08:00
|
|
|
i915_reg_t notify_reg;
|
|
|
|
|
2021-09-09 09:47:44 -07:00
|
|
|
/**
|
|
|
|
* @mmio_msg: notification bitmask that the GuC writes in one of its
|
|
|
|
* registers when the CT channel is disabled, to be processed when the
|
|
|
|
* channel is back up.
|
|
|
|
*/
|
2019-06-21 11:21:23 -07:00
|
|
|
u32 mmio_msg;
|
|
|
|
|
2021-09-09 09:47:44 -07:00
|
|
|
/** @send_mutex: used to serialize the intel_guc_send actions */
|
2017-10-04 18:13:41 +00:00
|
|
|
struct mutex send_mutex;
|
drm/i915/pmu: Connect engine busyness stats from GuC to pmu
With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:
- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)
At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation
engine busyness = total + (now - start)
All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.
The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.
Note:
There might be an over-accounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.
v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at GuC level to update engine stats
- Document worker specific details
v3: (Tvrtko/Umesh)
- Demarcate GuC and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of GuC state
- Add hooks to gt park/unpark for GuC busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to GuC initialization
- Drop helpers that are called only once
v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and GuC stats objects
- Since disable_submission is called from many places, move resetting
stats to intel_guc_submission_reset_prepare
v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
callbacks and worker with gt reset
v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
called during i915 load. This ends up calling the GuC busyness unpark
hook and results in kick-starting an uninitialized worker. Let
park/unpark hooks check if GuC submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
of that.
v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For GuC mode of submission the engine busyness is derived from gt time
domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
longer and falls within the busyness tolerances in selftest.
v8:
- Use ktime_get in selftest as before
- intel_reset_trylock_no_wait results in a lockdep splat that is not
trivial to fix since the PMU callback runs in irq context and the
reset paths are tightly knit into the driver. The test that uncovers
this is igt@perf_pmu@faulting-read. Drop intel_reset_trylock_no_wait,
instead use the reset_count to synchronize with gt reset during pmu
callback. For the ping, continue to use intel_reset_trylock since ping
is not run in irq context.
- GuC PM timestamp does not tick when GuC is idle. This can potentially
result in wrong busyness values when a context is active on the
engine, but GuC is idle. Use the RING TIMESTAMP as GPU timestamp to
process the GuC busyness stats. This works since both GuC timestamp and
RING timestamp are synced with the same clock.
- The busyness stats may get updated after the batch starts running.
This delay causes the busyness reported for 100us duration to fall
below 95% in the selftest. The only option at this time is to wait for
GuC busyness to change from idle to active before we sample busyness
over a 100us period.
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20211027004821.66097-2-umesh.nerlige.ramappa@intel.com
2021-10-26 17:48:21 -07:00
|
|
|
|
|
|
|
/**
|
|
|
|
* @timestamp: GT timestamp object that stores a copy of the timestamp
|
|
|
|
* and adjusts it for overflow using a worker.
|
|
|
|
*/
|
|
|
|
struct {
|
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @timestamp.lock: Lock protecting the below fields and
|
|
|
|
* the engine stats.
|
drm/i915/pmu: Connect engine busyness stats from GuC to pmu
With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:
- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)
At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation
engine busyness = total + (now - start)
All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.
The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.
Note:
There might be an over-accounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.
v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at GuC level to update engine stats
- Document worker specific details
v3: (Tvrtko/Umesh)
- Demarcate GuC and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of GuC state
- Add hooks to gt park/unpark for GuC busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to GuC initialization
- Drop helpers that are called only once
v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and GuC stats objects
- Since disable_submission is called from many places, move resetting
stats to intel_guc_submission_reset_prepare
v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
callbacks and worker with gt reset
v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
called during i915 load. This ends up calling the GuC busyness unpark
hook and results in kick-starting an uninitialized worker. Let
park/unpark hooks check if GuC submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
of that.
v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For GuC mode of submission the engine busyness is derived from gt time
domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
longer and falls within the busyness tolerances in selftest.
v8:
- Use ktime_get in selftest as before
- intel_reset_trylock_no_wait results in a lockdep splat that is not
trivial to fix since the PMU callback runs in irq context and the
reset paths are tightly knit into the driver. The test that uncovers
this is igt@perf_pmu@faulting-read. Drop intel_reset_trylock_no_wait,
instead use the reset_count to synchronize with gt reset during pmu
callback. For the ping, continue to use intel_reset_trylock since ping
is not run in irq context.
- GuC PM timestamp does not tick when GuC is idle. This can potentially
result in wrong busyness values when a context is active on the
engine, but GuC is idle. Use the RING TIMESTAMP as GPU timestamp to
process the GuC busyness stats. This works since both GuC timestamp and
RING timestamp are synced with the same clock.
- The busyness stats may get updated after the batch starts running.
This delay causes the busyness reported for 100us duration to fall
below 95% in the selftest. The only option at this time is to wait for
GuC busyness to change from idle to active before we sample busyness
over a 100us period.
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20211027004821.66097-2-umesh.nerlige.ramappa@intel.com
2021-10-26 17:48:21 -07:00
|
|
|
*/
|
|
|
|
spinlock_t lock;
|
|
|
|
|
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @timestamp.gt_stamp: 64-bit extended value of the GT
|
|
|
|
* timestamp.
|
drm/i915/pmu: Connect engine busyness stats from GuC to pmu
With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:
- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)
At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation
engine busyness = total + (now - start)
All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.
The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.
Note:
There might be an over-accounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.
v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at GuC level to update engine stats
- Document worker specific details
v3: (Tvrtko/Umesh)
- Demarcate GuC and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of GuC state
- Add hooks to gt park/unpark for GuC busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to GuC initialization
- Drop helpers that are called only once
v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and GuC stats objects
- Since disable_submission is called from many places, move resetting
stats to intel_guc_submission_reset_prepare
v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
callbacks and worker with gt reset
v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
called during i915 load. This ends up calling the GuC busyness unpark
hook and results in kick-starting an uninitialized worker. Let
park/unpark hooks check if GuC submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
of that.
v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For GuC mode of submission the engine busyness is derived from gt time
domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
longer and falls within the busyness tolerances in selftest.
v8:
- Use ktime_get in selftest as before
- intel_reset_trylock_no_wait results in a lockdep splat that is not
trivial to fix since the PMU callback runs in irq context and the
reset paths are tightly knit into the driver. The test that uncovers
this is igt@perf_pmu@faulting-read. Drop intel_reset_trylock_no_wait,
instead use the reset_count to synchronize with gt reset during pmu
callback. For the ping, continue to use intel_reset_trylock since ping
is not run in irq context.
- GuC PM timestamp does not tick when GuC is idle. This can potentially
result in wrong busyness values when a context is active on the
engine, but GuC is idle. Use the RING TIMESTAMP as GPU timestamp to
process the GuC busyness stats. This works since both GuC timestamp and
RING timestamp are synced with the same clock.
- The busyness stats may get updated after the batch starts running.
This delay causes the busyness reported for 100us duration to fall
below 95% in the selftest. The only option at this time is to wait for
GuC busyness to change from idle to active before we sample busyness
over a 100us period.
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20211027004821.66097-2-umesh.nerlige.ramappa@intel.com
2021-10-26 17:48:21 -07:00
|
|
|
*/
|
|
|
|
u64 gt_stamp;
|
|
|
|
|
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @timestamp.ping_delay: Period for polling the GT
|
|
|
|
* timestamp for overflow.
|
drm/i915/pmu: Connect engine busyness stats from GuC to pmu
With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:
- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)
At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation
engine busyness = total + (now - start)
All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.
The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.
Note:
There might be an over-accounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.
v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at GuC level to update engine stats
- Document worker specific details
v3: (Tvrtko/Umesh)
- Demarcate GuC and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of GuC state
- Add hooks to gt park/unpark for GuC busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to GuC initialization
- Drop helpers that are called only once
v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and GuC stats objects
- Since disable_submission is called from many places, move resetting
stats to intel_guc_submission_reset_prepare
v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
callbacks and worker with gt reset
v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
called during i915 load. This ends up calling the GuC busyness unpark
hook and results in kick-starting an uninitialized worker. Let
park/unpark hooks check if GuC submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
of that.
v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For GuC mode of submission the engine busyness is derived from gt time
domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
longer and falls within the busyness tolerances in selftest.
v8:
- Use ktime_get in selftest as before
- intel_reset_trylock_no_wait results in a lockdep splat that is not
trivial to fix since the PMU callback runs in irq context and the
reset paths are tightly knit into the driver. The test that uncovers
this is igt@perf_pmu@faulting-read. Drop intel_reset_trylock_no_wait,
instead use the reset_count to synchronize with gt reset during pmu
callback. For the ping, continue to use intel_reset_trylock since ping
is not run in irq context.
- GuC PM timestamp does not tick when GuC is idle. This can potentially
result in wrong busyness values when a context is active on the
engine, but GuC is idle. Use the RING TIMESTAMP as GPU timestamp to
process the GuC busyness stats. This works since both GuC timestamp and
RING timestamp are synced with the same clock.
- The busyness stats may get updated after the batch starts running.
This delay causes the busyness reported for 100us duration to fall
below 95% in the selftest. The only option at this time is to wait for
GuC busyness to change from idle to active before we sample busyness
over a 100us period.
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20211027004821.66097-2-umesh.nerlige.ramappa@intel.com
2021-10-26 17:48:21 -07:00
|
|
|
*/
|
|
|
|
unsigned long ping_delay;
|
|
|
|
|
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @timestamp.work: Periodic work to adjust GT timestamp,
|
|
|
|
* engine and context usage for overflows.
|
drm/i915/pmu: Connect engine busyness stats from GuC to pmu
With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:
- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)
At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation
engine busyness = total + (now - start)
All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.
The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.
Note:
There might be an over-accounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.
v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at GuC level to update engine stats
- Document worker specific details
v3: (Tvrtko/Umesh)
- Demarcate GuC and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of GuC state
- Add hooks to gt park/unpark for GuC busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to GuC initialization
- Drop helpers that are called only once
v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and GuC stats objects
- Since disable_submission is called from many places, move resetting
stats to intel_guc_submission_reset_prepare
v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
callbacks and worker with gt reset
v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
called during i915 load. This ends up calling the GuC busyness unpark
hook and results in kick-starting an uninitialized worker. Let
park/unpark hooks check if GuC submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
of that.
v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For GuC mode of submission the engine busyness is derived from gt time
domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
longer and falls within the busyness tolerances in selftest.
v8:
- Use ktime_get in selftest as before
- intel_reset_trylock_no_wait results in a lockdep splat that is not
trivial to fix since the PMU callback runs in irq context and the
reset paths are tightly knit into the driver. The test that uncovers
this is igt@perf_pmu@faulting-read. Drop intel_reset_trylock_no_wait,
instead use the reset_count to synchronize with gt reset during pmu
callback. For the ping, continue to use intel_reset_trylock since ping
is not run in irq context.
- GuC PM timestamp does not tick when GuC is idle. This can potentially
result in wrong busyness values when a context is active on the
engine, but GuC is idle. Use the RING TIMESTAMP as GPU timestamp to
process the GuC busyness stats. This works since both GuC timestamp and
RING timestamp are synced with the same clock.
- The busyness stats may get updated after the batch starts running.
This delay causes the busyness reported for 100us duration to fall
below 95% in the selftest. The only option at this time is to wait for
GuC busyness to change from idle to active before we sample busyness
over a 100us period.
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20211027004821.66097-2-umesh.nerlige.ramappa@intel.com
2021-10-26 17:48:21 -07:00
|
|
|
*/
|
|
|
|
struct delayed_work work;
|
2022-01-10 17:55:23 -08:00
|
|
|
|
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @timestamp.shift: Right shift value for the gpm timestamp
|
2022-01-10 17:55:23 -08:00
|
|
|
*/
|
|
|
|
u32 shift;
|
drm/i915/guc: Don't update engine busyness stats too frequently
Using two different types of workloads, it was observed that
guc_update_engine_gt_clks was being called too frequently and/or
causing a CPU-to-lmem bandwidth hit over PCIE. Details on
the workloads and numbers are in the notes below.
Background: At the moment, guc_update_engine_gt_clks can be invoked
via one of 3 ways. #1 and #2 are infrequent under normal operating
conditions:
1.When a predefined "ping_delay" timer expires so that GuC-
busyness can sample the GTPM clock counter to ensure it
doesn't miss a wrap-around of the 32-bits of the HW counter.
(The ping_delay is calculated based on 1/8th the time taken
for the counter go from 0x0 to 0xffffffff based on the
GT frequency. This comes to about once every 28 seconds at a
GT frequency of 19.2Mhz).
2.In preparation for a gt reset.
3.In response to __gt_park events (as the gt power management
puts the gt into a lower power state when there is no work
being done).
Root-cause: For both the workloads described farther below, it was
observed that when user space calls IOCTLs that unpark the
gt momentarily and repeats such calls many times in quick succession,
it triggers calling guc_update_engine_gt_clks as many times. However,
the primary purpose of guc_update_engine_gt_clks is to ensure we don't
miss the wraparound while the counter is ticking. Thus, the solution
is to ensure we skip that check if gt_park is calling this function
earlier than necessary.
Solution: Snapshot jiffies when we do actually update the busyness
stats. Then get the new jiffies every time intel_guc_busyness_park
is called and bail if we are being called too soon. Use half of the
ping_delay as a safe threshold.
NOTE1: Workload1: IGTs' gem_create was modified to create a file handle,
allocate memory with sizes that range from a min of 4K to the max supported
(in power of two step-sizes). It maps, modifies and reads back the
memory. Allocations and modification is repeated until total memory
allocation reaches the max. Then the file handle is closed. With this
workload, guc_update_engine_gt_clks was called over 188 thousand times
in the span of 15 seconds while this test ran three times. With this patch,
the number of calls reduced to 14.
NOTE2: Workload2: 30 transcode sessions are created in quick succession.
While these sessions are created, pcm-iio tool was used to measure I/O
read operation bandwidth consumption sampled at 100 millisecond intervals
over the course of 20 seconds. The total bandwidth consumed over 20 seconds
without this patch was measured at average at 311KBps per sample. With this
patch, the number went down to about 175KBps which is about a 43% savings.
Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220623023157.211650-2-alan.previn.teres.alexis@intel.com
2022-06-22 19:31:57 -07:00
|
|
|
|
|
|
|
/**
|
2023-12-26 11:54:31 -08:00
|
|
|
* @timestamp.last_stat_jiffies: jiffies at last actual
|
|
|
|
* stats collection time. We use this timestamp to ensure
|
|
|
|
* we don't oversample the stats because runtime power
|
|
|
|
* management events can trigger stats collection at much
|
|
|
|
* higher rates than required.
|
drm/i915/guc: Don't update engine busyness stats too frequently
Using two different types of workloads, it was observed that
guc_update_engine_gt_clks was being called too frequently and/or
causing a CPU-to-lmem bandwidth hit over PCIE. Details on
the workloads and numbers are in the notes below.
Background: At the moment, guc_update_engine_gt_clks can be invoked
via one of 3 ways. #1 and #2 are infrequent under normal operating
conditions:
1.When a predefined "ping_delay" timer expires so that GuC-
busyness can sample the GTPM clock counter to ensure it
doesn't miss a wrap-around of the 32-bits of the HW counter.
(The ping_delay is calculated based on 1/8th the time taken
for the counter go from 0x0 to 0xffffffff based on the
GT frequency. This comes to about once every 28 seconds at a
GT frequency of 19.2Mhz).
2.In preparation for a gt reset.
3.In response to __gt_park events (as the gt power management
puts the gt into a lower power state when there is no work
being done).
Root-cause: For both the workloads described farther below, it was
observed that when user space calls IOCTLs that unpark the
gt momentarily and repeats such calls many times in quick succession,
it triggers calling guc_update_engine_gt_clks as many times. However,
the primary purpose of guc_update_engine_gt_clks is to ensure we don't
miss the wraparound while the counter is ticking. Thus, the solution
is to ensure we skip that check if gt_park is calling this function
earlier than necessary.
Solution: Snapshot jiffies when we do actually update the busyness
stats. Then get the new jiffies every time intel_guc_busyness_park
is called and bail if we are being called too soon. Use half of the
ping_delay as a safe threshold.
NOTE1: Workload1: IGTs' gem_create was modified to create a file handle,
allocate memory with sizes that range from a min of 4K to the max supported
(in power of two step-sizes). It maps, modifies and reads back the
memory. Allocations and modification is repeated until total memory
allocation reaches the max. Then the file handle is closed. With this
workload, guc_update_engine_gt_clks was called over 188 thousand times
in the span of 15 seconds while this test ran three times. With this patch,
the number of calls reduced to 14.
NOTE2: Workload2: 30 transcode sessions are created in quick succession.
While these sessions are created, pcm-iio tool was used to measure I/O
read operation bandwidth consumption sampled at 100 millisecond intervals
over the course of 20 seconds. The total bandwidth consumed over 20 seconds
without this patch was measured at average at 311KBps per sample. With this
patch, the number went down to about 175KBps which is about a 43% savings.
Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220623023157.211650-2-alan.previn.teres.alexis@intel.com
2022-06-22 19:31:57 -07:00
|
|
|
*/
|
|
|
|
unsigned long last_stat_jiffies;
|
drm/i915/pmu: Connect engine busyness stats from GuC to pmu
With GuC handling scheduling, i915 is not aware of the time that a
context is scheduled in and out of the engine. Since i915 pmu relies on
this info to provide engine busyness to the user, GuC shares this info
with i915 for all engines using shared memory. For each engine, this
info contains:
- total busyness: total time that the context was running (total)
- id: id of the running context (id)
- start timestamp: timestamp when the context started running (start)
At the time (now) of sampling the engine busyness, if the id is valid
(!= ~0), and start is non-zero, then the context is considered to be
active and the engine busyness is calculated using the below equation
engine busyness = total + (now - start)
All times are obtained from the gt clock base. For inactive contexts,
engine busyness is just equal to the total.
The start and total values provided by GuC are 32 bits and wrap around
in a few minutes. Since perf pmu provides busyness as 64 bit
monotonically increasing values, there is a need for this implementation
to account for overflows and extend the time to 64 bits before returning
busyness to the user. In order to do that, a worker runs periodically at
frequency = 1/8th the time it takes for the timestamp to wrap. As an
example, that would be once in 27 seconds for a gt clock frequency of
19.2 MHz.
Note:
There might be an over-accounting of busyness due to the fact that GuC
may be updating the total and start values while kmd is reading them.
(i.e kmd may read the updated total and the stale start). In such a
case, user may see higher busyness value followed by smaller ones which
would eventually catch up to the higher value.
v2: (Tvrtko)
- Include details in commit message
- Move intel engine busyness function into execlist code
- Use union inside engine->stats
- Use natural type for ping delay jiffies
- Drop active_work condition checks
- Use for_each_engine if iterating all engines
- Drop seq locking, use spinlock at GuC level to update engine stats
- Document worker specific details
v3: (Tvrtko/Umesh)
- Demarcate GuC and execlist stat objects with comments
- Document known over-accounting issue in commit
- Provide a consistent view of GuC state
- Add hooks to gt park/unpark for GuC busyness
- Stop/start worker in gt park/unpark path
- Drop inline
- Move spinlock and worker inits to GuC initialization
- Drop helpers that are called only once
v4: (Tvrtko/Matt/Umesh)
- Drop addressed opens from commit message
- Get runtime pm in ping, remove from the park path
- Use cancel_delayed_work_sync in disable_submission path
- Update stats during reset prepare
- Skip ping if reset in progress
- Explicitly name execlists and GuC stats objects
- Since disable_submission is called from many places, move resetting
stats to intel_guc_submission_reset_prepare
v5: (Tvrtko)
- Add a trylock helper that does not sleep and synchronize PMU event
callbacks and worker with gt reset
v6: (CI BAT failures)
- DUTs using execlist submission failed to boot since __gt_unpark is
called during i915 load. This ends up calling the GuC busyness unpark
hook and results in kick-starting an uninitialized worker. Let
park/unpark hooks check if GuC submission has been initialized.
- drop cant_sleep() from trylock helper since rcu_read_lock takes care
of that.
v7: (CI) Fix igt@i915_selftest@live@gt_engines
- For GuC mode of submission the engine busyness is derived from gt time
domain. Use gt time elapsed as reference in the selftest.
- Increase busyness calculation to 10ms duration to ensure batch runs
longer and falls within the busyness tolerances in selftest.
v8:
- Use ktime_get in selftest as before
- intel_reset_trylock_no_wait results in a lockdep splat that is not
trivial to fix since the PMU callback runs in irq context and the
reset paths are tightly knit into the driver. The test that uncovers
this is igt@perf_pmu@faulting-read. Drop intel_reset_trylock_no_wait,
instead use the reset_count to synchronize with gt reset during pmu
callback. For the ping, continue to use intel_reset_trylock since ping
is not run in irq context.
- GuC PM timestamp does not tick when GuC is idle. This can potentially
result in wrong busyness values when a context is active on the
engine, but GuC is idle. Use the RING TIMESTAMP as GPU timestamp to
process the GuC busyness stats. This works since both GuC timestamp and
RING timestamp are synced with the same clock.
- The busyness stats may get updated after the batch starts running.
This delay causes the busyness reported for 100us duration to fall
below 95% in the selftest. The only option at this time is to wait for
GuC busyness to change from idle to active before we sample busyness
over a 100us period.
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20211027004821.66097-2-umesh.nerlige.ramappa@intel.com
2021-10-26 17:48:21 -07:00
|
|
|
} timestamp;
|
2021-12-14 09:05:00 -08:00
|
|
|
|
2023-08-15 17:39:57 -07:00
|
|
|
/**
|
|
|
|
* @dead_guc_worker: Asynchronous worker thread for forcing a GuC reset.
|
|
|
|
* Specifically used when the G2H handler wants to issue a reset. Resets
|
|
|
|
* require flushing the G2H queue. So, the G2H processing itself must not
|
|
|
|
* trigger a reset directly. Instead, go via this worker.
|
|
|
|
*/
|
|
|
|
struct work_struct dead_guc_worker;
|
|
|
|
/**
|
2025-01-20 13:45:10 +05:30
|
|
|
* @last_dead_guc_jiffies: timestamp of previous 'dead guc' occurrence
|
2023-08-15 17:39:57 -07:00
|
|
|
* used to prevent a fundamentally broken system from continuously
|
|
|
|
* reloading the GuC.
|
|
|
|
*/
|
|
|
|
unsigned long last_dead_guc_jiffies;
|
|
|
|
|
2021-12-14 09:05:00 -08:00
|
|
|
#ifdef CONFIG_DRM_I915_SELFTEST
|
|
|
|
/**
|
|
|
|
* @number_guc_id_stolen: The number of guc_ids that have been stolen
|
|
|
|
*/
|
|
|
|
int number_guc_id_stolen;
|
2023-11-13 17:00:16 -08:00
|
|
|
/**
|
|
|
|
* @fast_response_selftest: Backdoor to CT handler for fast response selftest
|
|
|
|
*/
|
|
|
|
u32 fast_response_selftest;
|
2021-12-14 09:05:00 -08:00
|
|
|
#endif
|
2017-10-04 18:13:41 +00:00
|
|
|
};
|
|
|
|
|
2023-10-17 11:08:02 -07:00
|
|
|
struct intel_guc_tlb_wait {
|
|
|
|
struct wait_queue_head wq;
|
|
|
|
bool busy;
|
|
|
|
};
|
|
|
|
|
2022-11-29 15:20:31 -08:00
|
|
|
/*
|
|
|
|
* GuC version number components are only 8-bit, so converting to a 32bit 8.8.8
|
|
|
|
* integer works.
|
|
|
|
*/
|
|
|
|
#define MAKE_GUC_VER(maj, min, pat) (((maj) << 16) | ((min) << 8) | (pat))
|
|
|
|
#define MAKE_GUC_VER_STRUCT(ver) MAKE_GUC_VER((ver).major, (ver).minor, (ver).patch)
|
|
|
|
#define GUC_SUBMIT_VER(guc) MAKE_GUC_VER_STRUCT((guc)->submission_version)
|
2023-10-05 18:35:53 -07:00
|
|
|
#define GUC_FIRMWARE_VER(guc) MAKE_GUC_VER_STRUCT((guc)->fw.file_selected.ver)
|
2022-11-29 15:20:31 -08:00
|
|
|
|
2020-03-26 11:11:20 -07:00
|
|
|
static inline struct intel_guc *log_to_guc(struct intel_guc_log *log)
|
|
|
|
{
|
|
|
|
return container_of(log, struct intel_guc, log);
|
|
|
|
}
|
|
|
|
|
2017-10-04 18:13:41 +00:00
|
|
|
static
|
|
|
|
inline int intel_guc_send(struct intel_guc *guc, const u32 *action, u32 len)
|
|
|
|
{
|
2021-07-08 09:20:52 -07:00
|
|
|
return intel_guc_ct_send(&guc->ct, action, len, NULL, 0, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static
|
2021-07-21 14:50:57 -07:00
|
|
|
inline int intel_guc_send_nb(struct intel_guc *guc, const u32 *action, u32 len,
|
|
|
|
u32 g2h_len_dw)
|
2021-07-08 09:20:52 -07:00
|
|
|
{
|
|
|
|
return intel_guc_ct_send(&guc->ct, action, len, NULL, 0,
|
2021-07-21 14:50:57 -07:00
|
|
|
MAKE_SEND_FLAGS(g2h_len_dw));
|
2018-03-26 19:48:20 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int
|
|
|
|
intel_guc_send_and_receive(struct intel_guc *guc, const u32 *action, u32 len,
|
|
|
|
u32 *response_buf, u32 response_buf_size)
|
|
|
|
{
|
2019-12-16 17:23:14 -08:00
|
|
|
return intel_guc_ct_send(&guc->ct, action, len,
|
2021-07-08 09:20:52 -07:00
|
|
|
response_buf, response_buf_size, 0);
|
2017-10-04 18:13:41 +00:00
|
|
|
}
|
|
|
|
|
2021-07-21 14:50:49 -07:00
|
|
|
static inline int intel_guc_send_busy_loop(struct intel_guc *guc,
|
|
|
|
const u32 *action,
|
|
|
|
u32 len,
|
2021-07-21 14:50:57 -07:00
|
|
|
u32 g2h_len_dw,
|
2021-07-21 14:50:49 -07:00
|
|
|
bool loop)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
unsigned int sleep_period_ms = 1;
|
|
|
|
bool not_atomic = !in_atomic() && !irqs_disabled();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* FIXME: Have caller pass in if we are in an atomic context to avoid
|
|
|
|
* using in_atomic(). It is likely safe here as we check for irqs
|
|
|
|
* disabled which basically all the spin locks in the i915 do but
|
|
|
|
* regardless this should be cleaned up.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* No sleeping with spin locks, just busy loop */
|
|
|
|
might_sleep_if(loop && not_atomic);
|
|
|
|
|
|
|
|
retry:
|
2021-07-21 14:50:57 -07:00
|
|
|
err = intel_guc_send_nb(guc, action, len, g2h_len_dw);
|
2021-07-21 14:50:49 -07:00
|
|
|
if (unlikely(err == -EBUSY && loop)) {
|
|
|
|
if (likely(not_atomic)) {
|
|
|
|
if (msleep_interruptible(sleep_period_ms))
|
|
|
|
return -EINTR;
|
|
|
|
sleep_period_ms = sleep_period_ms << 1;
|
|
|
|
} else {
|
|
|
|
cpu_relax();
|
|
|
|
}
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2022-11-07 18:06:00 -08:00
|
|
|
/* Only call this from the interrupt handler code */
|
2018-03-26 19:48:22 +00:00
|
|
|
static inline void intel_guc_to_host_event_handler(struct intel_guc *guc)
|
|
|
|
{
|
2022-11-07 18:06:00 -08:00
|
|
|
if (guc->interrupts.enabled)
|
|
|
|
intel_guc_ct_event_handler(&guc->ct);
|
2018-03-26 19:48:22 +00:00
|
|
|
}
|
|
|
|
|
2018-03-13 17:32:49 -07:00
|
|
|
/* GuC addresses above GUC_GGTT_TOP also don't map through the GTT */
|
|
|
|
#define GUC_GGTT_TOP 0xFEE00000
|
|
|
|
|
|
|
|
/**
|
|
|
|
* intel_guc_ggtt_offset() - Get and validate the GGTT offset of @vma
|
|
|
|
* @guc: intel_guc structure.
|
|
|
|
* @vma: i915 graphics virtual memory area.
|
|
|
|
*
|
2018-03-13 17:32:50 -07:00
|
|
|
* GuC does not allow any gfx GGTT address that falls into range
|
2018-07-27 16:11:45 +02:00
|
|
|
* [0, ggtt.pin_bias), which is reserved for Boot ROM, SRAM and WOPCM.
|
|
|
|
* Currently, in order to exclude [0, ggtt.pin_bias) address space from
|
2018-03-13 17:32:50 -07:00
|
|
|
* GGTT, all gfx objects used by GuC are allocated with intel_guc_allocate_vma()
|
2018-07-27 16:11:45 +02:00
|
|
|
* and pinned with PIN_OFFSET_BIAS along with the value of ggtt.pin_bias.
|
2018-03-13 17:32:49 -07:00
|
|
|
*
|
2018-03-13 17:32:50 -07:00
|
|
|
* Return: GGTT offset of the @vma.
|
2017-10-16 14:47:13 +00:00
|
|
|
*/
|
2018-03-13 17:32:49 -07:00
|
|
|
static inline u32 intel_guc_ggtt_offset(struct intel_guc *guc,
|
|
|
|
struct i915_vma *vma)
|
2017-10-04 18:13:41 +00:00
|
|
|
{
|
|
|
|
u32 offset = i915_ggtt_offset(vma);
|
|
|
|
|
2018-07-27 16:11:45 +02:00
|
|
|
GEM_BUG_ON(offset < i915_ggtt_pin_bias(vma));
|
2017-10-04 18:13:41 +00:00
|
|
|
GEM_BUG_ON(range_overflows_t(u64, offset, vma->size, GUC_GGTT_TOP));
|
|
|
|
|
|
|
|
return offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
void intel_guc_init_early(struct intel_guc *guc);
|
2021-07-26 17:23:38 -07:00
|
|
|
void intel_guc_init_late(struct intel_guc *guc);
|
2017-10-04 18:13:41 +00:00
|
|
|
void intel_guc_init_send_regs(struct intel_guc *guc);
|
2019-07-24 09:58:49 +01:00
|
|
|
void intel_guc_write_params(struct intel_guc *guc);
|
2017-12-13 23:13:46 +01:00
|
|
|
int intel_guc_init(struct intel_guc *guc);
|
|
|
|
void intel_guc_fini(struct intel_guc *guc);
|
2019-12-16 17:23:15 -08:00
|
|
|
void intel_guc_notify(struct intel_guc *guc);
|
2018-03-26 19:48:20 +00:00
|
|
|
int intel_guc_send_mmio(struct intel_guc *guc, const u32 *action, u32 len,
|
|
|
|
u32 *response_buf, u32 response_buf_size);
|
2019-03-21 12:00:04 +00:00
|
|
|
int intel_guc_to_host_process_recv_msg(struct intel_guc *guc,
|
|
|
|
const u32 *payload, u32 len);
|
2017-10-04 18:13:41 +00:00
|
|
|
int intel_guc_auth_huc(struct intel_guc *guc, u32 rsa_offset);
|
2018-03-02 11:15:49 +00:00
|
|
|
int intel_guc_suspend(struct intel_guc *guc);
|
|
|
|
int intel_guc_resume(struct intel_guc *guc);
|
2017-10-04 18:13:41 +00:00
|
|
|
struct i915_vma *intel_guc_allocate_vma(struct intel_guc *guc, u32 size);
|
2019-12-05 14:02:40 -08:00
|
|
|
int intel_guc_allocate_and_map_vma(struct intel_guc *guc, u32 size,
|
|
|
|
struct i915_vma **out_vma, void **out_vaddr);
|
2022-01-06 16:06:21 -08:00
|
|
|
int intel_guc_self_cfg32(struct intel_guc *guc, u16 key, u32 value);
|
|
|
|
int intel_guc_self_cfg64(struct intel_guc *guc, u16 key, u64 value);
|
2017-10-04 18:13:41 +00:00
|
|
|
|
2019-07-31 22:33:19 +00:00
|
|
|
static inline bool intel_guc_is_supported(struct intel_guc *guc)
|
|
|
|
{
|
2019-08-16 20:56:58 +00:00
|
|
|
return intel_uc_fw_is_supported(&guc->fw);
|
|
|
|
}
|
|
|
|
|
2020-02-18 14:33:23 -08:00
|
|
|
static inline bool intel_guc_is_wanted(struct intel_guc *guc)
|
2019-08-16 20:56:58 +00:00
|
|
|
{
|
|
|
|
return intel_uc_fw_is_enabled(&guc->fw);
|
2019-07-31 22:33:19 +00:00
|
|
|
}
|
|
|
|
|
2020-02-18 14:33:23 -08:00
|
|
|
static inline bool intel_guc_is_used(struct intel_guc *guc)
|
|
|
|
{
|
|
|
|
GEM_BUG_ON(__intel_uc_fw_status(&guc->fw) == INTEL_UC_FIRMWARE_SELECTED);
|
|
|
|
return intel_uc_fw_is_available(&guc->fw);
|
|
|
|
}
|
|
|
|
|
2020-01-31 15:37:06 +00:00
|
|
|
static inline bool intel_guc_is_fw_running(struct intel_guc *guc)
|
2019-05-22 19:31:58 +00:00
|
|
|
{
|
2019-07-24 17:18:09 -07:00
|
|
|
return intel_uc_fw_is_running(&guc->fw);
|
2019-05-22 19:31:58 +00:00
|
|
|
}
|
|
|
|
|
2020-01-31 15:37:06 +00:00
|
|
|
static inline bool intel_guc_is_ready(struct intel_guc *guc)
|
|
|
|
{
|
|
|
|
return intel_guc_is_fw_running(guc) && intel_guc_ct_enabled(&guc->ct);
|
|
|
|
}
|
|
|
|
|
2021-07-26 17:23:25 -07:00
|
|
|
static inline void intel_guc_reset_interrupts(struct intel_guc *guc)
|
|
|
|
{
|
|
|
|
guc->interrupts.reset(guc);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void intel_guc_enable_interrupts(struct intel_guc *guc)
|
|
|
|
{
|
|
|
|
guc->interrupts.enable(guc);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void intel_guc_disable_interrupts(struct intel_guc *guc)
|
|
|
|
{
|
|
|
|
guc->interrupts.disable(guc);
|
|
|
|
}
|
|
|
|
|
2018-03-12 13:03:07 +00:00
|
|
|
static inline int intel_guc_sanitize(struct intel_guc *guc)
|
|
|
|
{
|
|
|
|
intel_uc_fw_sanitize(&guc->fw);
|
2021-07-26 17:23:25 -07:00
|
|
|
intel_guc_disable_interrupts(guc);
|
2020-02-07 13:19:38 +00:00
|
|
|
intel_guc_ct_sanitize(&guc->ct);
|
2019-06-21 11:21:23 -07:00
|
|
|
guc->mmio_msg = 0;
|
|
|
|
|
2018-03-12 13:03:07 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-03-20 18:14:19 +00:00
|
|
|
static inline void intel_guc_enable_msg(struct intel_guc *guc, u32 mask)
|
|
|
|
{
|
|
|
|
spin_lock_irq(&guc->irq_lock);
|
|
|
|
guc->msg_enabled_mask |= mask;
|
|
|
|
spin_unlock_irq(&guc->irq_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void intel_guc_disable_msg(struct intel_guc *guc, u32 mask)
|
|
|
|
{
|
|
|
|
spin_lock_irq(&guc->irq_lock);
|
|
|
|
guc->msg_enabled_mask &= ~mask;
|
|
|
|
spin_unlock_irq(&guc->irq_lock);
|
|
|
|
}
|
|
|
|
|
2021-07-21 14:50:58 -07:00
|
|
|
int intel_guc_wait_for_idle(struct intel_guc *guc, long timeout);
|
|
|
|
|
2021-07-21 14:50:49 -07:00
|
|
|
int intel_guc_deregister_done_process_msg(struct intel_guc *guc,
|
|
|
|
const u32 *msg, u32 len);
|
2021-07-21 14:50:51 -07:00
|
|
|
int intel_guc_sched_done_process_msg(struct intel_guc *guc,
|
|
|
|
const u32 *msg, u32 len);
|
2021-07-26 17:23:27 -07:00
|
|
|
int intel_guc_context_reset_process_msg(struct intel_guc *guc,
|
|
|
|
const u32 *msg, u32 len);
|
2021-07-26 17:23:28 -07:00
|
|
|
int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
|
|
|
|
const u32 *msg, u32 len);
|
2022-01-06 16:06:21 -08:00
|
|
|
int intel_guc_error_capture_process_msg(struct intel_guc *guc,
|
|
|
|
const u32 *msg, u32 len);
|
2023-08-15 17:39:57 -07:00
|
|
|
int intel_guc_crash_process_msg(struct intel_guc *guc, u32 action);
|
2021-07-21 14:50:49 -07:00
|
|
|
|
2022-03-21 09:45:27 -07:00
|
|
|
struct intel_engine_cs *
|
|
|
|
intel_guc_lookup_engine(struct intel_guc *guc, u8 guc_class, u8 instance);
|
|
|
|
|
2021-07-26 17:23:34 -07:00
|
|
|
void intel_guc_find_hung_context(struct intel_engine_cs *engine);
|
|
|
|
|
2021-07-26 17:23:35 -07:00
|
|
|
int intel_guc_global_policies_update(struct intel_guc *guc);
|
|
|
|
|
2021-07-26 17:23:39 -07:00
|
|
|
void intel_guc_context_ban(struct intel_context *ce, struct i915_request *rq);
|
|
|
|
|
2021-07-26 17:23:23 -07:00
|
|
|
void intel_guc_submission_reset_prepare(struct intel_guc *guc);
|
2022-04-25 17:30:45 -07:00
|
|
|
void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled);
|
2021-07-26 17:23:23 -07:00
|
|
|
void intel_guc_submission_reset_finish(struct intel_guc *guc);
|
|
|
|
void intel_guc_submission_cancel_requests(struct intel_guc *guc);
|
|
|
|
|
2020-03-26 11:11:19 -07:00
|
|
|
void intel_guc_load_status(struct intel_guc *guc, struct drm_printer *p);
|
|
|
|
|
2021-10-14 10:19:52 -07:00
|
|
|
void intel_guc_write_barrier(struct intel_guc *guc);
|
|
|
|
|
2022-08-19 13:39:04 +01:00
|
|
|
void intel_guc_dump_time_info(struct intel_guc *guc, struct drm_printer *p);
|
2022-07-27 19:20:24 -07:00
|
|
|
|
drm/i915/guc: Delay disabling guc_id scheduling for better hysteresis
Add a delay, configurable via debugfs (default 34ms), to disable
scheduling of a context after the pin count goes to zero. Disable
scheduling is a costly operation as it requires synchronizing with
the GuC. So the idea is that a delay allows the user to resubmit
something before doing this operation. This delay is only done if
the context isn't closed and less than a given threshold
(default is 3/4) of the guc_ids are in use.
Alan Previn: Matt Brost first introduced this patch back in Oct 2021.
However no real world workload with measured performance impact was
available to prove the intended results. Today, this series is being
republished in response to a real world workload that benefited greatly
from it along with measured performance improvement.
Workload description: 36 containers were created on a DG2 device where
each container was performing a combination of 720p 3d game rendering
and 30fps video encoding. The workload density was configured in a way
that guaranteed each container to ALWAYS be able to render and
encode no less than 30fps with a predefined maximum render + encode
latency time. That means the totality of all 36 containers and their
workloads were not saturating the engines to their max (in order to
maintain just enough headroom to meet the min fps and max latencies
of incoming container submissions).
Problem statement: It was observed that the CPU core processing the i915
soft IRQ work was experiencing severe load. Using tracelogs and an
instrumentation patch to count specific i915 IRQ events, it was confirmed
that the majority of the CPU cycles were caused by the
gen11_other_irq_handler() -> guc_irq_handler() code path. The vast
majority of the cycles was determined to be processing a specific G2H
IRQ: i.e. INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_DONE. These IRQs are sent
by GuC in response to i915 KMD sending H2G requests:
INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_SET. Those H2G requests are sent
whenever a context goes idle so that we can unpin the context from GuC.
The high CPU utilization % symptom was limiting density scaling.
Root Cause Analysis: Because the incoming execution buffers were spread
across 36 different containers (each with multiple contexts) but the
system in totality was NOT saturated to the max, it was assumed that each
context was constantly idling between submissions. This was causing
a thrashing of unpinning contexts from GuC at one moment, followed quickly
by repinning them due to incoming workload the very next moment. These
event-pairs were being triggered across multiple contexts per container,
across all containers at the rate of > 30 times per sec per context.
Metrics: When running this workload without this patch, we measured an
average of ~69K INTEL_GUC_ACTION_SCHED_CONTEXT_MODE_DONE events every 10
seconds or ~10 million times over ~25+ mins. With this patch, the count
reduced to ~480 every 10 seconds or about ~28K over ~10 mins. The
improvement observed is ~99% for the average counts per 10 seconds.
Design awareness: Selftest impact.
As temporary WA disable this feature for the selftests. Selftests are
very timing sensitive and any change in timing can cause failure. A
follow up patch will fixup the selftests to understand this delay.
Design awareness: Race between guc_request_alloc and guc_context_close.
If a context close is issued while there is a request submission in
flight and a delayed schedule disable is pending, guc_context_close
and guc_request_alloc will race to cancel the delayed disable.
To close the race, make sure that guc_request_alloc waits for
guc_context_close to finish running before checking any state.
Design awareness: GT Reset event.
If a gt reset is triggered, as preparation steps, add an additional step
to ensure all contexts that have a pending delay-disable-schedule task
be flushed of it. Move them directly into the closed state after cancelling
the worker. This is okay because the existing flow flushes all
yet-to-arrive G2H's dropping them anyway.
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20221006225121.826257-2-alan.previn.teres.alexis@intel.com
2022-10-06 15:51:20 -07:00
|
|
|
int intel_guc_sched_disable_gucid_threshold_max(struct intel_guc *guc);
|
|
|
|
|
2023-10-17 11:08:02 -07:00
|
|
|
bool intel_guc_tlb_invalidation_is_available(struct intel_guc *guc);
|
|
|
|
int intel_guc_invalidate_tlb_engines(struct intel_guc *guc);
|
|
|
|
int intel_guc_invalidate_tlb_guc(struct intel_guc *guc);
|
|
|
|
int intel_guc_tlb_invalidation_done(struct intel_guc *guc,
|
|
|
|
const u32 *payload, u32 len);
|
2023-10-17 11:08:03 -07:00
|
|
|
void wake_up_all_tlb_invalidate(struct intel_guc *guc);
|
2017-10-04 18:13:41 +00:00
|
|
|
#endif
|