Merge tag 'drm-xe-fixes-2025-09-11' of https://gitlab.freedesktop.org/drm/xe/kernel into drm-fixes

- Don't touch survivability_mode on fini (Michal)
- Fixes around eviction and suspend (Thomas)
- Extend Wa_13011645652 to PTL-H, WCL (Julia)

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://lore.kernel.org/r/aMLq7QlaEPHGKXKX@intel.com
commit 9a3f210737
Dave Airlie, 2025-09-12 09:39:06 +10:00
13 changed files with 115 additions and 29 deletions


@@ -236,7 +236,7 @@ static int evict_test_run_tile(struct xe_device *xe, struct xe_tile *tile, struc
         }
 
         xe_bo_lock(external, false);
-        err = xe_bo_pin_external(external);
+        err = xe_bo_pin_external(external, false);
         xe_bo_unlock(external);
         if (err) {
                 KUNIT_FAIL(test, "external bo pin err=%pe\n",


@@ -89,15 +89,7 @@ static void check_residency(struct kunit *test, struct xe_bo *exported,
                 return;
         }
 
-        /*
-         * If on different devices, the exporter is kept in system if
-         * possible, saving a migration step as the transfer is just
-         * likely as fast from system memory.
-         */
-        if (params->mem_mask & XE_BO_FLAG_SYSTEM)
-                KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, XE_PL_TT));
-        else
-                KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, mem_type));
+        KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, mem_type));
 
         if (params->force_different_devices)
                 KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(imported, XE_PL_TT));


@@ -186,6 +186,8 @@ static void try_add_system(struct xe_device *xe, struct xe_bo *bo,
 
                 bo->placements[*c] = (struct ttm_place) {
                         .mem_type = XE_PL_TT,
+                        .flags = (bo_flags & XE_BO_FLAG_VRAM_MASK) ?
+                                 TTM_PL_FLAG_FALLBACK : 0,
                 };
                 *c += 1;
         }
@@ -2269,6 +2271,7 @@ uint64_t vram_region_gpu_offset(struct ttm_resource *res)
 /**
  * xe_bo_pin_external - pin an external BO
  * @bo: buffer object to be pinned
+ * @in_place: Pin in current placement, don't attempt to migrate.
  *
  * Pin an external (not tied to a VM, can be exported via dma-buf / prime FD)
  * BO. Unique call compared to xe_bo_pin as this function has it own set of
@@ -2276,7 +2279,7 @@ uint64_t vram_region_gpu_offset(struct ttm_resource *res)
  *
  * Returns 0 for success, negative error code otherwise.
  */
-int xe_bo_pin_external(struct xe_bo *bo)
+int xe_bo_pin_external(struct xe_bo *bo, bool in_place)
 {
         struct xe_device *xe = xe_bo_device(bo);
         int err;
@@ -2285,9 +2288,11 @@ int xe_bo_pin_external(struct xe_bo *bo)
         xe_assert(xe, xe_bo_is_user(bo));
 
         if (!xe_bo_is_pinned(bo)) {
-                err = xe_bo_validate(bo, NULL, false);
-                if (err)
-                        return err;
+                if (!in_place) {
+                        err = xe_bo_validate(bo, NULL, false);
+                        if (err)
+                                return err;
+                }
 
                 spin_lock(&xe->pinned.lock);
                 list_add_tail(&bo->pinned_link, &xe->pinned.late.external);
@@ -2440,6 +2445,9 @@ int xe_bo_validate(struct xe_bo *bo, struct xe_vm *vm, bool allow_res_evict)
         };
         int ret;
 
+        if (xe_bo_is_pinned(bo))
+                return 0;
+
         if (vm) {
                 lockdep_assert_held(&vm->lock);
                 xe_vm_assert_held(vm);


@@ -198,7 +198,7 @@ static inline void xe_bo_unlock_vm_held(struct xe_bo *bo)
         }
 }
 
-int xe_bo_pin_external(struct xe_bo *bo);
+int xe_bo_pin_external(struct xe_bo *bo, bool in_place);
 int xe_bo_pin(struct xe_bo *bo);
 void xe_bo_unpin_external(struct xe_bo *bo);
 void xe_bo_unpin(struct xe_bo *bo);


@@ -553,6 +553,12 @@ struct xe_device {
 
         /** @pm_notifier: Our PM notifier to perform actions in response to various PM events. */
         struct notifier_block pm_notifier;
+        /** @pm_block: Completion to block validating tasks on suspend / hibernate prepare */
+        struct completion pm_block;
+        /** @rebind_resume_list: List of wq items to kick on resume. */
+        struct list_head rebind_resume_list;
+        /** @rebind_resume_lock: Lock to protect the rebind_resume_list */
+        struct mutex rebind_resume_lock;
 
         /** @pmt: Support the PMT driver callback interface */
         struct {


@@ -72,7 +72,7 @@ static int xe_dma_buf_pin(struct dma_buf_attachment *attach)
                 return ret;
         }
 
-        ret = xe_bo_pin_external(bo);
+        ret = xe_bo_pin_external(bo, true);
         xe_assert(xe, !ret);
 
         return 0;


@@ -237,6 +237,15 @@ retry:
                 goto err_unlock_list;
         }
 
+        /*
+         * It's OK to block interruptible here with the vm lock held, since
+         * on task freezing during suspend / hibernate, the call will
+         * return -ERESTARTSYS and the IOCTL will be rerun.
+         */
+        err = wait_for_completion_interruptible(&xe->pm_block);
+        if (err)
+                goto err_unlock_list;
+
         vm_exec.vm = &vm->gpuvm;
         vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
         if (xe_vm_in_lr_mode(vm)) {


@@ -24,6 +24,7 @@
 #include "xe_pcode.h"
 #include "xe_pxp.h"
 #include "xe_trace.h"
+#include "xe_vm.h"
 #include "xe_wa.h"
 
 /**
@@ -290,6 +291,19 @@ static u32 vram_threshold_value(struct xe_device *xe)
         return DEFAULT_VRAM_THRESHOLD;
 }
 
+static void xe_pm_wake_rebind_workers(struct xe_device *xe)
+{
+        struct xe_vm *vm, *next;
+
+        mutex_lock(&xe->rebind_resume_lock);
+        list_for_each_entry_safe(vm, next, &xe->rebind_resume_list,
+                                 preempt.pm_activate_link) {
+                list_del_init(&vm->preempt.pm_activate_link);
+                xe_vm_resume_rebind_worker(vm);
+        }
+        mutex_unlock(&xe->rebind_resume_lock);
+}
+
 static int xe_pm_notifier_callback(struct notifier_block *nb,
                                    unsigned long action, void *data)
 {
@@ -299,30 +313,30 @@ static int xe_pm_notifier_callback(struct notifier_block *nb,
         switch (action) {
         case PM_HIBERNATION_PREPARE:
         case PM_SUSPEND_PREPARE:
+                reinit_completion(&xe->pm_block);
                 xe_pm_runtime_get(xe);
                 err = xe_bo_evict_all_user(xe);
-                if (err) {
+                if (err)
                         drm_dbg(&xe->drm, "Notifier evict user failed (%d)\n", err);
-                        xe_pm_runtime_put(xe);
-                        break;
-                }
 
                 err = xe_bo_notifier_prepare_all_pinned(xe);
-                if (err) {
+                if (err)
                         drm_dbg(&xe->drm, "Notifier prepare pin failed (%d)\n", err);
-                        xe_pm_runtime_put(xe);
-                }
+                /*
+                 * Keep the runtime pm reference until post hibernation / post suspend to
+                 * avoid a runtime suspend interfering with evicted objects or backup
+                 * allocations.
+                 */
                 break;
         case PM_POST_HIBERNATION:
         case PM_POST_SUSPEND:
+                complete_all(&xe->pm_block);
+                xe_pm_wake_rebind_workers(xe);
                 xe_bo_notifier_unprepare_all_pinned(xe);
                 xe_pm_runtime_put(xe);
                 break;
         }
 
-        if (err)
-                return NOTIFY_BAD;
-
         return NOTIFY_DONE;
 }
@@ -344,6 +358,14 @@ int xe_pm_init(struct xe_device *xe)
         if (err)
                 return err;
 
+        err = drmm_mutex_init(&xe->drm, &xe->rebind_resume_lock);
+        if (err)
+                goto err_unregister;
+
+        init_completion(&xe->pm_block);
+        complete_all(&xe->pm_block);
+        INIT_LIST_HEAD(&xe->rebind_resume_list);
+
         /* For now suspend/resume is only allowed with GuC */
         if (!xe_device_uc_enabled(xe))
                 return 0;


@@ -41,6 +41,8 @@
  *
  * # echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
  *
+ * It is the responsibility of the user to clear the mode once firmware flash is complete.
+ *
  * Refer :ref:`xe_configfs` for more details on how to use configfs
  *
  * Survivability mode is indicated by the below admin-only readable sysfs which provides additional
@@ -147,7 +149,6 @@ static void xe_survivability_mode_fini(void *arg)
         struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
         struct device *dev = &pdev->dev;
 
-        xe_configfs_clear_survivability_mode(pdev);
         sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
 }


@@ -393,6 +393,9 @@ static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
                 list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
                                &vm->rebind_list);
 
+        if (!try_wait_for_completion(&vm->xe->pm_block))
+                return -EAGAIN;
+
         ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false);
         if (ret)
                 return ret;
@@ -479,6 +482,33 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
         return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
 }
 
+static bool vm_suspend_rebind_worker(struct xe_vm *vm)
+{
+        struct xe_device *xe = vm->xe;
+        bool ret = false;
+
+        mutex_lock(&xe->rebind_resume_lock);
+        if (!try_wait_for_completion(&vm->xe->pm_block)) {
+                ret = true;
+                list_move_tail(&vm->preempt.pm_activate_link, &xe->rebind_resume_list);
+        }
+        mutex_unlock(&xe->rebind_resume_lock);
+
+        return ret;
+}
+
+/**
+ * xe_vm_resume_rebind_worker() - Resume the rebind worker.
+ * @vm: The vm whose preempt worker to resume.
+ *
+ * Resume a preempt worker that was previously suspended by
+ * vm_suspend_rebind_worker().
+ */
+void xe_vm_resume_rebind_worker(struct xe_vm *vm)
+{
+        queue_work(vm->xe->ordered_wq, &vm->preempt.rebind_work);
+}
+
 static void preempt_rebind_work_func(struct work_struct *w)
 {
         struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
@@ -502,6 +532,11 @@ static void preempt_rebind_work_func(struct work_struct *w)
         }
 
 retry:
+        if (!try_wait_for_completion(&vm->xe->pm_block) && vm_suspend_rebind_worker(vm)) {
+                up_write(&vm->lock);
+                return;
+        }
+
         if (xe_vm_userptr_check_repin(vm)) {
                 err = xe_vm_userptr_pin(vm);
                 if (err)
@@ -1714,6 +1749,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef)
         if (flags & XE_VM_FLAG_LR_MODE) {
                 INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
                 xe_pm_runtime_get_noresume(xe);
+                INIT_LIST_HEAD(&vm->preempt.pm_activate_link);
         }
 
         if (flags & XE_VM_FLAG_FAULT_MODE) {
@@ -1895,8 +1931,12 @@ void xe_vm_close_and_put(struct xe_vm *vm)
         xe_assert(xe, !vm->preempt.num_exec_queues);
 
         xe_vm_close(vm);
-        if (xe_vm_in_preempt_fence_mode(vm))
+        if (xe_vm_in_preempt_fence_mode(vm)) {
+                mutex_lock(&xe->rebind_resume_lock);
+                list_del_init(&vm->preempt.pm_activate_link);
+                mutex_unlock(&xe->rebind_resume_lock);
                 flush_work(&vm->preempt.rebind_work);
+        }
         if (xe_vm_in_fault_mode(vm))
                 xe_svm_close(vm);


@@ -273,6 +273,8 @@ struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo,
                                        struct xe_exec_queue *q, u64 addr,
                                        enum xe_cache_level cache_lvl);
 
+void xe_vm_resume_rebind_worker(struct xe_vm *vm);
+
 /**
  * xe_vm_resv() - Return's the vm's reservation object
  * @vm: The vm


@@ -293,6 +293,11 @@ struct xe_vm {
                  * BOs
                  */
                 struct work_struct rebind_work;
+                /**
+                 * @preempt.pm_activate_link: Link to list of rebind workers to be
+                 * kicked on resume.
+                 */
+                struct list_head pm_activate_link;
         } preempt;
 
         /** @um: unified memory state */


@@ -30,7 +30,8 @@
 16022287689    GRAPHICS_VERSION(2001)
                GRAPHICS_VERSION(2004)
 13011645652    GRAPHICS_VERSION(2004)
-               GRAPHICS_VERSION(3001)
+               GRAPHICS_VERSION_RANGE(3000, 3001)
+               GRAPHICS_VERSION(3003)
 14022293748    GRAPHICS_VERSION_RANGE(2001, 2002)
                GRAPHICS_VERSION(2004)
                GRAPHICS_VERSION_RANGE(3000, 3001)