mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00
drm/amdgpu: make reset method configurable for RAS poison
Each RAS block has different requirements for GPU reset in poison consumption handling. Add support for mmhub RAS poison consumption handling. v2: remove the mmhub poison support for kfd int v10. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
e3d4de8d8b
commit
2fc46e0b2f
8 changed files with 41 additions and 29 deletions
|
@ -748,7 +748,7 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
|
|||
}
|
||||
|
||||
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, bool reset)
|
||||
enum amdgpu_ras_block block, uint32_t reset)
|
||||
{
|
||||
amdgpu_umc_poison_handler(adev, block, reset);
|
||||
}
|
||||
|
|
|
@ -336,7 +336,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
|
|||
int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
|
||||
struct tile_config *config);
|
||||
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, bool reset);
|
||||
enum amdgpu_ras_block block, uint32_t reset);
|
||||
bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
|
||||
bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
|
||||
void amdgpu_amdkfd_block_mmu_notifications(void *p);
|
||||
|
|
|
@ -2051,7 +2051,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
|
|||
}
|
||||
}
|
||||
|
||||
amdgpu_umc_poison_handler(adev, obj->head.block, false);
|
||||
amdgpu_umc_poison_handler(adev, obj->head.block, 0);
|
||||
|
||||
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
|
||||
poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
|
||||
|
@ -2704,7 +2704,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
|
|||
atomic_dec(&con->page_retirement_req_cnt);
|
||||
|
||||
amdgpu_umc_bad_page_polling_timeout(adev,
|
||||
false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
|
||||
0, MAX_UMC_POISON_POLLING_TIME_ASYNC);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -177,7 +177,7 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
|
|||
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
|
||||
void *ras_error_status,
|
||||
struct amdgpu_iv_entry *entry,
|
||||
bool reset)
|
||||
uint32_t reset)
|
||||
{
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
|
@ -186,9 +186,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
|
|||
amdgpu_umc_handle_bad_pages(adev, ras_error_status);
|
||||
|
||||
if (err_data->ue_count && reset) {
|
||||
/* use mode-2 reset for poison consumption */
|
||||
if (!entry)
|
||||
con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
con->gpu_reset_flags |= reset;
|
||||
amdgpu_ras_reset_gpu(adev);
|
||||
}
|
||||
|
||||
|
@ -196,7 +194,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
|
|||
}
|
||||
|
||||
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
|
||||
bool reset, uint32_t timeout_ms)
|
||||
uint32_t reset, uint32_t timeout_ms)
|
||||
{
|
||||
struct ras_err_data err_data;
|
||||
struct ras_common_if head = {
|
||||
|
@ -238,8 +236,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
|
|||
if (reset) {
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
|
||||
/* use mode-2 reset for poison consumption */
|
||||
con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
con->gpu_reset_flags |= reset;
|
||||
amdgpu_ras_reset_gpu(adev);
|
||||
}
|
||||
|
||||
|
@ -247,7 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
|
|||
}
|
||||
|
||||
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, bool reset)
|
||||
enum amdgpu_ras_block block, uint32_t reset)
|
||||
{
|
||||
int ret = AMDGPU_RAS_SUCCESS;
|
||||
|
||||
|
@ -311,7 +308,8 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
|
|||
void *ras_error_status,
|
||||
struct amdgpu_iv_entry *entry)
|
||||
{
|
||||
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
|
||||
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry,
|
||||
AMDGPU_RAS_GPU_RESET_MODE1_RESET);
|
||||
}
|
||||
|
||||
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
|
||||
|
|
|
@ -101,7 +101,7 @@ struct amdgpu_umc {
|
|||
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);
|
||||
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
|
||||
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, bool reset);
|
||||
enum amdgpu_ras_block block, uint32_t reset);
|
||||
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
|
||||
struct amdgpu_irq_src *source,
|
||||
struct amdgpu_iv_entry *entry);
|
||||
|
@ -121,5 +121,5 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
|
|||
umc_func func, void *data);
|
||||
|
||||
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
|
||||
bool reset, uint32_t timeout_ms);
|
||||
uint32_t reset, uint32_t timeout_ms);
|
||||
#endif
|
||||
|
|
|
@ -134,6 +134,7 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
|
|||
{
|
||||
enum amdgpu_ras_block block = 0;
|
||||
int old_poison, ret = -EINVAL;
|
||||
uint32_t reset = 0;
|
||||
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
|
||||
|
||||
if (!p)
|
||||
|
@ -153,6 +154,8 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
|
|||
case SOC15_IH_CLIENTID_UTCL2:
|
||||
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
|
||||
block = AMDGPU_RAS_BLOCK__GFX;
|
||||
if (ret)
|
||||
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
break;
|
||||
case SOC15_IH_CLIENTID_SDMA0:
|
||||
case SOC15_IH_CLIENTID_SDMA1:
|
||||
|
@ -160,6 +163,7 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
|
|||
case SOC15_IH_CLIENTID_SDMA3:
|
||||
case SOC15_IH_CLIENTID_SDMA4:
|
||||
block = AMDGPU_RAS_BLOCK__SDMA;
|
||||
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
@ -170,17 +174,16 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev,
|
|||
/* resetting queue passes, do page retirement without gpu reset
|
||||
* resetting queue fails, fallback to gpu reset solution
|
||||
*/
|
||||
if (!ret) {
|
||||
if (!ret)
|
||||
dev_warn(dev->adev->dev,
|
||||
"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
|
||||
client_id);
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
|
||||
} else {
|
||||
else
|
||||
dev_warn(dev->adev->dev,
|
||||
"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
|
||||
client_id);
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
|
||||
}
|
||||
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
|
||||
}
|
||||
|
||||
static bool event_interrupt_isr_v10(struct kfd_node *dev,
|
||||
|
|
|
@ -193,6 +193,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
|
|||
{
|
||||
enum amdgpu_ras_block block = 0;
|
||||
int ret = -EINVAL;
|
||||
uint32_t reset = 0;
|
||||
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
|
||||
|
||||
if (!p)
|
||||
|
@ -212,10 +213,13 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
|
|||
if (dev->dqm->ops.reset_queues)
|
||||
ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
|
||||
block = AMDGPU_RAS_BLOCK__GFX;
|
||||
if (ret)
|
||||
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
break;
|
||||
case SOC21_INTSRC_SDMA_ECC:
|
||||
default:
|
||||
block = AMDGPU_RAS_BLOCK__GFX;
|
||||
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -223,10 +227,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
|
|||
|
||||
/* resetting queue passes, do page retirement without gpu reset
|
||||
resetting queue fails, fallback to gpu reset solution */
|
||||
if (!ret)
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
|
||||
else
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
|
||||
}
|
||||
|
||||
static bool event_interrupt_isr_v11(struct kfd_node *dev,
|
||||
|
|
|
@ -145,6 +145,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
|
|||
{
|
||||
enum amdgpu_ras_block block = 0;
|
||||
int old_poison, ret = -EINVAL;
|
||||
uint32_t reset = 0;
|
||||
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
|
||||
|
||||
if (!p)
|
||||
|
@ -164,6 +165,15 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
|
|||
case SOC15_IH_CLIENTID_UTCL2:
|
||||
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
|
||||
block = AMDGPU_RAS_BLOCK__GFX;
|
||||
if (ret)
|
||||
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
break;
|
||||
case SOC15_IH_CLIENTID_VMC:
|
||||
case SOC15_IH_CLIENTID_VMC1:
|
||||
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
|
||||
block = AMDGPU_RAS_BLOCK__MMHUB;
|
||||
if (ret)
|
||||
reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
|
||||
break;
|
||||
case SOC15_IH_CLIENTID_SDMA0:
|
||||
case SOC15_IH_CLIENTID_SDMA1:
|
||||
|
@ -171,6 +181,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
|
|||
case SOC15_IH_CLIENTID_SDMA3:
|
||||
case SOC15_IH_CLIENTID_SDMA4:
|
||||
block = AMDGPU_RAS_BLOCK__SDMA;
|
||||
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
@ -181,17 +192,16 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
|
|||
/* resetting queue passes, do page retirement without gpu reset
|
||||
* resetting queue fails, fallback to gpu reset solution
|
||||
*/
|
||||
if (!ret) {
|
||||
if (!ret)
|
||||
dev_warn(dev->adev->dev,
|
||||
"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
|
||||
client_id);
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
|
||||
} else {
|
||||
else
|
||||
dev_warn(dev->adev->dev,
|
||||
"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
|
||||
client_id);
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
|
||||
}
|
||||
|
||||
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
|
||||
}
|
||||
|
||||
static bool context_id_expected(struct kfd_dev *dev)
|
||||
|
|
Loading…
Add table
Reference in a new issue