mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00
drm/amdgpu: add RAS poison handling for MCA
For MCA poison, if unmapping the queue fails, only a GPU reset should be triggered, without page retirement handling; the MCA notifier will do the page retirement. v2: handle MCA poison consumption in umc_poison_handler directly. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
24b822928b
commit
ae45a18b80
1 changed files with 20 additions and 11 deletions
|
@ -169,19 +169,28 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
|
|||
void *ras_error_status,
|
||||
bool reset)
|
||||
{
|
||||
int ret;
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
|
||||
struct ras_common_if head = {
|
||||
.block = AMDGPU_RAS_BLOCK__UMC,
|
||||
};
|
||||
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
|
||||
int ret = AMDGPU_RAS_SUCCESS;
|
||||
|
||||
ret =
|
||||
amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
|
||||
if (!adev->gmc.xgmi.connected_to_cpu) {
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
|
||||
struct ras_common_if head = {
|
||||
.block = AMDGPU_RAS_BLOCK__UMC,
|
||||
};
|
||||
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
|
||||
|
||||
if (ret == AMDGPU_RAS_SUCCESS && obj) {
|
||||
obj->err_data.ue_count += err_data->ue_count;
|
||||
obj->err_data.ce_count += err_data->ce_count;
|
||||
ret =
|
||||
amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
|
||||
|
||||
if (ret == AMDGPU_RAS_SUCCESS && obj) {
|
||||
obj->err_data.ue_count += err_data->ue_count;
|
||||
obj->err_data.ce_count += err_data->ce_count;
|
||||
}
|
||||
} else if (reset) {
|
||||
/* MCA poison handler is only responsible for GPU reset,
|
||||
* let MCA notifier do page retirement.
|
||||
*/
|
||||
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
|
||||
amdgpu_ras_reset_gpu(adev);
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
|
Loading…
Add table
Reference in a new issue