mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-04 08:17:46 +00:00
drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning
Use asynchronous polling to handle umc_v12_0 poisoning.

v2:
1. Change function name.
2. Change the debugging information content.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
ee9c3031d0
commit
6c23f3d12a
3 changed files with 113 additions and 28 deletions
|
@ -120,6 +120,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
|
|||
/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
|
||||
#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
|
||||
|
||||
#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms
|
||||
|
||||
enum amdgpu_ras_retire_page_reservation {
|
||||
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
|
||||
AMDGPU_RAS_RETIRE_PAGE_PENDING,
|
||||
|
@ -2674,6 +2676,9 @@ static int amdgpu_ras_page_retirement_thread(void *param)
|
|||
atomic_read(&con->page_retirement_req_cnt));
|
||||
|
||||
atomic_dec(&con->page_retirement_req_cnt);
|
||||
|
||||
amdgpu_umc_bad_page_polling_timeout(adev,
|
||||
false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
|
||||
#include "amdgpu.h"
|
||||
#include "umc_v6_7.h"
|
||||
#define MAX_UMC_POISON_POLLING_TIME_SYNC 20 //ms
|
||||
|
||||
static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
|
||||
struct ras_err_data *err_data, uint64_t err_addr,
|
||||
|
@ -85,17 +86,14 @@ out_fini_err_data:
|
|||
return ret;
|
||||
}
|
||||
|
||||
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
|
||||
void *ras_error_status,
|
||||
struct amdgpu_iv_entry *entry,
|
||||
bool reset)
|
||||
static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
|
||||
void *ras_error_status)
|
||||
{
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
int ret = 0;
|
||||
unsigned long err_count;
|
||||
|
||||
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
|
||||
mutex_lock(&con->page_retirement_lock);
|
||||
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
|
||||
if (ret == -EOPNOTSUPP) {
|
||||
if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
|
||||
|
@ -163,19 +161,85 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
|
|||
con->update_channel_flag = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (reset) {
|
||||
/* use mode-2 reset for poison consumption */
|
||||
if (!entry)
|
||||
con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
amdgpu_ras_reset_gpu(adev);
|
||||
}
|
||||
}
|
||||
|
||||
kfree(err_data->err_addr);
|
||||
|
||||
mutex_unlock(&con->page_retirement_lock);
|
||||
}
|
||||
|
||||
/*
 * Synchronous UMC poison handling path: retire the bad pages reported in
 * @ras_error_status, then optionally trigger a GPU reset to consume the
 * poison.
 *
 * @adev: amdgpu device handle
 * @ras_error_status: pointer to a struct ras_err_data whose ue/ce/de counts
 *                    are filled in by amdgpu_umc_handle_bad_pages()
 * @entry: the originating interrupt vector entry, or NULL when invoked from
 *         the poison-consumption path (see reset handling below)
 * @reset: whether a GPU reset may be issued after page retirement
 *
 * Returns AMDGPU_RAS_SUCCESS unconditionally; failures inside the bad-page
 * handling are reported through err_data counters, not the return value.
 */
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry,
		bool reset)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	/* Notify KFD that SRAM ECC activity occurred before retiring pages.
	 * NOTE(review): presumably so compute clients can react to the ECC
	 * event - confirm against kgd2kfd_set_sram_ecc_flag(). */
	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
	amdgpu_umc_handle_bad_pages(adev, ras_error_status);

	/* Only reset when uncorrectable errors were actually found. */
	if (err_data->ue_count && reset) {
		/* use mode-2 reset for poison consumption */
		if (!entry)
			con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
		amdgpu_ras_reset_gpu(adev);
	}

	return AMDGPU_RAS_SUCCESS;
}
|
||||
|
||||
/*
 * Poll for newly reported UMC deferred (poison) errors, retiring bad pages
 * on each iteration, until either a deferred error is seen or @timeout_ms
 * milliseconds have elapsed.
 *
 * Used by both the asynchronous page-retirement thread (reset == false,
 * MAX_UMC_POISON_POLLING_TIME_ASYNC) and the synchronous poison handler
 * (reset == true, MAX_UMC_POISON_POLLING_TIME_SYNC).
 *
 * @adev:       amdgpu device handle
 * @reset:      if true, unconditionally issue a mode-2 GPU reset at the end
 *              to consume the poison (unlike amdgpu_umc_do_page_retirement(),
 *              this does not depend on the error counts found)
 * @timeout_ms: maximum polling time; granularity is 1 ms per loop iteration
 *
 * Always returns 0; a timeout is only reported via dev_warn().
 */
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
			bool reset, uint32_t timeout_ms)
{
	struct ras_err_data err_data;
	struct ras_common_if head = {
		.block = AMDGPU_RAS_BLOCK__UMC,
	};
	/* RAS manager object for the UMC block; may be NULL, handled below. */
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
	uint32_t timeout = timeout_ms;

	memset(&err_data, 0, sizeof(err_data));
	amdgpu_ras_error_data_init(&err_data);

	/* Poll until a deferred error shows up or the budget is exhausted.
	 * Each pass re-queries the UMC and retires any bad pages found; the
	 * de_count accumulates across iterations since err_data is reused. */
	do {

		amdgpu_umc_handle_bad_pages(adev, &err_data);

		if (timeout && !err_data.de_count) {
			msleep(1);
			timeout--;
		}

	} while (timeout && !err_data.de_count);

	if (!timeout)
		dev_warn(adev->dev, "Can't find bad pages\n");

	if (err_data.de_count)
		dev_info(adev->dev, "%ld new deferred hardware errors detected\n", err_data.de_count);

	/* Fold this run's counters into the per-block RAS bookkeeping. */
	if (obj) {
		obj->err_data.ue_count += err_data.ue_count;
		obj->err_data.ce_count += err_data.ce_count;
		obj->err_data.de_count += err_data.de_count;
	}

	amdgpu_ras_error_data_fini(&err_data);

	/* Notify KFD of the SRAM ECC event after retirement completes. */
	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);

	if (reset) {
		struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

		/* use mode-2 reset for poison consumption */
		con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
		amdgpu_ras_reset_gpu(adev);
	}

	return 0;
}
|
||||
|
||||
int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
|
||||
{
|
||||
int ret = AMDGPU_RAS_SUCCESS;
|
||||
|
@ -193,25 +257,38 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
|
|||
}
|
||||
|
||||
if (!amdgpu_sriov_vf(adev)) {
|
||||
struct ras_err_data err_data;
|
||||
struct ras_common_if head = {
|
||||
.block = AMDGPU_RAS_BLOCK__UMC,
|
||||
};
|
||||
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
|
||||
if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) {
|
||||
struct ras_err_data err_data;
|
||||
struct ras_common_if head = {
|
||||
.block = AMDGPU_RAS_BLOCK__UMC,
|
||||
};
|
||||
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
|
||||
|
||||
ret = amdgpu_ras_error_data_init(&err_data);
|
||||
if (ret)
|
||||
return ret;
|
||||
ret = amdgpu_ras_error_data_init(&err_data);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
|
||||
ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
|
||||
|
||||
if (ret == AMDGPU_RAS_SUCCESS && obj) {
|
||||
obj->err_data.ue_count += err_data.ue_count;
|
||||
obj->err_data.ce_count += err_data.ce_count;
|
||||
obj->err_data.de_count += err_data.de_count;
|
||||
if (ret == AMDGPU_RAS_SUCCESS && obj) {
|
||||
obj->err_data.ue_count += err_data.ue_count;
|
||||
obj->err_data.ce_count += err_data.ce_count;
|
||||
obj->err_data.de_count += err_data.de_count;
|
||||
}
|
||||
|
||||
amdgpu_ras_error_data_fini(&err_data);
|
||||
} else {
|
||||
if (reset) {
|
||||
amdgpu_umc_bad_page_polling_timeout(adev,
|
||||
reset, MAX_UMC_POISON_POLLING_TIME_SYNC);
|
||||
} else {
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
|
||||
atomic_inc(&con->page_retirement_req_cnt);
|
||||
|
||||
wake_up(&con->page_retirement_wq);
|
||||
}
|
||||
}
|
||||
|
||||
amdgpu_ras_error_data_fini(&err_data);
|
||||
} else {
|
||||
if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
|
||||
adev->virt.ops->ras_poison_handler(adev);
|
||||
|
|
|
@ -118,4 +118,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
|
|||
|
||||
int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
|
||||
umc_func func, void *data);
|
||||
|
||||
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
|
||||
bool reset, uint32_t timeout_ms);
|
||||
#endif
|
||||
|
|
Loading…
Add table
Reference in a new issue