mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00
RAS/AMD/ATL: Add MI300 row retirement support
DRAM row retirement depends on model-specific information that is best done within the AMD Address Translation Library. Export a generic wrapper function for other modules to use. Add any model-specific helpers here. Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Link: https://lore.kernel.org/r/20240214033516.1344948-2-yazen.ghannam@amd.com
This commit is contained in:
parent
0e4fd816b0
commit
3b566b30b4
3 changed files with 54 additions and 0 deletions
|
@ -10,6 +10,7 @@
|
||||||
config AMD_ATL
|
config AMD_ATL
|
||||||
tristate "AMD Address Translation Library"
|
tristate "AMD Address Translation Library"
|
||||||
depends on AMD_NB && X86_64 && RAS
|
depends on AMD_NB && X86_64 && RAS
|
||||||
|
depends on MEMORY_FAILURE
|
||||||
default N
|
default N
|
||||||
help
|
help
|
||||||
This library includes support for implementation-specific
|
This library includes support for implementation-specific
|
||||||
|
|
|
@ -239,6 +239,57 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
|
||||||
return addr;
|
return addr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* When a DRAM ECC error occurs on MI300 systems, it is recommended to retire
|
||||||
|
* all memory within that DRAM row. This applies to the memory within a DRAM
|
||||||
|
* bank.
|
||||||
|
*
|
||||||
|
* To find the memory addresses, loop through permutations of the DRAM column
|
||||||
|
* bits and find the System Physical address of each. The column bits are used
|
||||||
|
* to calculate the intermediate Normalized address, so all permutations should
|
||||||
|
* be checked.
|
||||||
|
*
|
||||||
|
* See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats.
|
||||||
|
*/
|
||||||
|
#define MI300_NUM_COL BIT(HWEIGHT(MI300_UMC_MCA_COL))
|
||||||
|
static void retire_row_mi300(struct atl_err *a_err)
|
||||||
|
{
|
||||||
|
unsigned long addr;
|
||||||
|
struct page *p;
|
||||||
|
u8 col;
|
||||||
|
|
||||||
|
for (col = 0; col < MI300_NUM_COL; col++) {
|
||||||
|
a_err->addr &= ~MI300_UMC_MCA_COL;
|
||||||
|
a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col);
|
||||||
|
|
||||||
|
addr = amd_convert_umc_mca_addr_to_sys_addr(a_err);
|
||||||
|
if (IS_ERR_VALUE(addr))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
addr = PHYS_PFN(addr);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Skip invalid or already poisoned pages to avoid unnecessary
|
||||||
|
* error messages from memory_failure().
|
||||||
|
*/
|
||||||
|
p = pfn_to_online_page(addr);
|
||||||
|
if (!p)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (PageHWPoison(p))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
memory_failure(addr, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void amd_retire_dram_row(struct atl_err *a_err)
|
||||||
|
{
|
||||||
|
if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
|
||||||
|
return retire_row_mi300(a_err);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL_GPL(amd_retire_dram_row);
|
||||||
|
|
||||||
static unsigned long get_addr(unsigned long addr)
|
static unsigned long get_addr(unsigned long addr)
|
||||||
{
|
{
|
||||||
if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
|
if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
|
||||||
|
|
|
@ -45,8 +45,10 @@ struct atl_err {
|
||||||
#if IS_ENABLED(CONFIG_AMD_ATL)
|
#if IS_ENABLED(CONFIG_AMD_ATL)
|
||||||
void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *));
|
void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *));
|
||||||
void amd_atl_unregister_decoder(void);
|
void amd_atl_unregister_decoder(void);
|
||||||
|
void amd_retire_dram_row(struct atl_err *err);
|
||||||
unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
|
unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
|
||||||
#else
|
#else
|
||||||
|
static inline void amd_retire_dram_row(struct atl_err *err) { }
|
||||||
static inline unsigned long
|
static inline unsigned long
|
||||||
amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; }
|
amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; }
|
||||||
#endif /* CONFIG_AMD_ATL */
|
#endif /* CONFIG_AMD_ATL */
|
||||||
|
|
Loading…
Add table
Reference in a new issue